42template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
46 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
47 static constexpr int C_PER_M = C / GROUP;
49 alignas(32) TTPARAM (&weights)[M*C_PER_M*KH*KW];
50 alignas(32) int32_t (&bias)[M];
62 TTPARAM (&w)[M*C*KH*KW],
70 ): weights(w), bias(b), x_scale(x_scale), w_scale(w_scale), y_scale(y_scale), x_zero(x_zero), w_zero(w_zero), y_zero(y_zero) {
71 scale = x_scale*w_scale/y_scale;
76 output_window<TT>* out
79 static void registerKernelClass() {
80 static_assert((std::is_same<TT, int8_t>::value));
81 REGISTER_FUNCTION(QLinearConvScalar::filter);
82 REGISTER_PARAMETER(weights);
83 REGISTER_PARAMETER(bias);
95template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
99 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
101 alignas(32) TTPARAM (&weights)[M*C*KH*16];
102 alignas(32) int32_t (&bias)[M];
116 TTPARAM (&w)[M*C*KH*16],
127 input_window<TT>* in,
128 output_window<TT>* out
131 static void registerKernelClass() {
132 static_assert((std::is_same<TT, int8_t>::value));
133 static_assert(KH==5);
134 static_assert(KW==5);
135 static_assert(GROUP == 1);
136 static_assert(INP_W%16==0);
137 static_assert(OUT_W_PAD%16==0);
139 REGISTER_PARAMETER(weights);
140 REGISTER_PARAMETER(bias);
152template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
156 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
158 alignas(32) TTPARAM (&weights)[M*C*KH*16];
159 alignas(32) int32_t (&bias)[M];
173 TTPARAM (&w)[M*C*KH*16],
184 input_window<TT>* in,
185 output_window<TT>* out
188 static void registerKernelClass() {
189 static_assert((std::is_same<TT, int8_t>::value));
190 static_assert(GROUP == 1);
191 static_assert(INP_W%16==0);
192 static_assert(OUT_W_PAD%16==0);
193 REGISTER_FUNCTION(QLinearConv5x5Scale32bit::filter);
194 REGISTER_PARAMETER(weights);
195 REGISTER_PARAMETER(bias);
206template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
210 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
212 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
213 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
215 alignas(32) TTPARAM (&weights)[M*C*16];
216 alignas(32) int32_t (&bias)[M];
230 TTPARAM (&w)[M*C*16],
241 input_window<TT>* in,
242 output_window<TT>* out
245 static void registerKernelClass() {
246 static_assert((std::is_same<TT, int8_t>::value));
247 static_assert(KH==3);
248 static_assert(KW==3);
249 static_assert(GROUP == 1);
250 static_assert(INP_W%16==0);
251 static_assert(OUT_W_PAD%16==0);
252 REGISTER_FUNCTION(QLinearConv3x3::filter);
253 REGISTER_PARAMETER(weights);
254 REGISTER_PARAMETER(bias);
265template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
269 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
270 static constexpr int C_PER_M = C / GROUP;
271 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
273 alignas(32) int32_t (&bias)[M];
274 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
293 ): bias(b), x_scale(x_scale), w_scale(w_scale), y_scale(y_scale), x_zero(x_zero), w_zero(w_zero), y_zero(y_zero) {
294 scale = x_scale*w_scale/y_scale;
298 input_window<TT>* in,
299 input_stream<TTPARAM>* weights,
300 output_stream<TT>* out
303 static void registerKernelClass() {
304 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
305 REGISTER_FUNCTION(QLinearConvScalarStream::filter);
306 REGISTER_PARAMETER(bias);
319template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
323 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
324 static constexpr int C_PER_M = C / GROUP;
325 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
327 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
328 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
331 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
332 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
334 alignas(32) int32_t (&bias)[M];
335 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
359 input_window<TT>* in,
360 input_stream<TTPARAM>* weights,
361 output_stream<TT>* out
364 static void registerKernelClass() {
365 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
366 static_assert(KW<=4);
367 static_assert(INP_W%16==0);
368 static_assert(OUT_W_PAD%16==0);
369 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
370 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
371 REGISTER_FUNCTION(QLinearConvHx4Stream::filter);
372 REGISTER_PARAMETER(bias);
385template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
389 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
390 static constexpr int C_PER_M = C / GROUP;
391 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
393 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
394 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
397 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
398 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
400 alignas(32) int32_t (&bias)[M];
401 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
425 input_window<TT>* in,
426 input_stream<TTPARAM>* weights,
427 output_stream<TT>* out
430 static void registerKernelClass() {
431 static_assert((std::is_same<TT, int8_t>::value));
432 static_assert(KW<=4);
433 static_assert(INP_W%16==0);
434 static_assert(OUT_W_PAD%16==0);
435 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
436 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
437 REGISTER_FUNCTION(QLinearConvHx4StreamScale32bit::filter);
438 REGISTER_PARAMETER(bias);
449template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
453 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
454 static constexpr int C_PER_M = C / GROUP;
455 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
456 static constexpr int INP_SIZE = B*C*INP_H*INP_W;
458 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
459 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
462 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
463 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
465 alignas(32) int32_t (&bias)[M];
466 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
467 alignas(32) TT in[INP_SIZE];
491 input_pktstream* in_s,
492 input_stream<TTPARAM>* weights,
493 output_stream<TT>* out
496 static void registerKernelClass() {
497 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
498 static_assert(KW<=4);
499 static_assert(INP_W%16==0);
500 static_assert(OUT_W_PAD%16==0);
501 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
502 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
503 REGISTER_FUNCTION(QLinearConvHx4PktStream::filter);
504 REGISTER_PARAMETER(bias);
509template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
510class QLinearConvHx4_0 {
513 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
514 static constexpr int C_PER_M = C / GROUP;
515 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
517 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
518 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
521 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
522 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
524 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
525 alignas(32) int32_t (&bias)[M];
530 TTPARAM (&w)[M*CKK_ROW_SIZE],
533 ): weights(w), bias(b), w_zero(w_zero) {}
536 input_window<TT>* in,
537 output_stream<acc48>* cout
540 static void registerKernelClass() {
541 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
542 static_assert(KW<=4);
543 static_assert(INP_W%16==0);
544 static_assert(OUT_W_PAD%16==0);
545 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
546 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
547 REGISTER_FUNCTION(QLinearConvHx4_0::filter);
548 REGISTER_PARAMETER(weights);
549 REGISTER_PARAMETER(bias);
554template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
555class QLinearConvHx4_1 {
558 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
559 static constexpr int C_PER_M = C / GROUP;
560 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
562 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
563 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
566 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
567 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
569 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
574 TTPARAM (&w)[M*CKK_ROW_SIZE],
576 ): weights(w), w_zero(w_zero) {}
579 input_window<TT>* in,
580 input_stream<acc48>* cin,
581 output_stream<acc48>* cout
584 static void registerKernelClass() {
585 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
586 static_assert(KW<=4);
587 static_assert(INP_W%16==0);
588 static_assert(OUT_W_PAD%16==0);
589 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
590 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
591 REGISTER_FUNCTION(QLinearConvHx4_1::filter);
592 REGISTER_PARAMETER(weights);
597template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
598class QLinearConvHx4_2 {
601 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
602 static constexpr int C_PER_M = C / GROUP;
603 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
605 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
606 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
609 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
610 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
612 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
626 TTPARAM (&w)[M*CKK_ROW_SIZE],
636 input_window<TT>* in,
637 input_stream<acc48>* cin,
638 output_stream<TT>* out
641 static void registerKernelClass() {
642 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
643 static_assert(KW<=4);
644 static_assert(INP_W%16==0);
645 static_assert(OUT_W_PAD%16==0);
646 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
647 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
648 REGISTER_FUNCTION(QLinearConvHx4_2::filter);
649 REGISTER_PARAMETER(weights);
654template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
655class QLinearConvHx4Stream_0 {
658 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
659 static constexpr int C_PER_M = C / GROUP;
660 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
661 static constexpr int INP_SIZE = B*C*INP_H*INP_W;
663 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
664 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
667 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
668 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
670 alignas(32) int32_t (&bias)[M];
671 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
672 alignas(32) TT in[INP_SIZE];
685 QLinearConvHx4Stream_0 (
688 ): bias(b), w_zero(w_zero) {}
691 input_pktstream* in_s,
692 input_stream<TTPARAM>* weights,
693 output_stream<acc48>* cout
696 static void registerKernelClass() {
697 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
698 static_assert(KW<=4);
699 static_assert(INP_W%16==0);
700 static_assert(OUT_W_PAD%16==0);
701 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
702 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
703 REGISTER_FUNCTION(QLinearConvHx4Stream_0::filter);
704 REGISTER_PARAMETER(bias);
709template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
710class QLinearConvHx4Stream_1 {
713 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
714 static constexpr int C_PER_M = C / GROUP;
715 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
716 static constexpr int INP_SIZE = B*C*INP_H*INP_W;
718 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
719 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
722 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
723 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
725 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
726 alignas(32) TT in[INP_SIZE];
739 QLinearConvHx4Stream_1 (
744 input_pktstream* in_s,
745 input_stream<TTPARAM>* weights,
746 input_stream<acc48>* cin,
747 output_stream<acc48>* cout
750 static void registerKernelClass() {
751 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
752 static_assert(KW<=4);
753 static_assert(INP_W%16==0);
754 static_assert(OUT_W_PAD%16==0);
755 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
756 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
757 REGISTER_FUNCTION(QLinearConvHx4Stream_1::filter);
762template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
763class QLinearConvHx4Stream_2 {
766 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
767 static constexpr int C_PER_M = C / GROUP;
768 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
769 static constexpr int INP_SIZE = B*C*INP_H*INP_W;
771 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
772 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
775 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
776 static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;
778 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
779 alignas(32) TT in[INP_SIZE];
792 QLinearConvHx4Stream_2 (
802 input_pktstream* in_s,
803 input_stream<TTPARAM>* weights,
804 input_stream<acc48>* cin,
805 output_stream<TT>* out
808 static void registerKernelClass() {
809 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
810 static_assert(KW<=4);
811 static_assert(INP_W%16==0);
812 static_assert(OUT_W_PAD%16==0);
813 static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
814 static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
815 REGISTER_FUNCTION(QLinearConvHx4Stream_2::filter);
827template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
831 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
832 static constexpr int C_PER_M = C / GROUP;
833 static constexpr int CKK_ROW_SIZE = C_PER_M*KH*16;
835 alignas(32) int32_t (&bias)[M];
836 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
860 input_window<TT>* in,
861 input_stream<TTPARAM>* weights,
862 output_stream<TT>* out
865 static void registerKernelClass() {
866 static_assert((std::is_same<TT, int8_t>::value));
867 static_assert(KW<=6);
868 static_assert(INP_W%16==0);
869 static_assert(OUT_W_PAD%16==0);
870 static_assert(STEP_H == 1);
871 static_assert(STEP_W == 1);
872 REGISTER_FUNCTION(QLinearConvHx6x8bitStream::filter);
873 REGISTER_PARAMETER(bias);
883template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
887 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
888 static constexpr int C_PER_M = C / GROUP;
889 static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
891 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
892 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
895 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
897 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
898 alignas(32) int32_t (&bias)[M];
912 TTPARAM (&w)[M*CKK_ROW_SIZE],
923 input_window<TT>* in,
924 output_stream<TT>* out
927 static void registerKernelClass() {
928 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
929 static_assert(KW<=8);
930 static_assert(INP_W%16==0);
931 static_assert(OUT_W_PAD%16==0);
932 static_assert(STEP_H == 1 || STEP_H == 2);
933 static_assert(STEP_W == 1 || STEP_W == 2);
934 REGISTER_FUNCTION(QLinearConvHx8::filter);
935 REGISTER_PARAMETER(weights);
936 REGISTER_PARAMETER(bias);
946template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
950 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
951 static constexpr int CKK_ROW_SIZE = C*((KH*KW+15)/16*16);
952 static constexpr int INP_SIZE = B*C*INP_H*INP_W;
954 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
955 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;
958 static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;
960 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
961 alignas(32) int32_t (&bias)[M];
968 alignas(32) TT in[INP_SIZE];
976 TTPARAM (&w)[M*CKK_ROW_SIZE],
988 output_stream<TT>* out
991 static void registerKernelClass() {
992 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
993 static_assert(KW<=8);
994 static_assert(INP_W%16==0);
995 static_assert(OUT_W_PAD%16==0);
996 static_assert(STEP_H == 1 || STEP_H == 2);
997 static_assert(STEP_W == 1 || STEP_W == 2);
998 static_assert(GROUP == 1);
999 REGISTER_FUNCTION(QLinearConvHx8PktStream::filter);
1000 REGISTER_PARAMETER(weights);
1001 REGISTER_PARAMETER(bias);
1014template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
1018 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
1019 static constexpr int C_PER_M = C / GROUP;
1020 static constexpr int CKK_ROW_SIZE = (C_PER_M+15)/16*16;
1021 static constexpr int LAST_C = (C_PER_M % 16 - 1) /2*2;
1023 static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
1024 static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x3120 : 0x3210;
1026 alignas(32) int32_t (&bias)[M];
1027 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
1051 input_window<TT>* in,
1052 input_stream<TTPARAM>* weights,
1053 output_stream<TT>* out
1056 static void registerKernelClass() {
1057 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
1058 static_assert(KH==1);
1059 static_assert(KW==1);
1060 static_assert(INP_W%16==0);
1061 static_assert(OUT_W_PAD%16==0);
1062 static_assert(STEP_H == 1 || STEP_H == 2);
1063 static_assert(STEP_W == 1 || STEP_W == 2);
1064 REGISTER_FUNCTION(QLinearConv1x1Stream::filter);
1065 REGISTER_PARAMETER(bias);
1076template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
1080 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
1081 static constexpr int INP_SIZE = B*C*INP_H*INP_W;
1082 static constexpr int C_PER_M = C / GROUP;
1083 static constexpr int CKK_ROW_SIZE = (C_PER_M+15)/16*16;
1084 static constexpr int LAST_C = (C_PER_M % 16 - 1) /2*2;
1086 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
1087 alignas(32) int32_t (&bias)[M];
1088 alignas(32) TT in[INP_SIZE];
1103 TTPARAM (&w)[M*CKK_ROW_SIZE],
1114 input_pktstream* in_s,
1115 output_stream<TT>* out
1118 static void registerKernelClass() {
1119 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
1120 static_assert(KH==1);
1121 static_assert(KW==1);
1122 static_assert(INP_W%16==0);
1123 static_assert(OUT_W_PAD%16==0);
1124 static_assert(STEP_H == 1 || STEP_H == 2);
1125 static_assert(STEP_W == 1 || STEP_W == 2);
1126 REGISTER_FUNCTION(QLinearConv1x1InputPackets::filter);
1127 REGISTER_PARAMETER(weights);
1128 REGISTER_PARAMETER(bias);
1139template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
1143 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
1144 static constexpr int INP_SIZE = B*C*INP_H*INP_W;
1145 static constexpr int C_PER_M = C / GROUP;
1146 static constexpr int CKK_ROW_SIZE = (C_PER_M+15)/16*16;
1147 static constexpr int LAST_C = (C_PER_M % 16 - 1) /2*2;
1149 alignas(32) int32_t (&bias)[M];
1150 alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
1151 alignas(32) TT in[INP_SIZE];
1176 input_pktstream* in_s,
1177 input_stream<TTPARAM>* weights,
1178 output_stream<TT>* out
1181 static void registerKernelClass() {
1182 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
1183 static_assert(KH==1);
1184 static_assert(KW==1);
1185 static_assert(INP_W%16==0);
1186 static_assert(OUT_W_PAD%16==0);
1187 static_assert(STEP_H == 1 || STEP_H == 2);
1188 static_assert(STEP_W == 1 || STEP_W == 2);
1189 REGISTER_FUNCTION(QLinearConv1x1StreamInputPackets::filter);
1190 REGISTER_PARAMETER(bias);
1195template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
1196class QLinearConv1x1_0 {
1199 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
1200 static constexpr int C_PER_M = C / GROUP;
1201 static constexpr int CKK_ROW_SIZE = (C_PER_M+15)/16*16;
1202 static constexpr int LAST_C = (C_PER_M % 16) / 2;
1204 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
1205 alignas(32) int32_t (&bias)[M];
1210 TTPARAM (&w)[M*CKK_ROW_SIZE],
1213 ): weights(w), bias(b), w_zero(w_zero) {}
1216 input_window<TT>* in,
1217 output_stream<acc48>* cout
1220 static void registerKernelClass() {
1221 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
1222 static_assert(KH==1);
1223 static_assert(KW==1);
1224 static_assert(INP_W%16==0);
1225 static_assert(OUT_W_PAD%16==0);
1226 static_assert(STEP_H == 1 || STEP_H == 2);
1227 static_assert(STEP_W == 1 || STEP_W == 2);
1228 REGISTER_FUNCTION(QLinearConv1x1_0::filter);
1229 REGISTER_PARAMETER(weights);
1230 REGISTER_PARAMETER(bias);
1235template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
1236class QLinearConv1x1_1 {
1239 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
1240 static constexpr int C_PER_M = C / GROUP;
1241 static constexpr int CKK_ROW_SIZE = (C_PER_M+15)/16*16;
1242 static constexpr int LAST_C = (C_PER_M % 16) / 2;
1244 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
1249 TTPARAM (&w)[M*CKK_ROW_SIZE],
1251 ): weights(w), w_zero(w_zero) {}
1254 input_window<TT>* in,
1255 input_stream<acc48>* cin,
1256 output_stream<acc48>* cout
1259 static void registerKernelClass() {
1260 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
1261 static_assert(KH==1);
1262 static_assert(KW==1);
1263 static_assert(INP_W%16==0);
1264 static_assert(OUT_W_PAD%16==0);
1265 static_assert(STEP_H == 1 || STEP_H == 2);
1266 static_assert(STEP_W == 1 || STEP_W == 2);
1267 REGISTER_FUNCTION(QLinearConv1x1_1::filter);
1268 REGISTER_PARAMETER(weights);
1273template <
typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
int B,
int C,
int M,
int KH,
int KW,
int GROUP>
1274class QLinearConv1x1_2 {
1277 static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
1278 static constexpr int C_PER_M = C / GROUP;
1279 static constexpr int CKK_ROW_SIZE = (C_PER_M+15)/16*16;
1280 static constexpr int LAST_C = (C_PER_M % 16) / 2;
1282 alignas(32) TTPARAM (&weights)[M*CKK_ROW_SIZE];
1296 TTPARAM (&w)[M*CKK_ROW_SIZE],
1306 input_window<TT>* in,
1307 input_stream<acc48>* cin,
1308 output_stream<TT>* out
1311 static void registerKernelClass() {
1312 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
1313 static_assert(KH==1);
1314 static_assert(KW==1);
1315 static_assert(INP_W%16==0);
1316 static_assert(OUT_W_PAD%16==0);
1317 static_assert(STEP_H == 1 || STEP_H == 2);
1318 static_assert(STEP_W == 1 || STEP_W == 2);
1319 REGISTER_FUNCTION(QLinearConv1x1_2::filter);
1320 REGISTER_PARAMETER(weights);
Vector implementation for 1x1 QLinearConv, requires data to be reshaped from (M,C,...
Definition qlinearconv.h:1015
Vector implementation for 3x3 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:207
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:153
Vector implementation for QLinearConv 5x5, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:96
void filter(input_window< TT > *in, output_window< TT > *out)
Definition qlinearconv.cc:134
Vector implementation for Hx4 QLinearConv, padding with y_zero, requires data to be arranged in (M,...
Definition qlinearconv.h:450
Vector implementation for Hx4 QLinearConv using 32bit scale for precision, requires data to be arrang...
Definition qlinearconv.h:386
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:320
Vector implementation for Hx6 QLinearConv using int8xint8 MACs, requires data to be arranged in [a,...
Definition qlinearconv.h:828
Vector implementation for Hx8 QLinearConv, requires bias to be shifted, i.e. tbias - tw....
Definition qlinearconv.h:947
Vector implementation for Hx8 QLinearConv, requires bias to be shifted, i.e. tbias - tw....
Definition qlinearconv.h:884
Scalar implementation streaming weights, requires weights stream to be padded from MxCxKxK to MxCx16,...
Definition qlinearconv.h:266
Scalar implementation, QLinearConvScalar<30,32,28,32,1,1,1,1,6,5> total = 1282213,...
Definition qlinearconv.h:43