|
onnx2versal
|
Vector implementation for Hx4 QLinearConv using int8xint8 MACs. Requires data to be arranged in [a,b,c,d,e,f,g,h,i] -> [a,b,c,0, d,e,f,0, g,h,i,0, 0,0,0,0]. Requires bias to be shifted, i.e. tbias - tw.reshape(M,-1).sum(1) * X_zero_point. Requires KW<=4, INP_W%16=0, OUT_W_PAD%16=0, STEP_H==1, STEP_W==1. QLinearConvHx6x8bitStream<28,32,28,32,1,1,1,1,8,3,3,1> total = 3106.
#include <qlinearconv.h>
Public Member Functions | |
| QLinearConvHx6x8bitStream (int32_t(&b)[M], float x_scale, float w_scale, float y_scale, TT x_zero, TTPARAM w_zero, TT y_zero) | |
| void | filter (input_window< TT > *in, input_stream< TTPARAM > *weights, output_stream< TT > *out) |
Static Public Member Functions | |
| static void | registerKernelClass () |
QLinearConvHx6x8bitStream< TT, TTPARAM, INP_H, INP_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP >::QLinearConvHx6x8bitStream (int32_t(&b)[M], float x_scale, float w_scale, float y_scale, TT x_zero, TTPARAM w_zero, TT y_zero)
QLinearConvHx6x8bitStream<28,32,24,32,1,1,6,5>
See https://docs.xilinx.com/r/en-US/ug1079-ai-engine-kernel-coding/MAC-on-8x8-bits for details. 24 selects 4*4=16, (4+2+1)*4=28 => rows (16,18),(17,19),(28,30),(29,30) before square. Square executes on a 4x2 matrix.
int8 * int8 requires: x indexing %4, z indexing %2. Expand 5 weights into a 16-long vector [0,0,0,0, a,a, b,b, c,c, d,d, e,e, 0,0].
acc0 += x4*z0 + x6*z1 x8*z2 + x10*z3 x12*z4
acc1 += x5*z1 + x7*z2 x9*z3 + x11*z4 x13*z5
acc2 += x0 x2 x4*z2 + x6*z3 x8*z4 + x10*z5 x12*z6
acc3 += x1 x3 x5*z3 + x7*z4 x9*z5 + x11*z6 x13*z7
acc4 += x4*z4 + x6*z5 x8*z6 + x10*z7 x12*z8
acc5 += x5*z5 + x7*z6 x9*z7 + x11*z8 x13*z9
acc6 += x4*z6 + x6*z7 x8*z8 + x10*z9 x12*z10
acc7 += x5*z7 + x7*z8 x9*z9 + x11*z10 x13*z11
acc8 += x4*z8 + x6*z9 x8*z10 + x10*z11 x12*z12
acc9 += x5*z9 + x7*z10 x9*z11 + x11*z12 x13*z13
acc10 += x4*z10 + x6*z11 x8*z12 + x10*z13 x12*z14
acc11 += x5*z11 + x7*z12 x9*z13 + x11*z14 x13*z15
acc12 += x4*z12 + x6*z13 x8*z14 + x10*z15 x12*z16
acc13 += x5*z13 + x7*z14 x9*z15 + x11*z16 x13*z17
acc14 += x4*z14 + x6*z15 x8*z16 + x10*z17 x12*z18
acc15 += x5*z15 + x7*z16 x9*z17 + x11*z18 x13*z19
Vector registers can hold 256 int8 at most, 128 int16 at most.