|
onnx2versal
|
Vector implementation for QLinearConv 5x5. Requires data to be arranged as [a,b,c,d,e] -> [0,0,0,0,a,a,b,b,c,c,d,d,e,e,0,0], and requires bias to be shifted, i.e. tbias - tw.reshape(M,-1).sum(1) * X_zero_point. Requires INP_W%16=0 and OUT_W_PAD%16=0. QLinearConv5x5<30,32,28,32,1,1,1,1,6,5> total = 3513.
#include <qlinearconv.h>
Public Member Functions | |
| QLinearConv5x5 (TTPARAM(&w)[M *C *KH *16], int32_t(&b)[M], float x_scale, float w_scale, float y_scale, TT x_zero, TTPARAM w_zero, TT y_zero) | |
| void | filter (input_window< TT > *in, output_window< TT > *out) |
Static Public Member Functions | |
| static void | registerKernelClass () |
| void QLinearConv5x5< TT, TTPARAM, INP_H, INP_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP >::filter | ( | input_window< TT > * | in, |
| output_window< TT > * | out | ||
| ) |
QLinearConv5x5<28,32,24,32,1,1,6,5>
https://docs.xilinx.com/r/en-US/ug1079-ai-engine-kernel-coding/MAC-on-8x8-bits 24 selects 4*4=16, (4+2+1)*4=28 => rows (16,18),(17,19),(28,30),(29,31) before square; square executes on a 4x2 matrix
int8 * int8: Requires x indexing %4 and z indexing %2. Expand the 5 weights into a 16-long vector [0,0,0,0, a,a, b,b, c,c, d,d, e,e, 0,0].
acc0 += x4*z0 + x6*z1 + x8*z2 + x10*z3 + x12*z4; acc1 += x5*z1 + x7*z2 + x9*z3 + x11*z4 + x13*z5; acc2 += x0*z0 + x2*z1 + x4*z2 + x6*z3 + x8*z4 + x10*z5 + x12*z6 (x0, x2 are zero padding); acc3 += x1*z1 + x3*z2 + x5*z3 + x7*z4 + x9*z5 + x11*z6 + x13*z7 (x1, x3 are zero padding)
acc4 += x4*z4 + x6*z5 + x8*z6 + x10*z7 + x12*z8; acc5 += x5*z5 + x7*z6 + x9*z7 + x11*z8 + x13*z9; acc6 += x4*z6 + x6*z7 + x8*z8 + x10*z9 + x12*z10; acc7 += x5*z7 + x7*z8 + x9*z9 + x11*z10 + x13*z11
acc8 += x4*z8 + x6*z9 + x8*z10 + x10*z11 + x12*z12; acc9 += x5*z9 + x7*z10 + x9*z11 + x11*z12 + x13*z13; acc10 += x4*z10 + x6*z11 + x8*z12 + x10*z13 + x12*z14; acc11 += x5*z11 + x7*z12 + x9*z13 + x11*z14 + x13*z15
acc12 += x4*z12 + x6*z13 + x8*z14 + x10*z15 + x12*z16; acc13 += x5*z13 + x7*z14 + x9*z15 + x11*z16 + x13*z17; acc14 += x4*z14 + x6*z15 + x8*z16 + x10*z17 + x12*z18; acc15 += x5*z15 + x7*z16 + x9*z17 + x11*z18 + x13*z19
Vector registers can hold 256 int8 at most, 128 int16 at most.