1#ifndef __QLINEARCONV_GRAPH_H__
2#define __QLINEARCONV_GRAPH_H__
5#include "qlinearconv.h"
8#include "graph_concat.h"
9#include "graph_split.h"
10#include "graph_utils.h"
// Assigns adf::heap_size for the kernel `k`.
//
// Template parameters:
//   QLINEARCONV - kernel class template taking two types followed by twelve ints.
//   TT, TTPARAM, INP_H, INP_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP
//               - the arguments QLINEARCONV is instantiated with in the comparisons below.
//
// Two heap-size assignments are visible, each preceded by a `||`-chain of `::value`
// comparisons between QLINEARCONV<...> and a named kernel template:
//   1. C/GROUP*((KH*KW+15)/16*16) + 1024
//   2. the constant 31712
//
// NOTE(review): this listing omits the lines that open each condition and the closing
// braces (the text jumps from the signature straight into the middle of an expression).
// Presumably these are `if` / `else if` tests built on std::is_same - confirm against
// the original header before relying on this description.
13template <
template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV,
14 typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
15 int B,
int C,
int M,
int KH,
int KW,
int GROUP>
16void set_heap_size(adf::kernel k) {
// First comparison chain: QLinearConvScalarStream, QLinearConvHx4Stream,
// QLinearConvHx4StreamScale32bit, QLinearConvHx6x8bitStream.
19 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
20 QLinearConvScalarStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
22 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
23 QLinearConvHx4Stream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
25 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
26 QLinearConvHx4StreamScale32bit<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
28 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
29 QLinearConvHx6x8bitStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value)
// (KH*KW+15)/16*16 rounds KH*KW up to the next multiple of 16 (integer division);
// that is scaled by C/GROUP and 1024 is added on top.
// NOTE(review): presumably sized for per-group weights plus a fixed margin - confirm.
31 adf::heap_size(k) = C/GROUP*((KH*KW+15)/16*16) + 1024;
// Second comparison chain: QLinearConvHx4PktStream, QLinearConvHx4Stream_0/_1/_2,
// QLinearConv1x1InputPackets, QLinearConv1x1StreamInputPackets, QLinearConvHx8PktStream.
35 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
36 QLinearConvHx4PktStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
38 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
39 QLinearConvHx4Stream_0<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
41 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
42 QLinearConvHx4Stream_1<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
44 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
45 QLinearConvHx4Stream_2<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
47 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
48 QLinearConv1x1InputPackets<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
50 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
51 QLinearConv1x1StreamInputPackets<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
53 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
54 QLinearConvHx8PktStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value)
// Fixed heap size, independent of the template arguments.
// NOTE(review): the derivation of 31712 is not shown in this listing - confirm.
56 adf::heap_size(k) = 31712;
104 template<
typename,
int,
int,
int,
int,
int,
int,
int,
int>
class PAD,
105 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV,
106 typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
107 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
108 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
113 std::vector<adf::kernel> pad;
114 static constexpr int PAD_H = INP_H + H0 + H1;
115 static constexpr int PAD_W = INP_W + W0 + W1;
118 adf::port<input> pin[1];
119 adf::port<output> pout[1];
120 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
123 std::vector<TTPARAM> weights,
124 std::vector<int32_t> bias,
132 static_assert(B*C*PAD_H*PAD_W <= MAX_PARAM_BYTES);
133 assert(weights.size() <= MAX_PARAM_BYTES);
134 static_assert(B*M*OUT_H*OUT_W_PAD <= MAX_PARAM_BYTES);
136 k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
137 weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
138 adf::source(k[0]) =
"qlinearconv.cc";
139 adf::headers(k[0]) = {
"qlinearconv.h"};
140 adf::runtime<ratio>(k[0]) = 0.6;
142 if (H0+H1+W0+W1 != 0) {
144 adf::kernel::create_object<PAD<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
145 adf::source(pad[0]) =
"pad.cc";
146 adf::headers(pad[0]) = {
"pad.h"};
147 adf::runtime<ratio>(pad[0]) = 0.6;
149 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
150 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W>> (pad[0].out[0], k[0].in[0]);
152 adf::connect<adf::window<B*C*INP_H*INP_W_PAD>> (pin[0], k[0].in[0]);
155 adf::connect<adf::window<B*M*OUT_H*OUT_W_PAD>> (k[0].out[0], pout[0]);
157 adf::location_constraint tilePos = adf::location<adf::kernel>(k[0]);
158 adf::location<adf::parameter>(k[0].param[0]) = tilePos;
159 adf::location<adf::parameter>(k[0].param[0]) = adf::offset(0);
160 adf::location<adf::parameter>(k[0].param[1]) = tilePos;
176 template<
typename,
int,
int,
int,
int,
int,
int,
int,
int>
class PAD,
177 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV,
178 typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
179 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
180 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
185 std::vector<adf::kernel> pad;
186 static constexpr int PAD_H = INP_H + H0 + H1;
187 static constexpr int PAD_W = INP_W + W0 + W1;
190 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
192 adf::vector<adf::port<input>> pin;
193 adf::port<output> pout[1];
195 void init_helper(TT x_zero) {
196 adf::source(k[0]) =
"qlinearconv.cc";
197 adf::headers(k[0]) = {
"qlinearconv.h"};
198 adf::runtime<ratio>(k[0]) = 0.6;
199 if (B*C*PAD_H*PAD_W > MAX_PARAM_BYTES)
200 adf::single_buffer(k[0].in[0]);
202 set_heap_size<QLINEARCONV,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[0]);
204 if (H0+H1+W0+W1 != 0) {
206 adf::kernel::create_object<PAD<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
207 adf::source(pad[0]) =
"pad.cc";
208 adf::headers(pad[0]) = {
"pad.h"};
209 adf::runtime<ratio>(pad[0]) = 0.6;
211 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
212 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W>> (pad[0].out[0], k[0].in[0]);
214 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
215 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
217 adf::location<adf::kernel>(pad[0]) = adf::location<adf::kernel>(k[0]) +
218 adf::relative_offset({.col_offset=0, .row_offset=1});
220 adf::location_constraint padTile = adf::location<adf::kernel>(pad[0]);
221 adf::location<adf::stack>(pad[0]) = padTile;
222 adf::location<adf::stack>(k[0]) = padTile;
223 adf::location<adf::parameter>(k[0].param[0]) = padTile;
225 adf::connect<adf::window<B*C*INP_H*INP_W_PAD>> (pin[0], k[0].in[0]);
228 adf::connect<adf::stream> (k[0].out[0], pout[0]);
229 adf::samples_per_iteration(k[0].out[0]) = B*M*OUT_H*OUT_W_PAD;
231 if (B*C*PAD_H*PAD_W > MAX_PARAM_BYTES) {
232 adf::location<adf::buffer>(k[0].in[0]) = {adf::offset(0)};
234 adf::location<adf::buffer>(k[0].in[0]) = {adf::offset(0), adf::offset(16384)};
239 std::vector<int32_t> bias,
247 static_assert(B*C*PAD_H*PAD_W <= TILE_BYTES);
248 k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
249 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
251 adf::port<adf::input> pin0;
252 adf::port<adf::input> pin1;
256 adf::connect<adf::stream> (pin[1], k[0].in[1]);
262 std::vector<TTPARAM> weights,
263 std::vector<int32_t> bias,
271 static_assert(B*C*PAD_H*PAD_W <= TILE_BYTES);
273 k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
274 weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
276 adf::port<adf::input> pin0;
295 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV,
296 template<
typename,
int,
int,
int,
int>
class CONCAT,
298 typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
299 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
300 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
304 static constexpr int PAD_H = INP_H + H0 + H1;
305 static constexpr int PAD_W = INP_W + W0 + W1;
307 std::vector<adf::kernel> pad;
309 static constexpr int OVERLAP = KH-STEP_H;
312 static constexpr int LCNT = mSplitGraph::LCNT;
316 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
317 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
318 ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
320 adf::relative_coordinate tileOffsets[8] = {
321 {.col_offset = -1, .row_offset = 1},
322 {.col_offset = 0, .row_offset = 2},
323 {.col_offset = 0, .row_offset = 1},
324 {.col_offset = 1, .row_offset = 0},
325 {.col_offset = 0, .row_offset = -1},
326 {.col_offset = 0, .row_offset = -2},
327 {.col_offset = -1, .row_offset = -1},
328 {.col_offset = -1, .row_offset = 0},
331 adf::relative_coordinate concat_k1_offsets[4] = {
332 {.col_offset = -1, .row_offset = 2},
333 {.col_offset = 1, .row_offset = 1},
334 {.col_offset = 1, .row_offset = -1},
335 {.col_offset = -1, .row_offset = -2},
339 adf::port<adf::input> pin[2];
340 adf::port<adf::output> pout[1];
343 std::vector<int32_t> bias,
351 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
352 static_assert(LCNT <= 8);
353 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
355 if (H0+H1+W0+W1 != 0) {
358 adf::source(pad[0]) =
"pad.cc";
359 adf::headers(pad[0]) = {
"pad.h"};
360 adf::runtime<ratio>(pad[0]) = 0.6;
362 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
363 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
365 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
366 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
369 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
372 for (
int i = 0; i < LCNT; i++) {
373 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
374 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
375 adf::source(k[i]) =
"qlinearconv.cc";
376 adf::headers(k[i]) = {
"qlinearconv.h"};
377 adf::runtime<ratio>(k[i]) = 0.6;
378 if (B*C*HCHUNK*PAD_W > MAX_PARAM_BYTES)
379 adf::single_buffer(k[i].in[0]);
381 set_heap_size<QLINEARCONV,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);
383 adf::connect<adf::window<B*C*HCHUNK*PAD_W>> (split_graph.pout[i], k[i].in[0]);
384 adf::connect<adf::stream> (pin[1], k[i].in[1]);
385 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
387 adf::location<adf::kernel>(k[i]) =
388 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(tileOffsets[i]);
389 adf::location_constraint tilePos = adf::location<adf::kernel>(k[i]);
390 adf::location<adf::parameter>(k[i].param[0]) = tilePos;
391 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
393 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
395 for (
int i = 0; i < concat_graph.k1.size(); i++) {
396 adf::location<adf::kernel>(concat_graph.k1[i]) =
397 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(concat_k1_offsets[i]);
415 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV,
416 template<
typename,
int,
int,
int,
int>
class CONCAT,
418 typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
419 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
420 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
424 static constexpr int PAD_H = INP_H + H0 + H1;
425 static constexpr int PAD_W = INP_W + W0 + W1;
427 std::vector<adf::kernel> pad;
429 static constexpr int OVERLAP = KH-STEP_H;
430 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
431 adf::kernel split[(LCNT+1)/2];
434 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
435 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
436 ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
439 adf::port<adf::input> pin[2];
440 adf::port<adf::output> pout[1];
443 std::vector<int32_t> bias,
451 static_assert((HCHUNK - KH + 1) % STEP_H == 0);
452 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
454 for (
int i = 0; i < LCNT/2; i++) {
455 split[i] = adf::kernel::create_object<SplitFilterInt8StreamTwice<TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(i*2);
456 adf::source(split[i]) =
"split.cc";
457 adf::headers(split[i]) = {
"split.h"};
458 adf::runtime<ratio>(split[i]) = 0.6;
460 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
461 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
462 adf::samples_per_iteration(split[i].out[1]) = B*C*HCHUNK*PAD_W;
464 if ((LCNT & 0x1) == 1) {
465 int i = (LCNT+1)/2 - 1;
466 split[i] = adf::kernel::create_object<SplitFilterInt8Stream<TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(LCNT-1);
467 adf::source(split[i]) =
"split.cc";
468 adf::headers(split[i]) = {
"split.h"};
469 adf::runtime<ratio>(split[i]) = 0.6;
471 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
472 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
475 for (
int i = 0; i < LCNT; i++) {
476 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
477 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
478 adf::source(k[i]) =
"qlinearconv.cc";
479 adf::headers(k[i]) = {
"qlinearconv.h"};
480 adf::runtime<ratio>(k[i]) = 0.6;
481 if (B*C*HCHUNK*PAD_W > MAX_PARAM_BYTES)
482 adf::single_buffer(k[i].in[0]);
484 set_heap_size<QLINEARCONV,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);
486 adf::connect<adf::window<B*C*HCHUNK*PAD_W>> (split[i/2].out[i&0x1], k[i].in[0]);
487 adf::connect<adf::stream> (pin[1], k[i].in[1]);
488 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
490 if ((i & 0x1) != 0) {
491 adf::location<adf::kernel>(k[i]) =
492 adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=1, .row_offset=0});
493 adf::location<adf::kernel>(split[i/2]) =
494 adf::location<adf::kernel>(k[i]) + adf::relative_offset({.col_offset=0, .row_offset=-1});
496 adf::location_constraint sTilePos = adf::location<adf::kernel>(split[i/2]);
497 adf::location<adf::stack>(split[i/2]) = sTilePos;
498 adf::location<adf::stack>(k[i]) = sTilePos;
499 adf::location<adf::parameter>(k[i].param[0]) = sTilePos;
500 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
503 adf::location_constraint kTilePos = adf::location<adf::kernel>(k[i]);
505 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
507 if (H0+H1+W0+W1 != 0) {
510 adf::source(pad[0]) =
"pad.cc";
511 adf::headers(pad[0]) = {
"pad.h"};
512 adf::runtime<ratio>(pad[0]) = 0.6;
514 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
515 for (
int i = 0; i < (LCNT+1)/2; i++)
516 adf::connect<adf::stream> (pad[0].out[0], split[i].in[0]);
518 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
519 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
522 for (
int i = 0; i < (LCNT+1)/2; i++)
523 adf::connect<adf::stream> (pin[0], split[i].in[0]);
526 for (
int i = 0; i < concat_graph.k1.size(); i++) {
527 adf::location<adf::kernel>(concat_graph.k1[i]) =
528 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
530 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
531 adf::location<adf::parameter>(k[i*2].param[0]) = cTilePos;
532 adf::location<adf::parameter>(k[i*2].param[0]) = adf::offset(0);
533 adf::location<adf::stack>(k[i*2]) = cTilePos;
534 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
551 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV,
552 template<
typename,
int,
int,
int,
int>
class CONCAT,
554 typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
555 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
556 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
560 static constexpr int PAD_H = INP_H + H0 + H1;
561 static constexpr int PAD_W = INP_W + W0 + W1;
563 std::vector<adf::kernel> pad;
565 static constexpr int OVERLAP = KH-STEP_H;
569 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
572 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
573 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
574 ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
577 std::vector<adf::port<adf::input>> pin;
578 adf::port<adf::output> pout[1];
580 void init_helper(TT x_zero) {
581 for (
int i = 0; i < LCNT; i++) {
582 adf::source(k[i]) =
"qlinearconv.cc";
583 adf::headers(k[i]) = {
"qlinearconv.h"};
584 adf::runtime<ratio>(k[i]) = 0.6;
586 set_heap_size<QLINEARCONV,TT,TTPARAM,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);
588 adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
589 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
590 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
593 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
595 if (H0+H1+W0+W1 != 0) {
598 adf::source(pad[0]) =
"pad.cc";
599 adf::headers(pad[0]) = {
"pad.h"};
600 adf::runtime<ratio>(pad[0]) = 0.6;
602 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
603 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
605 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
606 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
609 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
613 for (
int i = 0; i < LCNT; i++) {
615 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=2});
617 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=-1, .row_offset=2});
618 if (i == 2 || i == 6)
619 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-2]) + adf::relative_offset({.col_offset=1, .row_offset=0});
620 adf::location<adf::stack>(k[i]) = adf::location<adf::kernel>(k[i]);
623 for (
int i = 0; i < concat_graph.k1.size(); i++) {
624 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
625 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
628 adf::location<adf::kernel>(concat_graph.k1[i]) =
629 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
630 adf::location<adf::parameter>(k[i*2].param[0]) = cTilePos;
632 adf::location<adf::kernel>(concat_graph.k1[i]) =
633 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
634 adf::location<adf::parameter>(k[i*2+1].param[0]) = cTilePos;
637 adf::location<adf::kernel>(split_graph.k[0]) = adf::location<adf::kernel>(k[1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
641 std::vector<int32_t> bias,
649 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
650 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
652 adf::port<adf::input> pin0;
653 adf::port<adf::input> pin1;
657 for (
int i = 0; i < LCNT; i++) {
658 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
659 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
660 adf::connect<adf::stream> (pin[1], k[i].in[1]);
666 std::vector<TTPARAM> weights,
667 std::vector<int32_t> bias,
675 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
676 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
678 adf::port<adf::input> pin0;
681 for (
int i = 0; i < LCNT; i++) {
682 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
683 weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
701 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV0,
702 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV1,
703 template<
typename,
typename,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class QLINEARCONV2,
704 template<
typename,
int,
int,
int,
int>
class CONCAT,
706 typename TT,
typename TTPARAM,
int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
707 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
708 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
712 static constexpr int PAD_H = INP_H + H0 + H1;
713 static constexpr int PAD_W = INP_W + W0 + W1;
715 std::vector<adf::kernel> pad;
720 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
723 static constexpr int LCNT = C / CCHUNK;
726 std::vector<adf::port<adf::input>> pin;
727 adf::port<adf::output> pout[1];
729 void init_helper(TT x_zero) {
730 static_assert(LCNT >= 3);
731 static_assert(C % CCHUNK == 0);
732 static_assert(B*CCHUNK*PAD_H*PAD_W <= TILE_BYTES);
734 for (
int i = 0; i < LCNT; i++) {
735 adf::source(k[i]) =
"qlinearconv.cc";
736 adf::headers(k[i]) = {
"qlinearconv.h"};
737 adf::runtime<ratio>(k[i]) = 0.6;
740 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=1});
744 if (H0+H1+W0+W1 != 0) {
747 adf::source(pad[0]) =
"pad.cc";
748 adf::headers(pad[0]) = {
"pad.h"};
749 adf::runtime<ratio>(pad[0]) = 0.6;
751 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
752 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
754 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
755 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
758 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
763 std::vector<TTPARAM> weights,
764 std::vector<int32_t> bias,
772 assert(weights.size() / LCNT <= TILE_BYTES);
774 adf::port<adf::input> pin0;
777 for (
int i = 0; i < LCNT; i++) {
778 std::vector<TTPARAM> wChunk;
779 wChunk.reserve(weights.size() / LCNT);
780 for (
int m = 0; m < M; m++) {
781 wChunk.insert(wChunk.end(),
782 weights.begin() + m*weights.size()/M + i*weights.size()/M/LCNT,
783 weights.begin() + m*weights.size()/M + (i+1)*weights.size()/M/LCNT);
787 k[i] = adf::kernel::create_object<QLINEARCONV0<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
788 wChunk, bias, w_zero);
789 }
else if (i == LCNT-1) {
790 k[i] = adf::kernel::create_object<QLINEARCONV2<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
791 wChunk, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
792 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[1]);
793 adf::connect<adf::stream> (k[i].out[0], pout[0]);
795 k[i] = adf::kernel::create_object<QLINEARCONV1<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
797 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[1]);
800 adf::single_buffer(k[i].in[0]);
801 adf::connect<adf::window<B*CCHUNK*PAD_H*PAD_W>> (split_graph.pout[i], k[i].in[0]);
803 adf::location<adf::buffer>(k[i].in[0]) = adf::location<adf::kernel>(k[i]);
804 adf::location<adf::buffer>(k[i].in[0]) = {adf::offset(0)};
811 std::vector<int32_t> bias,
819 for (
int i = 0; i < LCNT; i++) {
821 adf::port<adf::input> pin0;
825 k[i] = adf::kernel::create_object<QLINEARCONV0<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
827 set_heap_size<QLINEARCONV0,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[0]);
828 }
else if (i == LCNT-1) {
829 k[i] = adf::kernel::create_object<QLINEARCONV2<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
830 x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
831 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[2]);
832 adf::connect<adf::stream> (k[i].out[0], pout[0]);
833 set_heap_size<QLINEARCONV2,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[0]);
835 k[i] = adf::kernel::create_object<QLINEARCONV1<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
837 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[2]);
838 set_heap_size<QLINEARCONV1,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[0]);
841 adf::connect<adf::stream> (pin[1+i], k[i].in[1]);
842 adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
Vector implementation for Int8 Pad2D. Pad2DStreamInt8<a,2,30,30,32,1,1,1,1> total = 1885 for v64int16.
Definition pad.h:51
Multi-instance graph that stores weights and biases, chunks BCHW by C dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:709
Multi-instance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:301
Multi-instance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:557
Multi-instance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:421
Single instance graph that stores weights and biases. Max size = 16384 and 4096 bytes respectively.
Definition graph_qlinearconv.h:109
Vector implementation for Hx4 QLinearConv, padding with y_zero, requires data to be arranged in (M,...
Definition qlinearconv.h:450
Vector implementation for Hx4 QLinearConv using 32bit scale for precision, requires data to be arrang...
Definition qlinearconv.h:386
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:320
Vector implementation for Hx4 QLinearConv using int8xint8 MACs, requires data to be arranged in [a,...
Definition qlinearconv.h:828
Vector implementation for Hx8 QLinearConv, requires bias to be shifted, i.e. tbias - tw....
Definition qlinearconv.h:947
Scalar implementation streaming weights, requires weights stream to be padded from MxCxKxK to MxCx16,...
Definition qlinearconv.h:266
Single instance graph that streams weights and biases, significantly slower.
Definition graph_qlinearconv.h:181
Graph wrapper for two stream split.
Definition graph_split.h:185
Graph wrapper for arbitrary split kernel implementation and lanes.
Definition graph_split.h:37