onnx2versal
Loading...
Searching...
No Matches
graph_qlinearconv.h
1#ifndef __QLINEARCONV_GRAPH_H__
2#define __QLINEARCONV_GRAPH_H__
3
4#include <adf.h>
5#include "qlinearconv.h"
6#include "pad.h"
7#include "split.h"
8#include "graph_concat.h"
9#include "graph_split.h"
10#include "graph_utils.h"
11
12
// Assigns an AIE tile heap size to kernel `k` by compile-time dispatch on the
// concrete QLINEARCONV kernel implementation that was instantiated:
//  - weight-caching kernels: C/GROUP * (KH*KW rounded up to a multiple of 16)
//    bytes for the cached CKK weights, plus 1024 bytes of slack;
//  - the Hx4 stream variants: a fixed 31712-byte heap (weights + input window).
// NOTE(review): this listing is a garbled extraction — the original Doxygen
// line numbers are fused onto each code line, and the second std::is_same
// template argument (the concrete kernel type being compared against) is
// missing on several comparisons (original lines 20, 23, 26, 29, 36, 48, 51,
// 54). Restore those lines from the original header before compiling.
13template <template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
 14 typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
 15 int B, int C, int M, int KH, int KW, int GROUP>
 16void set_heap_size(adf::kernel k) {
 17 if (
 18 (std::is_same<
 19 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 21 (std::is_same<
 22 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 24 (std::is_same<
 25 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 27 (std::is_same<
 28 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 // Per-group weight bytes rounded up to a 16-byte multiple, plus 1024 slack.
 30 ) {
 31 adf::heap_size(k) = C/GROUP*((KH*KW+15)/16*16) + 1024; // caches CKK weights
 32 }
 33 else if (
 34 (std::is_same<
 35 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 37 (std::is_same<
 38 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 39 QLinearConvHx4Stream_0<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
 40 (std::is_same<
 41 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 42 QLinearConvHx4Stream_1<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
 43 (std::is_same<
 44 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 45 QLinearConvHx4Stream_2<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value) ||
 46 (std::is_same<
 47 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 49 (std::is_same<
 50 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 52 (std::is_same<
 53 QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>,
 // Fixed-size heap for the Hx4 stream family; the 31712 constant is
 // presumably sized to the largest supported CKK + window cache — TODO
 // confirm against the kernel implementations in qlinearconv.h.
 55 ) {
 56 adf::heap_size(k) = 31712; // caches CKK weights, input window
 57 }
 58}
59
60
// Single-instance QLinearConv graph: one conv kernel whose weights and biases
// are stored as kernel parameters, with an optional upstream PAD kernel that
// is only instantiated when any padding amount (H0/H1/W0/W1) is nonzero.
// Data path: pin[0] -> [pad] -> conv window -> pout[0].
// NOTE(review): garbled extraction — original line numbers are fused onto each
// line, and the constructor signature line (original lines 121-122) is missing
// immediately before the `weights` parameter below.
103template <
 104 template<typename, int, int, int, int, int, int, int, int> class PAD,
 105 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
 106 typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
 107 int B, int C, int M, int KH, int KW, int GROUP,
 108 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
 109class QLinearConvGraph : public adf::graph {
 110
 111 private:
 112 adf::kernel k[1];
 113 std::vector<adf::kernel> pad;
 // Padded input dimensions seen by the conv kernel.
 114 static constexpr int PAD_H = INP_H + H0 + H1;
 115 static constexpr int PAD_W = INP_W + W0 + W1;
 116
 117 public:
 118 adf::port<input> pin[1];
 119 adf::port<output> pout[1];
 120 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
 // Constructor: weights/bias are baked into the kernel as parameters;
 // scales and zero points define the quantized (uint8/int8) conv math.
 // (Missing signature line — see NOTE above.)
 123 std::vector<TTPARAM> weights,
 124 std::vector<int32_t> bias,
 125 float x_scale,
 126 float w_scale,
 127 float y_scale,
 128 TT x_zero,
 129 TTPARAM w_zero,
 130 TT y_zero
 131 ) {
 // Input window, weights, and output window must each fit the parameter
 // memory budget; weights.size() is only known at run time, hence assert.
 132 static_assert(B*C*PAD_H*PAD_W <= MAX_PARAM_BYTES);
 133 assert(weights.size() <= MAX_PARAM_BYTES);
 134 static_assert(B*M*OUT_H*OUT_W_PAD <= MAX_PARAM_BYTES);
 135
 136 k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
 137 weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 138 adf::source(k[0]) = "qlinearconv.cc";
 139 adf::headers(k[0]) = {"qlinearconv.h"};
 140 adf::runtime<ratio>(k[0]) = 0.6;
 141
 // Insert a Pad2D stage only when padding is actually requested.
 142 if (H0+H1+W0+W1 != 0) {
 143 pad.push_back(
 144 adf::kernel::create_object<PAD<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
 145 adf::source(pad[0]) = "pad.cc";
 146 adf::headers(pad[0]) = {"pad.h"};
 147 adf::runtime<ratio>(pad[0]) = 0.6;
 148
 149 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
 150 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W>> (pad[0].out[0], k[0].in[0]);
 151 } else {
 152 adf::connect<adf::window<B*C*INP_H*INP_W_PAD>> (pin[0], k[0].in[0]);
 153 }
 154
 155 adf::connect<adf::window<B*M*OUT_H*OUT_W_PAD>> (k[0].out[0], pout[0]);
 156
 // Pin both kernel parameters (presumably weights and bias) to the
 // kernel's own tile, with param[0] at bank offset 0.
 157 adf::location_constraint tilePos = adf::location<adf::kernel>(k[0]);
 158 adf::location<adf::parameter>(k[0].param[0]) = tilePos;
 159 adf::location<adf::parameter>(k[0].param[0]) = adf::offset(0);
 160 adf::location<adf::parameter>(k[0].param[1]) = tilePos;
 161 }
 162
 163};
164
165
// Single-instance QLinearConv graph with a streamed output, and two
// constructors: one that STREAMS weights in over a second input port (pin[1]
// -> k[0].in[1]) and one that STORES weights as a kernel parameter. An
// optional PAD stage is inserted when any of H0/H1/W0/W1 is nonzero.
// NOTE(review): garbled extraction — original line numbers are fused onto each
// line; the two constructor signature lines (original lines 238 and 261) are
// missing. `adf::vector` below is suspicious — likely `std::vector` in the
// real source; verify against the repository.
175template <
 176 template<typename, int, int, int, int, int, int, int, int> class PAD,
 177 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
 178 typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
 179 int B, int C, int M, int KH, int KW, int GROUP,
 180 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
 181class QLinearConvStreamGraph : public adf::graph {
 182
 183 private:
 184 adf::kernel k[1];
 185 std::vector<adf::kernel> pad;
 186 static constexpr int PAD_H = INP_H + H0 + H1;
 187 static constexpr int PAD_W = INP_W + W0 + W1;
 188
 189 public:
 190 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
 191
 192 adf::vector<adf::port<input>> pin;
 193 adf::port<output> pout[1];
 194
 // Shared wiring used by both constructors after k[0] has been created.
 195 void init_helper(TT x_zero) {
 196 adf::source(k[0]) = "qlinearconv.cc";
 197 adf::headers(k[0]) = {"qlinearconv.h"};
 198 adf::runtime<ratio>(k[0]) = 0.6;
 // Large inputs cannot be double-buffered within the tile budget.
 199 if (B*C*PAD_H*PAD_W > MAX_PARAM_BYTES)
 200 adf::single_buffer(k[0].in[0]);
 201
 202 set_heap_size<QLINEARCONV,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[0]);
 203
 204 if (H0+H1+W0+W1 != 0) {
 205 pad.push_back(
 206 adf::kernel::create_object<PAD<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
 207 adf::source(pad[0]) = "pad.cc";
 208 adf::headers(pad[0]) = {"pad.h"};
 209 adf::runtime<ratio>(pad[0]) = 0.6;
 210
 211 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
 212 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W>> (pad[0].out[0], k[0].in[0]);
 213
 214 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
 215 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
 216
 // Place the pad kernel directly above the conv kernel and co-locate
 // stacks/parameters on the pad tile.
 217 adf::location<adf::kernel>(pad[0]) = adf::location<adf::kernel>(k[0]) +
 218 adf::relative_offset({.col_offset=0, .row_offset=1});
 219
 220 adf::location_constraint padTile = adf::location<adf::kernel>(pad[0]);
 221 adf::location<adf::stack>(pad[0]) = padTile;
 222 adf::location<adf::stack>(k[0]) = padTile;
 223 adf::location<adf::parameter>(k[0].param[0]) = padTile;
 224 } else {
 225 adf::connect<adf::window<B*C*INP_H*INP_W_PAD>> (pin[0], k[0].in[0]);
 226 }
 227
 228 adf::connect<adf::stream> (k[0].out[0], pout[0]);
 229 adf::samples_per_iteration(k[0].out[0]) = B*M*OUT_H*OUT_W_PAD;
 230
 // Single buffer at offset 0; ping/pong buffers at 0 and 16384 otherwise.
 231 if (B*C*PAD_H*PAD_W > MAX_PARAM_BYTES) {
 232 adf::location<adf::buffer>(k[0].in[0]) = {adf::offset(0)};
 233 } else {
 234 adf::location<adf::buffer>(k[0].in[0]) = {adf::offset(0), adf::offset(16384)};
 235 }
 236 }
 237
 // Constructor (streamed weights): weights arrive on pin[1] at run time.
 // (Missing signature line — see NOTE above.)
 239 std::vector<int32_t> bias,
 240 float x_scale,
 241 float w_scale,
 242 float y_scale,
 243 TT x_zero,
 244 TTPARAM w_zero,
 245 TT y_zero
 246 ) {
 247 static_assert(B*C*PAD_H*PAD_W <= TILE_BYTES);
 248 k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
 249 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 250
 251 adf::port<adf::input> pin0;
 252 adf::port<adf::input> pin1;
 253 pin.push_back(pin0);
 254 pin.push_back(pin1);
 255
 256 adf::connect<adf::stream> (pin[1], k[0].in[1]); // variable samples per iteration based on kernel
 257
 258 init_helper(x_zero);
 259 }
 260
 // Constructor (stored weights): weights baked in as a kernel parameter.
 // (Missing signature line — see NOTE above.)
 262 std::vector<TTPARAM> weights,
 263 std::vector<int32_t> bias,
 264 float x_scale,
 265 float w_scale,
 266 float y_scale,
 267 TT x_zero,
 268 TTPARAM w_zero,
 269 TT y_zero
 270 ) {
 271 static_assert(B*C*PAD_H*PAD_W <= TILE_BYTES);
 272
 273 k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
 274 weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 275
 276 adf::port<adf::input> pin0;
 277 pin.push_back(pin0);
 278
 279 init_helper(x_zero);
 280 }
 281
 282};
283
284
// Multi-instance graph that chunks the (padded) BCHW input along H into LCNT
// overlapping chunks (overlap = KH-STEP_H) via split_graph, runs one conv
// kernel per chunk (weights streamed over pin[1]), and stitches the partial
// outputs back together with concat_graph. Kernels are placed around the
// split tile using the clockwise tileOffsets table (max 8 chunks).
// NOTE(review): garbled extraction — original line numbers are fused onto each
// line; missing lines include the mSplitGraph alias (original line 310), the
// constructor signature (line 342) and the pad create_object call (line 357).
294template <
 295 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
 296 template<typename, int, int, int, int> class CONCAT,
 297 int HCHUNK,
 298 typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
 299 int B, int C, int M, int KH, int KW, int GROUP,
 300 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
 301class QLinearConvChunkHGraph : public adf::graph {
 302
 303 private:
 304 static constexpr int PAD_H = INP_H + H0 + H1;
 305 static constexpr int PAD_W = INP_W + W0 + W1;
 306
 307 std::vector<adf::kernel> pad;
 308
 // Adjacent H-chunks must overlap by KH-STEP_H rows so every output row
 // has a full receptive field.
 309 static constexpr int OVERLAP = KH-STEP_H;
 // (mSplitGraph alias missing from this dump — see NOTE above.)
 311 mSplitGraph split_graph;
 312 static constexpr int LCNT = mSplitGraph::LCNT;
 313
 314 adf::kernel k[LCNT];
 315
 316 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
 317 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
 318 ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
 319
 // Placement of conv kernels relative to the split kernel's tile.
 320 adf::relative_coordinate tileOffsets[8] = {
 321 {.col_offset = -1, .row_offset = 1}, // top left, clockwise
 322 {.col_offset = 0, .row_offset = 2},
 323 {.col_offset = 0, .row_offset = 1},
 324 {.col_offset = 1, .row_offset = 0},
 325 {.col_offset = 0, .row_offset = -1},
 326 {.col_offset = 0, .row_offset = -2},
 327 {.col_offset = -1, .row_offset = -1},
 328 {.col_offset = -1, .row_offset = 0},
 329 };
 330
 331 adf::relative_coordinate concat_k1_offsets[4] = {
 332 {.col_offset = -1, .row_offset = 2}, // top left, clockwise
 333 {.col_offset = 1, .row_offset = 1},
 334 {.col_offset = 1, .row_offset = -1},
 335 {.col_offset = -1, .row_offset = -2},
 336 };
 337
 338 public:
 // pin[0]: input feature map; pin[1]: streamed weights (broadcast to all
 // chunk kernels).
 339 adf::port<adf::input> pin[2];
 340 adf::port<adf::output> pout[1];
 341
 // Constructor. (Missing signature line — see NOTE above.)
 343 std::vector<int32_t> bias,
 344 float x_scale,
 345 float w_scale,
 346 float y_scale,
 347 TT x_zero,
 348 TTPARAM w_zero,
 349 TT y_zero
 350 ) {
 351 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
 352 static_assert(LCNT <= 8);
 353 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
 354
 355 if (H0+H1+W0+W1 != 0) {
 356 pad.push_back(
 // (pad create_object line missing from this dump — see NOTE above.)
 358 adf::source(pad[0]) = "pad.cc";
 359 adf::headers(pad[0]) = {"pad.h"};
 360 adf::runtime<ratio>(pad[0]) = 0.6;
 361
 362 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
 363 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
 364
 365 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
 366 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
 367 // split and pad can't be placed on same tile due to stream co-placement constraints
 368 } else {
 369 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
 370 }
 371
 372 for (int i = 0; i < LCNT; i++) {
 373 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
 374 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 375 adf::source(k[i]) = "qlinearconv.cc";
 376 adf::headers(k[i]) = {"qlinearconv.h"};
 377 adf::runtime<ratio>(k[i]) = 0.6;
 378 if (B*C*HCHUNK*PAD_W > MAX_PARAM_BYTES)
 379 adf::single_buffer(k[i].in[0]);
 380
 // NOTE(review): kernels are instantiated with HCHUNK but set_heap_size
 // is given PAD_H/PAD_W; QLinearConvChunkHPktStreamGraph::init_helper
 // passes HCHUNK instead — verify which is intended.
 381 set_heap_size<QLINEARCONV,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);
 382
 383 adf::connect<adf::window<B*C*HCHUNK*PAD_W>> (split_graph.pout[i], k[i].in[0]);
 384 adf::connect<adf::stream> (pin[1], k[i].in[1]);
 385 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
 386
 387 adf::location<adf::kernel>(k[i]) =
 388 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(tileOffsets[i]);
 389 adf::location_constraint tilePos = adf::location<adf::kernel>(k[i]);
 390 adf::location<adf::parameter>(k[i].param[0]) = tilePos; // may bust tiles adjacent to split
 391 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
 392 }
 393 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
 394
 395 for (int i = 0; i < concat_graph.k1.size(); i++) {
 396 adf::location<adf::kernel>(concat_graph.k1[i]) =
 397 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(concat_k1_offsets[i]);
 398 }
 399
 400 }
 401
 402};
403
404
// Multi-instance graph that chunks the (padded) BCHW input along H, using
// explicit split kernels instead of a split graph: each
// SplitFilterInt8StreamTwice feeds TWO conv kernels (out[0]/out[1]); a
// trailing single-output SplitFilterInt8Stream handles the last chunk when
// LCNT is odd. Conv outputs are re-assembled by concat_graph. Weights are
// streamed to every conv kernel over pin[1].
// NOTE(review): garbled extraction — original line numbers are fused onto each
// line; the constructor signature (original line 442) and the pad
// create_object call (line 509) are missing.
414template <
 415 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
 416 template<typename, int, int, int, int> class CONCAT,
 417 int HCHUNK,
 418 typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
 419 int B, int C, int M, int KH, int KW, int GROUP,
 420 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
 421class QLinearConvChunkHStreamGraph : public adf::graph {
 422
 423 private:
 424 static constexpr int PAD_H = INP_H + H0 + H1;
 425 static constexpr int PAD_W = INP_W + W0 + W1;
 426
 427 std::vector<adf::kernel> pad;
 428
 429 static constexpr int OVERLAP = KH-STEP_H;
 430 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
 // One two-output split kernel per pair of conv kernels.
 431 adf::kernel split[(LCNT+1)/2];
 432 adf::kernel k[LCNT];
 433
 434 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
 435 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
 436 ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
 437
 438 public:
 // pin[0]: input feature map; pin[1]: streamed weights.
 439 adf::port<adf::input> pin[2];
 440 adf::port<adf::output> pout[1];
 441
 // Constructor. (Missing signature line — see NOTE above.)
 443 std::vector<int32_t> bias,
 444 float x_scale,
 445 float w_scale,
 446 float y_scale,
 447 TT x_zero,
 448 TTPARAM w_zero,
 449 TT y_zero
 450 ) {
 451 static_assert((HCHUNK - KH + 1) % STEP_H == 0);
 452 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
 453
 // Paired chunks: split[i] filters chunks 2i and 2i+1 out of the stream.
 454 for (int i = 0; i < LCNT/2; i++) {
 455 split[i] = adf::kernel::create_object<SplitFilterInt8StreamTwice<TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(i*2);
 456 adf::source(split[i]) = "split.cc";
 457 adf::headers(split[i]) = {"split.h"};
 458 adf::runtime<ratio>(split[i]) = 0.6;
 459
 460 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
 461 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
 462 adf::samples_per_iteration(split[i].out[1]) = B*C*HCHUNK*PAD_W;
 463 }
 // Odd LCNT: one extra single-output split kernel for the final chunk.
 464 if ((LCNT & 0x1) == 1) {
 465 int i = (LCNT+1)/2 - 1;
 466 split[i] = adf::kernel::create_object<SplitFilterInt8Stream<TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(LCNT-1);
 467 adf::source(split[i]) = "split.cc";
 468 adf::headers(split[i]) = {"split.h"};
 469 adf::runtime<ratio>(split[i]) = 0.6;
 470
 471 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
 472 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
 473 }
 474
 475 for (int i = 0; i < LCNT; i++) {
 476 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
 477 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 478 adf::source(k[i]) = "qlinearconv.cc";
 479 adf::headers(k[i]) = {"qlinearconv.h"};
 480 adf::runtime<ratio>(k[i]) = 0.6;
 481 if (B*C*HCHUNK*PAD_W > MAX_PARAM_BYTES)
 482 adf::single_buffer(k[i].in[0]);
 483
 // NOTE(review): kernels use HCHUNK but set_heap_size is given
 // PAD_H/PAD_W (the pkt-stream sibling passes HCHUNK) — verify intended.
 484 set_heap_size<QLINEARCONV,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);
 485
 486 adf::connect<adf::window<B*C*HCHUNK*PAD_W>> (split[i/2].out[i&0x1], k[i].in[0]);
 487 adf::connect<adf::stream> (pin[1], k[i].in[1]);
 488 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
 489
 // Odd kernels sit right of their even partner, with the shared split
 // kernel directly below; stacks/parameters co-located on the split tile.
 490 if ((i & 0x1) != 0) {
 491 adf::location<adf::kernel>(k[i]) =
 492 adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=1, .row_offset=0});
 493 adf::location<adf::kernel>(split[i/2]) =
 494 adf::location<adf::kernel>(k[i]) + adf::relative_offset({.col_offset=0, .row_offset=-1});
 495
 496 adf::location_constraint sTilePos = adf::location<adf::kernel>(split[i/2]);
 497 adf::location<adf::stack>(split[i/2]) = sTilePos;
 498 adf::location<adf::stack>(k[i]) = sTilePos;
 499 adf::location<adf::parameter>(k[i].param[0]) = sTilePos;
 500 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
 501 }
 502
 // NOTE(review): kTilePos is never used — dead local, or a following
 // line was lost in extraction.
 503 adf::location_constraint kTilePos = adf::location<adf::kernel>(k[i]);
 504 }
 505 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
 506
 507 if (H0+H1+W0+W1 != 0) {
 508 pad.push_back(
 // (pad create_object line missing from this dump — see NOTE above.)
 510 adf::source(pad[0]) = "pad.cc";
 511 adf::headers(pad[0]) = {"pad.h"};
 512 adf::runtime<ratio>(pad[0]) = 0.6;
 513
 514 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
 515 for (int i = 0; i < (LCNT+1)/2; i++)
 516 adf::connect<adf::stream> (pad[0].out[0], split[i].in[0]);
 517
 518 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
 519 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
 520 // split and pad can't be placed on same tile due to stream co-placement constraints
 521 } else {
 522 for (int i = 0; i < (LCNT+1)/2; i++)
 523 adf::connect<adf::stream> (pin[0], split[i].in[0]);
 524 }
 525
 // Place each concat first-stage kernel above its even conv kernel and
 // co-locate that conv kernel's parameter/stack on the concat tile.
 526 for (int i = 0; i < concat_graph.k1.size(); i++) {
 527 adf::location<adf::kernel>(concat_graph.k1[i]) =
 528 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
 529
 530 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
 531 adf::location<adf::parameter>(k[i*2].param[0]) = cTilePos;
 532 adf::location<adf::parameter>(k[i*2].param[0]) = adf::offset(0);
 533 adf::location<adf::stack>(k[i*2]) = cTilePos;
 534 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
 535 }
 536 }
 537
 538};
539
540
// Multi-instance graph that chunks the (padded) BCHW input along H and
// delivers each chunk to its conv kernel over a PACKET stream from
// split_graph (vs windows/streams in the sibling ChunkH graphs). Two
// constructors: streamed weights (pin[1] broadcast to all kernels) or stored
// weights. Outputs are re-assembled by concat_graph.
// NOTE(review): garbled extraction — original line numbers are fused onto each
// line; missing lines include the mSplitGraph alias (original line 566), the
// pad create_object call (line 597) and both constructor signatures (lines
// 640 and 665).
550template <
 551 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
 552 template<typename, int, int, int, int> class CONCAT,
 553 int HCHUNK,
 554 typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
 555 int B, int C, int M, int KH, int KW, int GROUP,
 556 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
 557class QLinearConvChunkHPktStreamGraph : public adf::graph {
 558
 559 private:
 560 static constexpr int PAD_H = INP_H + H0 + H1;
 561 static constexpr int PAD_W = INP_W + W0 + W1;
 562
 563 std::vector<adf::kernel> pad;
 564
 565 static constexpr int OVERLAP = KH-STEP_H;
 // (mSplitGraph alias missing from this dump — see NOTE above.)
 567 mSplitGraph split_graph;
 568
 569 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
 570 adf::kernel k[LCNT];
 571
 572 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
 573 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
 574 ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
 575
 576 public:
 577 std::vector<adf::port<adf::input>> pin;
 578 adf::port<adf::output> pout[1];
 579
 // Shared wiring/placement used by both constructors after k[] is created.
 580 void init_helper(TT x_zero) {
 581 for (int i = 0; i < LCNT; i++) {
 582 adf::source(k[i]) = "qlinearconv.cc";
 583 adf::headers(k[i]) = {"qlinearconv.h"};
 584 adf::runtime<ratio>(k[i]) = 0.6;
 585
 // Heap sized from the per-chunk input height (HCHUNK), unlike the
 // window-based ChunkH graphs which pass PAD_H here.
 586 set_heap_size<QLINEARCONV,TT,TTPARAM,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);
 587
 588 adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
 589 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
 590 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
 591 }
 592
 593 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
 594
 595 if (H0+H1+W0+W1 != 0) {
 596 pad.push_back(
 // (pad create_object line missing from this dump — see NOTE above.)
 598 adf::source(pad[0]) = "pad.cc";
 599 adf::headers(pad[0]) = {"pad.h"};
 600 adf::runtime<ratio>(pad[0]) = 0.6;
 601
 602 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
 603 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
 604
 605 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
 606 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
 607 // split and pad can't be placed on same tile due to stream co-placement constraints
 608 } else {
 609 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
 610 }
 611
 // Hand-tuned placement for up to 8 chunk kernels (hard-coded i==2/4/6
 // cases lay kernels out in two columns around the split tile).
 612 // location constraints
 613 for (int i = 0; i < LCNT; i++) {
 614 if ((i&0x1) == 1)
 615 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=2});
 616 if (i == 4)
 617 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=-1, .row_offset=2});
 618 if (i == 2 || i == 6)
 619 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-2]) + adf::relative_offset({.col_offset=1, .row_offset=0});
 620 adf::location<adf::stack>(k[i]) = adf::location<adf::kernel>(k[i]);
 621 }
 622
 // NOTE(review): cTilePos is captured BEFORE the concat kernel's location
 // is assigned two lines below — confirm the ADF API resolves relocatable
 // constraints lazily, otherwise this ordering looks wrong.
 623 for (int i = 0; i < concat_graph.k1.size(); i++) {
 624 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
 625 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
 626
 627 if (i < 2) {
 628 adf::location<adf::kernel>(concat_graph.k1[i]) =
 629 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
 630 adf::location<adf::parameter>(k[i*2].param[0]) = cTilePos;
 631 } else {
 632 adf::location<adf::kernel>(concat_graph.k1[i]) =
 633 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
 634 adf::location<adf::parameter>(k[i*2+1].param[0]) = cTilePos;
 635 }
 636 }
 637 adf::location<adf::kernel>(split_graph.k[0]) = adf::location<adf::kernel>(k[1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
 638 }
 639
 // Constructor (streamed weights). (Missing signature line — see NOTE above.)
 641 std::vector<int32_t> bias,
 642 float x_scale,
 643 float w_scale,
 644 float y_scale,
 645 TT x_zero,
 646 TTPARAM w_zero,
 647 TT y_zero
 648 ) {
 649 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
 650 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
 651
 652 adf::port<adf::input> pin0;
 653 adf::port<adf::input> pin1;
 654 pin.push_back(pin0);
 655 pin.push_back(pin1);
 656
 657 for (int i = 0; i < LCNT; i++) {
 658 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
 659 bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 660 adf::connect<adf::stream> (pin[1], k[i].in[1]);
 661 }
 662 init_helper(x_zero);
 663 }
 664
 // Constructor (stored weights). (Missing signature line — see NOTE above.)
 666 std::vector<TTPARAM> weights,
 667 std::vector<int32_t> bias,
 668 float x_scale,
 669 float w_scale,
 670 float y_scale,
 671 TT x_zero,
 672 TTPARAM w_zero,
 673 TT y_zero
 674 ) {
 675 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
 676 static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);
 677
 678 adf::port<adf::input> pin0;
 679 pin.push_back(pin0);
 680
 681 for (int i = 0; i < LCNT; i++) {
 682 k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
 683 weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 684 }
 685 init_helper(x_zero);
 686 }
 687
 688};
689
690
// Multi-instance graph that chunks BCHW along the C (channel) dimension into
// LCNT = C/CCHUNK pieces. Partial sums are accumulated over an adf::cascade
// chain: QLINEARCONV0 starts the chain (takes bias), QLINEARCONV1 kernels
// continue it, and QLINEARCONV2 terminates it (applies scales/zero points and
// emits the output stream). Two constructors: stored weights (sliced into
// per-chunk wChunk vectors) or streamed weights (one input port per kernel).
// NOTE(review): garbled extraction — original line numbers are fused onto each
// line; missing lines include the mSplitGraph alias (original line 717), the
// pad create_object call (line 746) and both constructor signatures (lines
// 762 and 810).
700template <
 701 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV0,
 702 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV1,
 703 template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV2,
 704 template<typename, int, int, int, int> class CONCAT,
 705 int CCHUNK,
 706 typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
 707 int B, int C, int M, int KH, int KW, int GROUP,
 708 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
 709class QLinearConvChunkCGraph : public adf::graph {
 710
 711 private:
 712 static constexpr int PAD_H = INP_H + H0 + H1;
 713 static constexpr int PAD_W = INP_W + W0 + W1;
 714
 715 std::vector<adf::kernel> pad;
 716
 // (mSplitGraph alias missing from this dump — see NOTE above.)
 718 mSplitGraph split_graph;
 719
 720 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
 721
 722 public:
 723 static constexpr int LCNT = C / CCHUNK;
 724 adf::kernel k[LCNT];
 725
 726 std::vector<adf::port<adf::input>> pin;
 727 adf::port<adf::output> pout[1];
 728
 // Shared wiring used by both constructors: kernel metadata, a linear
 // left-to-right placement of the cascade chain, and the optional pad stage.
 729 void init_helper(TT x_zero) {
 // Cascade needs a first, at least one middle, and a last kernel.
 730 static_assert(LCNT >= 3);
 731 static_assert(C % CCHUNK == 0);
 732 static_assert(B*CCHUNK*PAD_H*PAD_W <= TILE_BYTES);
 733
 734 for (int i = 0; i < LCNT; i++) {
 735 adf::source(k[i]) = "qlinearconv.cc";
 736 adf::headers(k[i]) = {"qlinearconv.h"};
 737 adf::runtime<ratio>(k[i]) = 0.6;
 738
 // Cascade neighbours must sit on adjacent tiles in the same row.
 739 if (i != 0) {
 740 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=1});
 741 }
 742 }
 743
 744 if (H0+H1+W0+W1 != 0) {
 745 pad.push_back(
 // (pad create_object line missing from this dump — see NOTE above.)
 747 adf::source(pad[0]) = "pad.cc";
 748 adf::headers(pad[0]) = {"pad.h"};
 749 adf::runtime<ratio>(pad[0]) = 0.6;
 750
 751 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
 752 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
 753
 754 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
 755 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
 756 // split and pad can't be placed on same tile due to stream co-placement constraints
 757 } else {
 758 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
 759 }
 760 }
 761
 // Constructor (stored weights). (Missing signature line — see NOTE above.)
 763 std::vector<TTPARAM> weights,
 764 std::vector<int32_t> bias,
 765 float x_scale,
 766 float w_scale,
 767 float y_scale,
 768 TT x_zero,
 769 TTPARAM w_zero,
 770 TT y_zero
 771 ) {
 772 assert(weights.size() / LCNT <= TILE_BYTES); // weight size may vary based on padding done for given kernel
 773
 774 adf::port<adf::input> pin0;
 775 pin.push_back(pin0);
 776
 777 for (int i = 0; i < LCNT; i++) {
 // Slice the i-th C-chunk of every output channel m out of the flat
 // MxCxKxK weight vector.
 778 std::vector<TTPARAM> wChunk; // build wChunk
 779 wChunk.reserve(weights.size() / LCNT);
 780 for (int m = 0; m < M; m++) {
 781 wChunk.insert(wChunk.end(),
 782 weights.begin() + m*weights.size()/M + i*weights.size()/M/LCNT,
 783 weights.begin() + m*weights.size()/M + (i+1)*weights.size()/M/LCNT);
 784 }
 785
 786 if (i == 0) {
 787 k[i] = adf::kernel::create_object<QLINEARCONV0<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
 788 wChunk, bias, w_zero);
 789 } else if (i == LCNT-1) {
 790 k[i] = adf::kernel::create_object<QLINEARCONV2<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
 791 wChunk, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 792 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[1]);
 793 adf::connect<adf::stream> (k[i].out[0], pout[0]);
 794 } else {
 795 k[i] = adf::kernel::create_object<QLINEARCONV1<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
 796 wChunk, w_zero);
 797 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[1]);
 798 }
 799
 800 adf::single_buffer(k[i].in[0]);
 801 adf::connect<adf::window<B*CCHUNK*PAD_H*PAD_W>> (split_graph.pout[i], k[i].in[0]);
 802
 803 adf::location<adf::buffer>(k[i].in[0]) = adf::location<adf::kernel>(k[i]);
 804 adf::location<adf::buffer>(k[i].in[0]) = {adf::offset(0)};
 805 }
 806
 807 init_helper(x_zero);
 808 }
 809
 // Constructor (streamed weights): one weight-stream port per kernel
 // (cascade moves to in[2] because in[1] carries the weight stream).
 // (Missing signature line — see NOTE above.)
 811 std::vector<int32_t> bias,
 812 float x_scale,
 813 float w_scale,
 814 float y_scale,
 815 TT x_zero,
 816 TTPARAM w_zero,
 817 TT y_zero
 818 ) {
 819 for (int i = 0; i < LCNT; i++) {
 820
 821 adf::port<adf::input> pin0;
 822 pin.push_back(pin0);
 823
 824 if (i == 0) {
 825 k[i] = adf::kernel::create_object<QLINEARCONV0<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
 826 bias, w_zero);
 // NOTE(review): all three set_heap_size calls in this loop pass k[0]
 // rather than k[i] — looks like a copy-paste bug; verify upstream.
 827 set_heap_size<QLINEARCONV0,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[0]);
 828 } else if (i == LCNT-1) {
 829 k[i] = adf::kernel::create_object<QLINEARCONV2<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
 830 x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
 831 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[2]);
 832 adf::connect<adf::stream> (k[i].out[0], pout[0]);
 833 set_heap_size<QLINEARCONV2,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[0]);
 834 } else {
 835 k[i] = adf::kernel::create_object<QLINEARCONV1<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
 836 w_zero);
 837 adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[2]);
 838 set_heap_size<QLINEARCONV1,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[0]);
 839 }
 840
 // NOTE(review): pin[1+i] with only LCNT ports pushed (one per loop
 // iteration) indexes out of range on the last iteration — either an
 // extra data port push was lost in extraction, or this is a bug.
 841 adf::connect<adf::stream> (pin[1+i], k[i].in[1]);
 842 adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
 843 }
 844
 845 init_helper(x_zero);
 846 }
 847
 848};
852#endif // __QLINEARCONV_GRAPH_H__
Vector implementation for Int8 Pad2D. Pad2DStreamInt8<a,2,30,30,32,1,1,1,1> total = 1885 for v64int16.
Definition pad.h:51
Vector implementation for 1x1 QLinearConv, stores weights requires data to be reshaped from (M,...
Definition qlinearconv.h:1077
Vector implementation for 1x1 QLinearConv, streams weights requires data to be reshaped from (M,...
Definition qlinearconv.h:1140
Multi-instance graph that stores weights and biases, chunks BCHW by C dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:709
Multi-instance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:301
Multi-instance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:557
Multi-instance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:421
Single instance graph that stores weights and biases. Max size = 16384 and 4096 bytes respectively.
Definition graph_qlinearconv.h:109
Vector implementation for Hx4 QLinearConv, padding with y_zero, requires data to be arranged in (M,...
Definition qlinearconv.h:450
Vector implementation for Hx4 QLinearConv using 32bit scale for precision, requires data to be arrang...
Definition qlinearconv.h:386
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:320
Vector implementation for Hx4 QLinearConv using int8xint8 MACs, requires data to be arranged in [a,...
Definition qlinearconv.h:828
Vector implementation for Hx8 QLinearConv, requires bias to be shifted, i.e. tbias - tw....
Definition qlinearconv.h:947
Scalar implementation streaming weights, requires weights stream to be padded from MxCxKxK to MxCx16,...
Definition qlinearconv.h:266
Single instance graph that streams weights and biases, significantly slower.
Definition graph_qlinearconv.h:181
Graph wrapper for two-stream split.
Definition graph_split.h:185
Graph wrapper for arbitrary split kernel implementation and lanes.
Definition graph_split.h:37