onnx2versal
Loading...
Searching...
No Matches
graph_conv.h
1#ifndef __CONV_GRAPH_H__
2#define __CONV_GRAPH_H__
3
4#include <type_traits>
5#include <adf.h>
6#include "concat.h"
7#include "conv.h"
8#include "pad.h"
9#include "split.h"
10#include "graph_concat.h"
11#include "graph_split.h"
12#include "graph_utils.h"
13
14
// Selects the ADF heap size for conv kernel `k` based on which CONV kernel
// implementation template was instantiated. Each branch budgets heap to cache
// the per-group weights (C/GROUP * KH * KW floats, padded to a multiple of 4
// or 8 lanes in some variants) plus 1024 bytes of scratch; two variants also
// buffer one output row (OUT_W_PAD * 4 bytes), and the last uses a fixed
// 31712-byte budget for weights plus an input window.
// NOTE(review): the second template argument of every std::is_same below (the
// concrete conv kernel class from conv.h being matched) was dropped by the
// doc extraction that produced this listing — original lines 22, 29, 32, 39,
// 46, 53, 56, 62 and 65 are missing; restore them from the repository.
15template <template<int, int, int, int, int, int, int, int, int, int, int, int, int> class CONV,
16 int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
17 int B, int C, int M, int KH, int KW, int GROUP, int IS_RELU>
18void set_heap_size(adf::kernel k) {
19 if (
20 (std::is_same<
21 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 22: comparison kernel type)
23 ) {
24 adf::heap_size(k) = C/GROUP*KH*KW *4 + 1024; // caches CKK weights
25 }
26 else if (
27 (std::is_same<
28 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 29: comparison kernel type)
30 (std::is_same<
31 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 32: comparison kernel type)
33 ) {
34 adf::heap_size(k) = C/GROUP*((KH*KW+3)/4*4) *4 + 1024; // caches CKK weights, padded to 4
35 }
36 else if (
37 (std::is_same<
38 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 39: comparison kernel type)
40 ) {
41 adf::heap_size(k) = C/GROUP*KH*8 *4 + OUT_W_PAD*4 + 1024; // caches CKK weights, padded to 8 and one OUT_ROW
42 }
43 else if (
44 (std::is_same<
45 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 46: comparison kernel type)
47 ) {
48 adf::heap_size(k) = C/GROUP*((KH*KW+3)/4*4) *4 + OUT_W_PAD*4 + 1024; // caches CKK weights and one OUT_ROW
49 }
50 else if (
51 (std::is_same<
52 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 53: comparison kernel type)
54 (std::is_same<
55 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 56: comparison kernel type)
57 ) {
// Here only a per-group channel vector (padded to a multiple of 4) is cached,
// which matches the KH==KW==1 kernel variants described in conv.h.
58 adf::heap_size(k) = (C/GROUP+3)/4*4 *4 + 1024; // caches CKK weights
59 }
60 else if ((std::is_same<
61 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 62: comparison kernel type)
63 (std::is_same<
64 CONV<INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>,
// (missing line 65: comparison kernel type)
66 ) {
67 adf::heap_size(k) = 31712; // caches CKK weights, input window
68 }
// Note: kernels not matched by any branch keep the ADF default heap size.
69}
70
// Single-instance conv (+ optional ReLU) graph: one CONV kernel that stores
// its weights and biases as graph parameters. When any of the H0/H1/W0/W1
// padding template arguments is nonzero, a Pad2D kernel is prepended so the
// conv kernel sees a PAD_H x PAD_W input window.
// NOTE(review): doc extraction dropped original line 133 (the constructor
// signature `ConvReluGraph(`) and line 150 (the Pad2D kernel
// create_object<...> argument of pad.push_back) — restore from the repo.
115template <template<int, int, int, int, int, int, int, int, int, int, int, int, int> class CONV,
116 int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
117 int B, int C, int M, int KH, int KW, int GROUP, int IS_RELU,
118 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
119class ConvReluGraph : public adf::graph {
120
121 private:
122 adf::kernel k[1]; // the single conv kernel
123 std::vector<adf::kernel> pad; // optional Pad2D kernel (empty when no padding)
124 static constexpr int PAD_H = INP_H + H0 + H1;
125 static constexpr int PAD_W = INP_W + W0 + W1;
126
127 public:
// Output height after convolving the padded input.
128 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
129
130 adf::port<input> pin[1];
131 adf::port<output> pout[1];
132
// Constructor (signature line lost in extraction): takes MCKK weights,
// M biases, and an optional repetition count for the kernels.
134 std::vector<float> weights,
135 std::vector<float> bias,
136 int repeat_cnt = 1
137 ) {
// Input window, weight buffer and output window must each fit the
// parameter-memory budget (sizes are in bytes: count * sizeof(float)).
138 static_assert(B*C*PAD_H*PAD_W*4 <= MAX_PARAM_BYTES);
139 assert(weights.size()*4 <= MAX_PARAM_BYTES);
140 static_assert(B*M*OUT_H*OUT_W_PAD*4 <= MAX_PARAM_BYTES);
141
142 k[0] = adf::kernel::create_object<CONV<PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(weights, bias);
143 adf::source(k[0]) = "conv.cc";
144 adf::headers(k[0]) = {"conv.h"};
145 adf::runtime<ratio>(k[0]) = 0.6;
146 adf::repetition_count(k[0]) = repeat_cnt;
147
148 if (H0+H1+W0+W1 != 0) {
// NOTE(review): missing line 150 here — the Pad2D kernel
// create_object<...> expression passed to push_back.
149 pad.push_back(
151 adf::source(pad[0]) = "pad.cc";
152 adf::headers(pad[0]) = {"pad.h"};
153 adf::runtime<ratio>(pad[0]) = 0.6;
154 adf::repetition_count(pad[0]) = repeat_cnt;
155
// Route: pin -> pad (stream) -> conv input window.
156 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>, adf::stream> (pin[0], pad[0].in[0]);
157 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W*4>> (pad[0].out[0], k[0].in[0]);
158
159 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
160 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
161 } else {
// No padding needed: connect input window directly to the conv kernel.
162 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>> (pin[0], k[0].in[0]);
163 }
164
165 adf::connect<adf::window<B*M*OUT_H*OUT_W_PAD*4>> (k[0].out[0], pout[0]);
166
// Co-locate weight (param[0]) and bias (param[1]) buffers on the conv
// kernel's own tile; bias is placed right after the 32-byte-aligned weights.
167 adf::location_constraint tilePos = adf::location<adf::kernel>(k[0]);
168 adf::location<adf::parameter>(k[0].param[0]) = tilePos;
169 adf::location<adf::parameter>(k[0].param[0]) = adf::offset(0);
170 adf::location<adf::parameter>(k[0].param[1]) = tilePos;
171 // weights can be padded, not necessarily MCKK
172 // separate bank not required for weights vs bias
173 adf::location<adf::parameter>(k[0].param[1]) = adf::offset((weights.size()*4+31)/32*32);
174 }
175
176};
177
178
// Single-instance conv graph whose weights arrive over a stream (pin[1])
// instead of being stored as graph parameters; only biases are stored. Per
// the project docs this is significantly slower than ConvReluGraph. An
// optional Pad2D kernel is prepended when H0/H1/W0/W1 padding is nonzero.
// NOTE(review): doc extraction dropped original line 206 (constructor
// signature `ConvReluStreamGraph(`) and line 221 (the Pad2D kernel
// create_object<...> argument of pad.push_back) — restore from the repo.
188template <template<int, int, int, int, int, int, int, int, int, int, int, int, int> class CONV,
189 int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
190 int B, int C, int M, int KH, int KW, int GROUP, int IS_RELU,
191 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
192class ConvReluStreamGraph : public adf::graph {
193
194 private:
195 adf::kernel k[1]; // the single conv kernel
196 std::vector<adf::kernel> pad; // optional Pad2D kernel (empty when no padding)
197 static constexpr int PAD_H = INP_H + H0 + H1;
198 static constexpr int PAD_W = INP_W + W0 + W1;
199
200 public:
201 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
202
// pin[0]: input data, pin[1]: streamed weights.
203 adf::port<input> pin[2];
204 adf::port<output> pout[1];
205
// Constructor (signature line lost in extraction): takes only the biases;
// weights are streamed in at runtime via pin[1].
207 std::vector<float> bias
208 ) {
// The padded input window must fit in a single tile's data memory.
209 static_assert(B*C*PAD_H*PAD_W*4 <= TILE_BYTES);
210
211 k[0] = adf::kernel::create_object<CONV<PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
212 adf::source(k[0]) = "conv.cc";
213 adf::headers(k[0]) = {"conv.h"};
214 adf::runtime<ratio>(k[0]) = 0.6;
// Single-buffer the input window (no ping-pong) to halve its footprint.
215 adf::single_buffer(k[0].in[0]);
216
// Heap must hold the weights this kernel variant caches from the stream.
217 set_heap_size<CONV,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[0]);
218
219 if (H0+H1+W0+W1 != 0) {
// NOTE(review): missing line 221 here — the Pad2D kernel
// create_object<...> expression passed to push_back.
220 pad.push_back(
222 adf::source(pad[0]) = "pad.cc";
223 adf::headers(pad[0]) = {"pad.h"};
224 adf::runtime<ratio>(pad[0]) = 0.6;
225
226 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
227 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W*4>> (pad[0].out[0], k[0].in[0]);
228
229 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
230 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
231
232 } else {
233 adf::connect<adf::stream, adf::window<B*C*INP_H*INP_W_PAD*4>> (pin[0], k[0].in[0]);
234 }
235
236 adf::connect<adf::stream> (pin[1], k[0].in[1]); // variable samples per iteration based on kernel
237 adf::connect<adf::stream> (k[0].out[0], pout[0]);
238
239 adf::samples_per_iteration(k[0].out[0]) = B*M*OUT_H*OUT_W_PAD;
240
// Pin the input window buffer at offset 0 of the kernel's own tile.
241 adf::location<adf::buffer>(k[0].in[0]) = adf::location<adf::kernel>(k[0]);
242 adf::location<adf::buffer>(k[0].in[0]) = adf::offset(0);
243 }
244
245};
246
247
// Multi-instance conv graph that stores weights and biases, splitting the
// MCKK weight tensor along the M (output channel) dimension into MCHUNK-sized
// chunks — one conv kernel per chunk, at most 8 chunks — and concatenating
// the per-chunk outputs with a CONCAT graph. Conv kernels are placed on the
// tiles surrounding the concat kernel (tileOffsets).
// NOTE(review): doc extraction dropped original line 294 (the declaration of
// the `concat_g` concat-graph member used below), line 299 (constructor
// signature `ConvReluChunkMGraph(`) and line 337 (the Pad2D kernel
// create_object<...> argument of pad.push_back) — restore from the repo.
259template <
260 template<int, int, int, int, int, int, int, int, int, int, int, int, int> class CONV,
261 template<typename, int, int, int, int> class CONCAT,
262 int IS_BCHW, int MCHUNK,
263 int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
264 int B, int C, int M, int KH, int KW, int GROUP, int IS_RELU,
265 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
266class ConvReluChunkMGraph : public adf::graph {
267
268 private:
269 static constexpr int CHUNK_COUNT = (M + MCHUNK - 1) / MCHUNK; // ceiling
270 static constexpr int CHUNK_REM = M % MCHUNK; // size of the final partial chunk (0 if M divides evenly)
271
272 static constexpr int PAD_H = INP_H + H0 + H1;
273 static constexpr int PAD_W = INP_W + W0 + W1;
274
// Relative tile positions (around the concat kernel) for up to 8 conv kernels.
275 adf::relative_coordinate tileOffsets[8] = {
276 {.col_offset = -1, .row_offset = 0}, // left, right
277 {.col_offset = 1, .row_offset = 0},
278 {.col_offset = -1, .row_offset = 1}, // bottom row
279 {.col_offset = 0, .row_offset = 1},
280 {.col_offset = 1, .row_offset = 1},
281 {.col_offset = -1, .row_offset = -1}, // top row
282 {.col_offset = 0, .row_offset = -1},
283 {.col_offset = 1, .row_offset = -1},
284 };
285
286 adf::kernel k[CHUNK_COUNT]; // one conv kernel per M-chunk
287 std::vector<adf::kernel> pad; // optional Pad2D kernel
288
289 public:
290 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
// Concat geometry depends on layout: BCHW concatenates whole per-chunk
// feature maps, otherwise it interleaves along the channel dimension.
291 static constexpr int CONCAT_W = (IS_BCHW) ? MCHUNK*OUT_H*OUT_W_PAD : MCHUNK;
292 static constexpr int CONCAT_BLOCK = (IS_BCHW) ? M*OUT_H*OUT_W_PAD : M;
293 static constexpr int CONCAT_H = (IS_BCHW) ? B : B*OUT_H*OUT_W_PAD;
295
296 adf::port<input> pin[1];
297 adf::port<output> pout[1];
298
// Constructor (signature line lost in extraction): takes the full MCKK
// weights and M biases, slicing both into per-chunk parameter vectors.
300 std::vector<float> weights,
301 std::vector<float> bias
302 ) {
303 static_assert(CHUNK_COUNT <= 8);
304 static_assert(B*C*PAD_H*PAD_W*4 <= MAX_PARAM_BYTES);
305 assert(weights.size() <= MAX_PARAM_BYTES*8);
306 static_assert(B*M*OUT_H*OUT_W_PAD*4 <= MAX_PARAM_BYTES*8);
307
308 std::vector<float> wChunk;
309 std::vector<float> bChunk;
310 int CKK = weights.size() / M; // per-output-channel weight count
311
312 for (int i = 0; i < CHUNK_COUNT; i++) {
// Last chunk may be partial; zero-pad it back up to MCHUNK entries so
// every kernel instance has identical parameter sizes.
313 int chunkSize = (i*MCHUNK + MCHUNK > M) ? CHUNK_REM : MCHUNK;
314 wChunk = std::vector<float>(weights.begin()+i*MCHUNK*CKK,
315 weights.begin()+(i*MCHUNK+chunkSize)*CKK);
316 wChunk.resize(MCHUNK*CKK, 0);
317 bChunk = std::vector<float>(bias.begin()+i*MCHUNK, bias.begin()+i*MCHUNK+chunkSize);
318 bChunk.resize(MCHUNK, 0);
319
320 k[i] = adf::kernel::create_object<CONV<PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,MCHUNK,KH,KW,GROUP,IS_RELU>>(wChunk, bChunk);
321 adf::source(k[i]) = "conv.cc";
322 adf::headers(k[i]) = {"conv.h"};
323 adf::runtime<ratio>(k[i]) = 0.6;
324
// Place each conv kernel on a tile adjacent to the concat kernel, and
// co-locate its weight/bias parameters (bias after 32-byte-aligned weights).
325 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(concat_g.k[0]) +
326 adf::relative_offset(tileOffsets[i]);
327 adf::location_constraint tilePos = adf::location<adf::kernel>(k[i]);
328 adf::location<adf::parameter>(k[i].param[0]) = tilePos;
329 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
330 adf::location<adf::parameter>(k[i].param[1]) = tilePos;
331 adf::location<adf::parameter>(k[i].param[1]) = adf::offset((MCHUNK*CKK*4+31)/32*32);
332 // input window and output window can be much larger
333 }
334
335 if (H0+H1+W0+W1 != 0) {
// NOTE(review): missing line 337 here — the Pad2D kernel
// create_object<...> expression passed to push_back.
336 pad.push_back(
338 adf::source(pad[0]) = "pad.cc";
339 adf::headers(pad[0]) = {"pad.h"};
340 adf::runtime<ratio>(pad[0]) = 0.6;
341
// Broadcast the padded input stream to every conv kernel's window.
342 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>, adf::stream> (pin[0], pad[0].in[0]);
343 for (int i = 0; i < CHUNK_COUNT; i++)
344 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W*4>> (pad[0].out[0], k[i].in[0]);
345
346 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
347 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
348 } else {
349 for (int i = 0; i < CHUNK_COUNT; i++)
350 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>> (pin[0], k[i].in[0]);
351 }
352
// Gather the per-chunk outputs into the concat graph, then out.
353 for (int i = 0; i < CHUNK_COUNT; i++)
354 adf::connect<adf::window<B*MCHUNK*OUT_H*OUT_W_PAD*4>> (k[i].out[0], concat_g.pin[i]);
355 adf::connect<adf::stream> (concat_g.pout[0], pout[0]);
356 }
357};
358
359
// Multi-instance conv graph that stores only biases and chunks the BCHW input
// along the H dimension (maximum 8 chunks): a SPLIT graph slices the padded
// input into overlapping HCHUNK-row windows (overlap = KH-1 so every output
// row is computable), LCNT conv kernels process the chunks in parallel with
// weights streamed in via pin[1], and a ConcatStreamGraph reassembles the
// output rows. Kernels are placed on tiles surrounding the split kernel.
// NOTE(review): doc extraction dropped original line 386 (the `mSplitGraph`
// alias/typedef built from SPLIT used below), line 418 (constructor signature
// `ConvReluChunkHGraph(`) and line 427 (the Pad2D kernel create_object<...>
// argument of pad.push_back) — restore from the repo.
369template <
370 template<typename, int, int, int, int> class SPLIT,
371 template<int, int, int, int, int, int, int, int, int, int, int, int, int> class CONV,
372 template<typename, int, int, int, int> class CONCAT,
373 int HCHUNK,
374 int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
375 int B, int C, int M, int KH, int KW, int GROUP, int IS_RELU,
376 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
377class ConvReluChunkHGraph : public adf::graph {
378
379 private:
380 static constexpr int PAD_H = INP_H + H0 + H1;
381 static constexpr int PAD_W = INP_W + W0 + W1;
382
383 std::vector<adf::kernel> pad; // optional Pad2D kernel
384
// Adjacent H-chunks must overlap by KH-1 rows so no output row is lost.
385 static constexpr int OVERLAP = KH-1;
387 mSplitGraph split_graph;
388 static constexpr int LCNT = mSplitGraph::LCNT; // number of H-chunks / conv kernels
389
390 adf::kernel k[LCNT];
391
// Output rows produced per chunk, and total output height.
392 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
393 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
394 ConcatStreamGraph<CONCAT, float_t, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
395
// Relative tile positions (around the split kernel) for up to 8 conv kernels.
396 adf::relative_coordinate tileOffsets[8] = {
397 {.col_offset = -1, .row_offset = 1}, // top left, clockwise
398 {.col_offset = 0, .row_offset = 2},
399 {.col_offset = 0, .row_offset = 1},
400 {.col_offset = 1, .row_offset = 0},
401 {.col_offset = 0, .row_offset = -1},
402 {.col_offset = 0, .row_offset = -2},
403 {.col_offset = -1, .row_offset = -1},
404 {.col_offset = -1, .row_offset = 0},
405 };
406
// Relative tile positions for the concat graph's first-level kernels.
407 adf::relative_coordinate concat_k1_offsets[4] = {
408 {.col_offset = -1, .row_offset = 2}, // top left, clockwise
409 {.col_offset = 1, .row_offset = 1},
410 {.col_offset = 1, .row_offset = -1},
411 {.col_offset = -1, .row_offset = -2},
412 };
413
414 public:
// pin[0]: input data, pin[1]: streamed weights (broadcast to all kernels).
415 adf::port<adf::input> pin[2];
416 adf::port<adf::output> pout[1];
417
// Constructor (signature line lost in extraction): takes only the biases.
419 std::vector<float> bias
420 ) {
// Chunk height must be phase-compatible with the stride so chunk
// boundaries land on valid convolution starting rows.
421 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
422 static_assert(LCNT <= 8);
423 static_assert(B*C*PAD_H*PAD_W*4 + B*C*(KH-1)*7*PAD_W*4 <= MAX_PARAM_BYTES*8);
424
425 if (H0+H1+W0+W1 != 0) {
// NOTE(review): missing line 427 here — the Pad2D kernel
// create_object<...> expression passed to push_back.
426 pad.push_back(
428 adf::source(pad[0]) = "pad.cc";
429 adf::headers(pad[0]) = {"pad.h"};
430 adf::runtime<ratio>(pad[0]) = 0.6;
431
432 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
433 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
434
435 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
436 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
437 // split and pad can't be placed on same tile due to stream co-placement constraints
438 } else {
439 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
440 }
441
442 for (int i = 0; i < LCNT; i++) {
443 k[i] = adf::kernel::create_object<CONV<HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
444 adf::source(k[i]) = "conv.cc";
445 adf::headers(k[i]) = {"conv.h"};
446 adf::runtime<ratio>(k[i]) = 0.6;
// Single-buffer each chunk window to halve its memory footprint.
447 adf::single_buffer(k[i].in[0]);
448
449 set_heap_size<CONV,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[i]);
450
451 adf::connect<adf::window<B*C*HCHUNK*PAD_W*4>> (split_graph.pout[i], k[i].in[0]);
452 adf::connect<adf::stream> (pin[1], k[i].in[1]);
453 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
454
455 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
456
// Place conv kernels around the split kernel; pin the bias parameter
// (param[0]) on each kernel's own tile at offset 0.
457 adf::location<adf::kernel>(k[i]) =
458 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(tileOffsets[i]);
459 adf::location_constraint tilePos = adf::location<adf::kernel>(k[i]);
460 adf::location<adf::parameter>(k[i].param[0]) = tilePos; // may bust tiles adjacent to split
461 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
462 }
463 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
464
465 for (int i = 0; i < concat_graph.k1.size(); i++) {
466 adf::location<adf::kernel>(concat_graph.k1[i]) =
467 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(concat_k1_offsets[i]);
468 }
469 }
470};
471
472
// Multi-instance conv graph that stores only biases and chunks the BCHW input
// along the H dimension, using stream-based splitters instead of a split
// graph: each SplitFilterFloatStreamTwice kernel feeds two conv kernels (one
// splitter per pair; an odd LCNT gets a single-output SplitFilterFloatStream
// for the last chunk). Weights stream in via pin[1]; a ConcatStreamGraph
// reassembles the output rows. Explicit tile placement pairs each splitter
// with its two conv kernels and the matching concat kernel.
// NOTE(review): doc extraction dropped original line 511 (constructor
// signature `ConvReluChunkHStreamGraph(`) and line 576 (the Pad2D kernel
// create_object<...> argument of pad.push_back) — restore from the repo.
482template <
483 template<typename, int, int, int, int> class SPLIT,
484 template<int, int, int, int, int, int, int, int, int, int, int, int, int> class CONV,
485 template<typename, int, int, int, int> class CONCAT,
486 int HCHUNK,
487 int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
488 int B, int C, int M, int KH, int KW, int GROUP, int IS_RELU,
489 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
490class ConvReluChunkHStreamGraph : public adf::graph {
491
492 private:
493 static constexpr int PAD_H = INP_H + H0 + H1;
494 static constexpr int PAD_W = INP_W + W0 + W1;
495
496 std::vector<adf::kernel> pad; // optional Pad2D kernel
497
// Chunks overlap by KH-1 rows; LCNT is the resulting chunk count.
498 static constexpr int OVERLAP = KH-1;
499 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
500 adf::kernel split[(LCNT+1)/2]; // one splitter per pair of conv kernels
501 adf::kernel k[LCNT];
502
503 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
504 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
505 ConcatStreamGraph<CONCAT, float_t, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
506
507 public:
// pin[0]: input data, pin[1]: streamed weights (broadcast to all kernels).
508 adf::port<adf::input> pin[2];
509 adf::port<adf::output> pout[1];
510
// Constructor (signature line lost in extraction): takes only the biases.
512 std::vector<float> bias
513 ) {
// Chunk height must be phase-compatible with the stride.
514 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
515
// Two-output splitters: splitter i produces chunks 2i and 2i+1.
516 for (int i = 0; i < LCNT/2; i++) {
517 split[i] = adf::kernel::create_object<SplitFilterFloatStreamTwice<float_t, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(i*2);
518 adf::source(split[i]) = "split.cc";
519 adf::headers(split[i]) = {"split.h"};
520 adf::runtime<ratio>(split[i]) = 0.6;
521
522 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
523 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
524 adf::samples_per_iteration(split[i].out[1]) = B*C*HCHUNK*PAD_W;
525 }
// Odd chunk count: the last splitter emits only the final chunk (LCNT-1).
526 if ((LCNT & 0x1) == 1) {
527 int i = (LCNT+1)/2 - 1;
528 split[i] = adf::kernel::create_object<SplitFilterFloatStream<float_t, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(LCNT-1);
529 adf::source(split[i]) = "split.cc";
530 adf::headers(split[i]) = {"split.h"};
531 adf::runtime<ratio>(split[i]) = 0.6;
532
533 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
534 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
535 }
536
537 for (int i = 0; i < LCNT; i++) {
538 k[i] = adf::kernel::create_object<CONV<HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
539 adf::source(k[i]) = "conv.cc";
540 adf::headers(k[i]) = {"conv.h"};
541 adf::runtime<ratio>(k[i]) = 0.6;
// Single-buffer each chunk window to halve its memory footprint.
542 adf::single_buffer(k[i].in[0]);
543
544 set_heap_size<CONV,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[i]);
545
// Conv kernel i reads output (i mod 2) of splitter (i / 2).
546 adf::connect<adf::stream, adf::window<B*C*HCHUNK*PAD_W*4>> (split[i/2].out[i&0x1], k[i].in[0]);
547 adf::connect<adf::stream> (pin[1], k[i].in[1]);
548 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
549
550 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
551
552 adf::location<adf::buffer>(k[i].in[0]) = adf::location<adf::kernel>(k[i]);
553 adf::location<adf::buffer>(k[i].in[0]) = {adf::offset(0)};
554 }
555
// Placement: splitter i sits one column right of conv kernel 2i; the odd
// partner kernel (2i+1) sits one row below the splitter. Stacks and bias
// parameters share the splitter's tile to economize memory.
556 for (int i = 0; i < (LCNT+1)/2; i++) {
557 adf::location<adf::kernel>(split[i]) =
558 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=1, .row_offset=0});
559
560 adf::location_constraint sTilePos = adf::location<adf::kernel>(split[i]);
561 adf::location<adf::stack>(split[i]) = sTilePos;
562 adf::location<adf::stack>(k[i*2]) = sTilePos;
563 adf::location<adf::parameter>(k[i*2].param[0]) = sTilePos;
564
565 if (i*2+1 < LCNT) {
566 adf::location<adf::kernel>(k[i*2+1]) = sTilePos + adf::relative_offset({.col_offset=0, .row_offset=1});
567 }
568 // if (i*2+2 < LCNT) {
569 // adf::location<adf::kernel>(k[i*2+2]) = sTilePos + adf::relative_offset({.col_offset=2, .row_offset=0});
570 // }
571 }
572 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
573
574 if (H0+H1+W0+W1 != 0) {
// NOTE(review): missing line 576 here — the Pad2D kernel
// create_object<...> expression passed to push_back.
575 pad.push_back(
577 adf::source(pad[0]) = "pad.cc";
578 adf::headers(pad[0]) = {"pad.h"};
579 adf::runtime<ratio>(pad[0]) = 0.6;
580
// Broadcast the padded stream to every splitter.
581 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
582 for (int i = 0; i < (LCNT+1)/2; i++)
583 adf::connect<adf::stream> (pad[0].out[0], split[i].in[0]);
584
585 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
586 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
587 // split and pad can't be placed on same tile due to stream co-placement constraints
588 } else {
589 for (int i = 0; i < (LCNT+1)/2; i++)
590 adf::connect<adf::stream> (pin[0], split[i].in[0]);
591 }
592
// Concat first-level kernel i shares a tile region below conv kernel 2i and
// co-hosts the odd kernel's bias parameter and stack.
593 for (int i = 0; i < concat_graph.k1.size(); i++) {
594 adf::location<adf::kernel>(concat_graph.k1[i]) =
595 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
596
597 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
598 adf::location<adf::parameter>(k[i*2+1].param[0]) = cTilePos;
599 adf::location<adf::stack>(k[i*2+1]) = cTilePos;
600 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
601 }
602 }
603};
604
605
// Multi-instance conv graph that stores only biases and chunks the BCHW input
// along the H dimension, delivering chunks to the conv kernels over packet
// streams from a split graph (pktstream connections share physical stream
// channels among the LCNT destinations). Weights stream in via pin[1]; a
// ConcatStreamGraph reassembles the output rows; pout[1] additionally taps
// the padded input stream when padding is enabled.
// NOTE(review): doc extraction dropped original line 632 (the `mSplitGraph`
// alias/typedef built from SPLIT used below), line 646 (constructor signature
// `ConvReluChunkHPktStreamGraph(`) and line 679 (the Pad2D kernel
// create_object<...> argument of pad.push_back) — restore from the repo.
615template <
616 template<typename, int, int, int, int> class SPLIT,
617 template<int, int, int, int, int, int, int, int, int, int, int, int, int> class CONV,
618 template<typename, int, int, int, int> class CONCAT,
619 int HCHUNK,
620 int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
621 int B, int C, int M, int KH, int KW, int GROUP, int IS_RELU,
622 int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
623class ConvReluChunkHPktStreamGraph : public adf::graph {
624
625 private:
626 static constexpr int PAD_H = INP_H + H0 + H1;
627 static constexpr int PAD_W = INP_W + W0 + W1;
628
629 std::vector<adf::kernel> pad; // optional Pad2D kernel
630
// Chunks overlap by KH-1 rows; LCNT is the resulting chunk count.
631 static constexpr int OVERLAP = KH-1;
633 mSplitGraph split_graph;
634
635 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
636 adf::kernel k[LCNT];
637
638 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
639 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
640 ConcatStreamGraph<CONCAT, float_t, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
641
642 public:
// pin[0]: input data, pin[1]: streamed weights; pout[1] re-emits the padded
// input stream (only connected when padding is enabled).
643 adf::port<adf::input> pin[2];
644 adf::port<adf::output> pout[2];
645
// Constructor (signature line lost in extraction): takes only the biases.
647 std::vector<float> bias
648 ) {
// Chunk height must be phase-compatible with the stride.
649 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
650
651 for (int i = 0; i < LCNT; i++) {
652 k[i] = adf::kernel::create_object<CONV<HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
653 adf::source(k[i]) = "conv.cc";
654 adf::headers(k[i]) = {"conv.h"};
655 adf::runtime<ratio>(k[i]) = 0.6;
656
657 set_heap_size<CONV,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[i]);
658
// Each chunk arrives over a packet stream from the split graph.
659 adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
660 adf::connect<adf::stream> (pin[1], k[i].in[1]);
661 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
662
663 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
664
// Manual placement: stack odd kernels below their even predecessor;
// kernel 2 gets an extra row of clearance.
665 if ((i&0x1) == 1) {
666 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
667 }
668 if (i == 2) {
669 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=2});
670 }
671 adf::location<adf::stack>(k[i]) = adf::location<adf::kernel>(k[i]);
672
673 }
674
675 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
676
677 if (H0+H1+W0+W1 != 0) {
// NOTE(review): missing line 679 here — the Pad2D kernel
// create_object<...> expression passed to push_back.
678 pad.push_back(
680 adf::source(pad[0]) = "pad.cc";
681 adf::headers(pad[0]) = {"pad.h"};
682 adf::runtime<ratio>(pad[0]) = 0.6;
683
684 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
685 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
// Tap the padded stream out of the graph as well (pout[1]).
686 adf::connect<adf::stream> (pad[0].out[0], pout[1]);
687
688 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
689 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
690 // split and pad can't be placed on same tile due to stream co-placement constraints
691 } else {
692 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
693 }
694
// Concat first-level kernel i sits below conv kernel 2i+1 and co-hosts that
// kernel's bias parameter plus its own stack.
695 for (int i = 0; i < concat_graph.k1.size(); i++) {
696 adf::location<adf::kernel>(concat_graph.k1[i]) =
697 adf::location<adf::kernel>(k[i*2+1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
698
699 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
700 adf::location<adf::parameter>(k[i*2+1].param[0]) = cTilePos;
701 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
702 }
703 }
704};
708#endif // __CONV_GRAPH_H__
Graph wrapper for arbitrary concat kernel implementation and lanes.
Definition graph_concat.h:37
Vector stream implementation for OUT_W == 4 < 8, stores biases, requires KH==KW==1,...
Definition conv.h:450
Vector stream implementation for BCHW, stores biases, requires KH==KW==1, INP_W%4==0,...
Definition conv.h:536
Vector stream implementation for BCHW, stores biases, requires KH==KW==1, INP_W%4==0,...
Definition conv.h:409
Vector stream implementation for OUT_W == 4 < 8, stores biases, requires KW<=3, INP_W%4==0,...
Definition conv.h:368
Vector stream implementation for BCHW, stores biases, requires KW<=3, INP_W%4==0, OUT_W_PAD%(8|4)==0,...
Definition conv.h:490
Vector stream implementation for BCHW, stores biases, requires KH==KW==3, INP_W%4==0,...
Definition conv.h:325
Vector stream implementation for BCHW, stores biases, requires KW<=3, INP_W%4==0, OUT_W_PAD%(8|4)==0,...
Definition conv.h:281
Scalar stream implementation for BCHW, stores biases, requires GROUP==1, ConvHx8ReluStream<28,...
Definition conv.h:238
Multi-instance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_conv.h:377
Multi-instance graph that stores biases, chunks BCHW by H dimension.
Definition graph_conv.h:623
Multi-instance graph that stores biases, chunks BCHW by H dimension.
Definition graph_conv.h:490
Multi-instance graph that stores weights and biases, chunks MCKK weights by M dimension,...
Definition graph_conv.h:266
Single instance graph that stores weights and biases.
Definition graph_conv.h:119
Scalar stream implementation for BCHW, stores biases, requires GROUP==1, ConvReluScalarStream<26,...
Definition conv.h:202
Single instance graph that streams weights and biases, significantly slower.
Definition graph_conv.h:192
Vector implementation for Float Pad2D Pad2DStreamFloat<f,2,30,30,32,1,1,1,1> total = 2304.
Definition pad.h:23
Graph wrapper for two stream split.
Definition graph_split.h:185
Graph wrapper for arbitrary split kernel implementation and lanes.
Definition graph_split.h:37