1#ifndef __CONV_GRAPH_H__
2#define __CONV_GRAPH_H__
10#include "graph_concat.h"
11#include "graph_split.h"
12#include "graph_utils.h"
// set_heap_size: assigns adf::heap_size(k) according to which concrete kernel
// template CONV<...> resolves to.
//
// Template parameters: CONV is a class template taking 13 ints; the remaining
// ints are forwarded unchanged to CONV and to each candidate kernel template
// for the type comparison.
// Parameter k: the adf::kernel whose heap size is assigned.
//
// NOTE(review): this listing carries fused line numbers and is missing lines
// (numbering jumps 18 -> 21, 24 -> 28, ...). Only the last branch shows its
// `else if ((std::is_same<` head; the earlier branches presumably start with a
// similar std::is_same test - confirm against the original header.
//
// Heap sizes assigned in the visible branches:
//   ConvReluScalarStream                       C/GROUP*KH*KW*4 + 1024
//   ConvHx4ReluStream / ConvHx4Out4ReluStream  C/GROUP*roundup4(KH*KW)*4 + 1024
//   ConvHx8ReluStream                          C/GROUP*KH*8*4 + OUT_W_PAD*4 + 1024
//   ConvHx4ReluStreamMultiRow                  C/GROUP*roundup4(KH*KW)*4 + OUT_W_PAD*4 + 1024
//   Conv1x1ReluStream / Conv1x1Out4ReluStream  roundup4(C/GROUP)*4 + 1024
//   ConvHx4ReluPktStream / Conv1x1ReluPktStream  31712 (fixed)
// where roundup4(x) is (x+3)/4*4 in integer arithmetic.
// The trailing *4 is presumably sizeof(float) and 1024 a fixed margin - TODO confirm.
15template <
template<
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class CONV,
16 int INP_H,
int INP_W,
int OUT_W,
int OUT_W_PAD,
STEP_H_PLACEHOLDER
115template <
template<
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class CONV,
116 int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
117 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
int IS_RELU,
118 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
123 std::vector<adf::kernel> pad;
124 static constexpr int PAD_H = INP_H + H0 + H1;
125 static constexpr int PAD_W = INP_W + W0 + W1;
128 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
130 adf::port<input> pin[1];
131 adf::port<output> pout[1];
134 std::vector<float> weights,
135 std::vector<float> bias,
138 static_assert(B*C*PAD_H*PAD_W*4 <= MAX_PARAM_BYTES);
139 assert(weights.size()*4 <= MAX_PARAM_BYTES);
140 static_assert(B*M*OUT_H*OUT_W_PAD*4 <= MAX_PARAM_BYTES);
142 k[0] = adf::kernel::create_object<CONV<PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(weights, bias);
143 adf::source(k[0]) =
"conv.cc";
144 adf::headers(k[0]) = {
"conv.h"};
145 adf::runtime<ratio>(k[0]) = 0.6;
146 adf::repetition_count(k[0]) = repeat_cnt;
148 if (H0+H1+W0+W1 != 0) {
151 adf::source(pad[0]) =
"pad.cc";
152 adf::headers(pad[0]) = {
"pad.h"};
153 adf::runtime<ratio>(pad[0]) = 0.6;
154 adf::repetition_count(pad[0]) = repeat_cnt;
156 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>, adf::stream> (pin[0], pad[0].in[0]);
157 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W*4>> (pad[0].out[0], k[0].in[0]);
159 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
160 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
162 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>> (pin[0], k[0].in[0]);
165 adf::connect<adf::window<B*M*OUT_H*OUT_W_PAD*4>> (k[0].out[0], pout[0]);
167 adf::location_constraint tilePos = adf::location<adf::kernel>(k[0]);
168 adf::location<adf::parameter>(k[0].param[0]) = tilePos;
169 adf::location<adf::parameter>(k[0].param[0]) = adf::offset(0);
170 adf::location<adf::parameter>(k[0].param[1]) = tilePos;
173 adf::location<adf::parameter>(k[0].param[1]) = adf::offset((weights.size()*4+31)/32*32);
188template <
template<
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class CONV,
189 int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
190 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
int IS_RELU,
191 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
196 std::vector<adf::kernel> pad;
197 static constexpr int PAD_H = INP_H + H0 + H1;
198 static constexpr int PAD_W = INP_W + W0 + W1;
201 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
203 adf::port<input> pin[2];
204 adf::port<output> pout[1];
207 std::vector<float> bias
209 static_assert(B*C*PAD_H*PAD_W*4 <= TILE_BYTES);
211 k[0] = adf::kernel::create_object<CONV<PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
212 adf::source(k[0]) =
"conv.cc";
213 adf::headers(k[0]) = {
"conv.h"};
214 adf::runtime<ratio>(k[0]) = 0.6;
215 adf::single_buffer(k[0].in[0]);
217 set_heap_size<CONV,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[0]);
219 if (H0+H1+W0+W1 != 0) {
222 adf::source(pad[0]) =
"pad.cc";
223 adf::headers(pad[0]) = {
"pad.h"};
224 adf::runtime<ratio>(pad[0]) = 0.6;
226 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
227 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W*4>> (pad[0].out[0], k[0].in[0]);
229 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
230 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
233 adf::connect<adf::stream, adf::window<B*C*INP_H*INP_W_PAD*4>> (pin[0], k[0].in[0]);
236 adf::connect<adf::stream> (pin[1], k[0].in[1]);
237 adf::connect<adf::stream> (k[0].out[0], pout[0]);
239 adf::samples_per_iteration(k[0].out[0]) = B*M*OUT_H*OUT_W_PAD;
241 adf::location<adf::buffer>(k[0].in[0]) = adf::location<adf::kernel>(k[0]);
242 adf::location<adf::buffer>(k[0].in[0]) = adf::offset(0);
260 template<
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class CONV,
261 template<
typename,
int,
int,
int,
int>
class CONCAT,
262 int IS_BCHW,
int MCHUNK,
263 int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
264 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
int IS_RELU,
265 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
269 static constexpr int CHUNK_COUNT = (M + MCHUNK - 1) / MCHUNK;
270 static constexpr int CHUNK_REM = M % MCHUNK;
272 static constexpr int PAD_H = INP_H + H0 + H1;
273 static constexpr int PAD_W = INP_W + W0 + W1;
275 adf::relative_coordinate tileOffsets[8] = {
276 {.col_offset = -1, .row_offset = 0},
277 {.col_offset = 1, .row_offset = 0},
278 {.col_offset = -1, .row_offset = 1},
279 {.col_offset = 0, .row_offset = 1},
280 {.col_offset = 1, .row_offset = 1},
281 {.col_offset = -1, .row_offset = -1},
282 {.col_offset = 0, .row_offset = -1},
283 {.col_offset = 1, .row_offset = -1},
286 adf::kernel k[CHUNK_COUNT];
287 std::vector<adf::kernel> pad;
290 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
291 static constexpr int CONCAT_W = (IS_BCHW) ? MCHUNK*OUT_H*OUT_W_PAD : MCHUNK;
292 static constexpr int CONCAT_BLOCK = (IS_BCHW) ? M*OUT_H*OUT_W_PAD : M;
293 static constexpr int CONCAT_H = (IS_BCHW) ? B : B*OUT_H*OUT_W_PAD;
296 adf::port<input> pin[1];
297 adf::port<output> pout[1];
300 std::vector<float> weights,
301 std::vector<float> bias
303 static_assert(CHUNK_COUNT <= 8);
304 static_assert(B*C*PAD_H*PAD_W*4 <= MAX_PARAM_BYTES);
305 assert(weights.size() <= MAX_PARAM_BYTES*8);
306 static_assert(B*M*OUT_H*OUT_W_PAD*4 <= MAX_PARAM_BYTES*8);
308 std::vector<float> wChunk;
309 std::vector<float> bChunk;
310 int CKK = weights.size() / M;
312 for (
int i = 0; i < CHUNK_COUNT; i++) {
313 int chunkSize = (i*MCHUNK + MCHUNK > M) ? CHUNK_REM : MCHUNK;
314 wChunk = std::vector<float>(weights.begin()+i*MCHUNK*CKK,
315 weights.begin()+(i*MCHUNK+chunkSize)*CKK);
316 wChunk.resize(MCHUNK*CKK, 0);
317 bChunk = std::vector<float>(bias.begin()+i*MCHUNK, bias.begin()+i*MCHUNK+chunkSize);
318 bChunk.resize(MCHUNK, 0);
320 k[i] = adf::kernel::create_object<CONV<PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,MCHUNK,KH,KW,GROUP,IS_RELU>>(wChunk, bChunk);
321 adf::source(k[i]) =
"conv.cc";
322 adf::headers(k[i]) = {
"conv.h"};
323 adf::runtime<ratio>(k[i]) = 0.6;
325 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(concat_g.k[0]) +
326 adf::relative_offset(tileOffsets[i]);
327 adf::location_constraint tilePos = adf::location<adf::kernel>(k[i]);
328 adf::location<adf::parameter>(k[i].param[0]) = tilePos;
329 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
330 adf::location<adf::parameter>(k[i].param[1]) = tilePos;
331 adf::location<adf::parameter>(k[i].param[1]) = adf::offset((MCHUNK*CKK*4+31)/32*32);
335 if (H0+H1+W0+W1 != 0) {
338 adf::source(pad[0]) =
"pad.cc";
339 adf::headers(pad[0]) = {
"pad.h"};
340 adf::runtime<ratio>(pad[0]) = 0.6;
342 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>, adf::stream> (pin[0], pad[0].in[0]);
343 for (
int i = 0; i < CHUNK_COUNT; i++)
344 adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W*4>> (pad[0].out[0], k[i].in[0]);
346 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
347 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
349 for (
int i = 0; i < CHUNK_COUNT; i++)
350 adf::connect<adf::window<B*C*INP_H*INP_W_PAD*4>> (pin[0], k[i].in[0]);
353 for (
int i = 0; i < CHUNK_COUNT; i++)
354 adf::connect<adf::window<B*MCHUNK*OUT_H*OUT_W_PAD*4>> (k[i].out[0], concat_g.pin[i]);
355 adf::connect<adf::stream> (concat_g.pout[0], pout[0]);
370 template<
typename,
int,
int,
int,
int>
class SPLIT,
371 template<
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class CONV,
372 template<
typename,
int,
int,
int,
int>
class CONCAT,
374 int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
375 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
int IS_RELU,
376 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
380 static constexpr int PAD_H = INP_H + H0 + H1;
381 static constexpr int PAD_W = INP_W + W0 + W1;
383 std::vector<adf::kernel> pad;
385 static constexpr int OVERLAP = KH-1;
388 static constexpr int LCNT = mSplitGraph::LCNT;
392 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
393 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
394 ConcatStreamGraph<CONCAT, float_t, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
396 adf::relative_coordinate tileOffsets[8] = {
397 {.col_offset = -1, .row_offset = 1},
398 {.col_offset = 0, .row_offset = 2},
399 {.col_offset = 0, .row_offset = 1},
400 {.col_offset = 1, .row_offset = 0},
401 {.col_offset = 0, .row_offset = -1},
402 {.col_offset = 0, .row_offset = -2},
403 {.col_offset = -1, .row_offset = -1},
404 {.col_offset = -1, .row_offset = 0},
407 adf::relative_coordinate concat_k1_offsets[4] = {
408 {.col_offset = -1, .row_offset = 2},
409 {.col_offset = 1, .row_offset = 1},
410 {.col_offset = 1, .row_offset = -1},
411 {.col_offset = -1, .row_offset = -2},
415 adf::port<adf::input> pin[2];
416 adf::port<adf::output> pout[1];
419 std::vector<float> bias
421 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
422 static_assert(LCNT <= 8);
423 static_assert(B*C*PAD_H*PAD_W*4 + B*C*(KH-1)*7*PAD_W*4 <= MAX_PARAM_BYTES*8);
425 if (H0+H1+W0+W1 != 0) {
428 adf::source(pad[0]) =
"pad.cc";
429 adf::headers(pad[0]) = {
"pad.h"};
430 adf::runtime<ratio>(pad[0]) = 0.6;
432 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
433 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
435 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
436 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
439 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
442 for (
int i = 0; i < LCNT; i++) {
443 k[i] = adf::kernel::create_object<CONV<HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
444 adf::source(k[i]) =
"conv.cc";
445 adf::headers(k[i]) = {
"conv.h"};
446 adf::runtime<ratio>(k[i]) = 0.6;
447 adf::single_buffer(k[i].in[0]);
449 set_heap_size<CONV,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[i]);
451 adf::connect<adf::window<B*C*HCHUNK*PAD_W*4>> (split_graph.pout[i], k[i].in[0]);
452 adf::connect<adf::stream> (pin[1], k[i].in[1]);
453 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
455 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
457 adf::location<adf::kernel>(k[i]) =
458 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(tileOffsets[i]);
459 adf::location_constraint tilePos = adf::location<adf::kernel>(k[i]);
460 adf::location<adf::parameter>(k[i].param[0]) = tilePos;
461 adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
463 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
465 for (
int i = 0; i < concat_graph.k1.size(); i++) {
466 adf::location<adf::kernel>(concat_graph.k1[i]) =
467 adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(concat_k1_offsets[i]);
483 template<
typename,
int,
int,
int,
int>
class SPLIT,
484 template<
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class CONV,
485 template<
typename,
int,
int,
int,
int>
class CONCAT,
487 int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
488 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
int IS_RELU,
489 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
493 static constexpr int PAD_H = INP_H + H0 + H1;
494 static constexpr int PAD_W = INP_W + W0 + W1;
496 std::vector<adf::kernel> pad;
498 static constexpr int OVERLAP = KH-1;
499 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
500 adf::kernel split[(LCNT+1)/2];
503 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
504 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
505 ConcatStreamGraph<CONCAT, float_t, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
508 adf::port<adf::input> pin[2];
509 adf::port<adf::output> pout[1];
512 std::vector<float> bias
514 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
516 for (
int i = 0; i < LCNT/2; i++) {
517 split[i] = adf::kernel::create_object<SplitFilterFloatStreamTwice<float_t, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(i*2);
518 adf::source(split[i]) =
"split.cc";
519 adf::headers(split[i]) = {
"split.h"};
520 adf::runtime<ratio>(split[i]) = 0.6;
522 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
523 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
524 adf::samples_per_iteration(split[i].out[1]) = B*C*HCHUNK*PAD_W;
526 if ((LCNT & 0x1) == 1) {
527 int i = (LCNT+1)/2 - 1;
528 split[i] = adf::kernel::create_object<SplitFilterFloatStream<float_t, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(LCNT-1);
529 adf::source(split[i]) =
"split.cc";
530 adf::headers(split[i]) = {
"split.h"};
531 adf::runtime<ratio>(split[i]) = 0.6;
533 adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
534 adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
537 for (
int i = 0; i < LCNT; i++) {
538 k[i] = adf::kernel::create_object<CONV<HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
539 adf::source(k[i]) =
"conv.cc";
540 adf::headers(k[i]) = {
"conv.h"};
541 adf::runtime<ratio>(k[i]) = 0.6;
542 adf::single_buffer(k[i].in[0]);
544 set_heap_size<CONV,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[i]);
546 adf::connect<adf::stream, adf::window<B*C*HCHUNK*PAD_W*4>> (split[i/2].out[i&0x1], k[i].in[0]);
547 adf::connect<adf::stream> (pin[1], k[i].in[1]);
548 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
550 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
552 adf::location<adf::buffer>(k[i].in[0]) = adf::location<adf::kernel>(k[i]);
553 adf::location<adf::buffer>(k[i].in[0]) = {adf::offset(0)};
556 for (
int i = 0; i < (LCNT+1)/2; i++) {
557 adf::location<adf::kernel>(split[i]) =
558 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=1, .row_offset=0});
560 adf::location_constraint sTilePos = adf::location<adf::kernel>(split[i]);
561 adf::location<adf::stack>(split[i]) = sTilePos;
562 adf::location<adf::stack>(k[i*2]) = sTilePos;
563 adf::location<adf::parameter>(k[i*2].param[0]) = sTilePos;
566 adf::location<adf::kernel>(k[i*2+1]) = sTilePos + adf::relative_offset({.col_offset=0, .row_offset=1});
572 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
574 if (H0+H1+W0+W1 != 0) {
577 adf::source(pad[0]) =
"pad.cc";
578 adf::headers(pad[0]) = {
"pad.h"};
579 adf::runtime<ratio>(pad[0]) = 0.6;
581 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
582 for (
int i = 0; i < (LCNT+1)/2; i++)
583 adf::connect<adf::stream> (pad[0].out[0], split[i].in[0]);
585 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
586 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
589 for (
int i = 0; i < (LCNT+1)/2; i++)
590 adf::connect<adf::stream> (pin[0], split[i].in[0]);
593 for (
int i = 0; i < concat_graph.k1.size(); i++) {
594 adf::location<adf::kernel>(concat_graph.k1[i]) =
595 adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});
597 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
598 adf::location<adf::parameter>(k[i*2+1].param[0]) = cTilePos;
599 adf::location<adf::stack>(k[i*2+1]) = cTilePos;
600 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
616 template<
typename,
int,
int,
int,
int>
class SPLIT,
617 template<
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int,
int>
class CONV,
618 template<
typename,
int,
int,
int,
int>
class CONCAT,
620 int INP_H,
int INP_W,
int INP_W_PAD,
int OUT_W,
int OUT_W_PAD,
int STEP_H,
int STEP_W,
621 int B,
int C,
int M,
int KH,
int KW,
int GROUP,
int IS_RELU,
622 int H0 = 0,
int H1 = 0,
int W0 = 0,
int W1 = 0>
626 static constexpr int PAD_H = INP_H + H0 + H1;
627 static constexpr int PAD_W = INP_W + W0 + W1;
629 std::vector<adf::kernel> pad;
631 static constexpr int OVERLAP = KH-1;
635 static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
638 static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
639 static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
640 ConcatStreamGraph<CONCAT, float_t, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;
643 adf::port<adf::input> pin[2];
644 adf::port<adf::output> pout[2];
647 std::vector<float> bias
649 static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
651 for (
int i = 0; i < LCNT; i++) {
652 k[i] = adf::kernel::create_object<CONV<HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>>(bias);
653 adf::source(k[i]) =
"conv.cc";
654 adf::headers(k[i]) = {
"conv.h"};
655 adf::runtime<ratio>(k[i]) = 0.6;
657 set_heap_size<CONV,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP,IS_RELU>(k[i]);
659 adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
660 adf::connect<adf::stream> (pin[1], k[i].in[1]);
661 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
663 adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
666 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
669 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=2});
671 adf::location<adf::stack>(k[i]) = adf::location<adf::kernel>(k[i]);
675 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
677 if (H0+H1+W0+W1 != 0) {
680 adf::source(pad[0]) =
"pad.cc";
681 adf::headers(pad[0]) = {
"pad.h"};
682 adf::runtime<ratio>(pad[0]) = 0.6;
684 adf::connect<adf::stream> (pin[0], pad[0].in[0]);
685 adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);
686 adf::connect<adf::stream> (pad[0].out[0], pout[1]);
688 adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
689 adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
692 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
695 for (
int i = 0; i < concat_graph.k1.size(); i++) {
696 adf::location<adf::kernel>(concat_graph.k1[i]) =
697 adf::location<adf::kernel>(k[i*2+1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
699 adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
700 adf::location<adf::parameter>(k[i*2+1].param[0]) = cTilePos;
701 adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
Graph wrapper for arbitrary concat kernel implementation and lanes.
Definition graph_concat.h:37
Vector stream implementation for OUT_W == 4 < 8, stores biases, requires KH==KW==1,...
Definition conv.h:450
Vector stream implementation for BCHW, stores biases, requires KH==KW==1, INP_W%4==0,...
Definition conv.h:536
Vector stream implementation for BCHW, stores biases, requires KH==KW==1, INP_W%4==0,...
Definition conv.h:409
Vector stream implementation for OUT_W == 4 < 8, stores biases, requires KW<=3, INP_W%4==0,...
Definition conv.h:368
Vector stream implementation for BCHW, stores biases, requires KW<=3, INP_W%4==0, OUT_W_PAD%(8|4)==0,...
Definition conv.h:490
Vector stream implementation for BCHW, stores biases, requires KH==KW==3, INP_W%4==0,...
Definition conv.h:325
Vector stream implementation for BCHW, stores biases, requires KW<=3, INP_W%4==0, OUT_W_PAD%(8|4)==0,...
Definition conv.h:281
Scalar stream implementation for BCHW, stores biases, requires GROUP==1, ConvHx8ReluStream<28,...
Definition conv.h:238
Multiinstance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_conv.h:377
Multiinstance graph that stores biases, chunks BCHW by H dimension.
Definition graph_conv.h:623
Multiinstance graph that stores biases, chunks BCHW by H dimension.
Definition graph_conv.h:490
Multiinstance graph that stores weights and biases, chunks MCKK weights by M dimension,...
Definition graph_conv.h:266
Single instance graph that stores weights and biases.
Definition graph_conv.h:119
Scalar stream implementation for BCHW, stores biases, requires GROUP==1, ConvReluScalarStream<26,...
Definition conv.h:202
Single instance graph that streams weights and biases, significantly slower.
Definition graph_conv.h:192
Vector implementation for Float Pad2D Pad2DStreamFloat<f,2,30,30,32,1,1,1,1> total = 2304.
Definition pad.h:23
Graph wrapper for two stream split.
Definition graph_split.h:185
Graph wrapper for arbitrary split kernel implementation and lanes.
Definition graph_split.h:37