// Graph-level ports: one input port (pin[0]) and one output port (pout[0]).
41 adf::port<input> pin[1];
42 adf::port<output> pout[1];
// Create the single TRANSPOSE kernel, parameterised on the full H dimension,
// and bind its source and header files.
45 k[0] = adf::kernel::create_object<TRANSPOSE<TT, B, H, W, C, PAD_W>>();
46 adf::source(k[0]) =
"transpose.cc";
47 adf::headers(k[0]) = {
"transpose.h"};
// Runtime ratio for this kernel is fixed at 0.6.
48 adf::runtime<ratio>(k[0]) = 0.6;
// Window connections: the input window is sized with PAD_W
// (B*H*PAD_W*C elements of TT), the output window with W
// (B*H*W*C elements of TT).
// NOTE(review): presumably the kernel strips the W padding — confirm in transpose.cc.
50 adf::connect<adf::window<B*H*PAD_W*C*
sizeof(TT)>> (pin[0], k[0].in[0]);
51 adf::connect<adf::window<B*H*W*C*
sizeof(TT)>> (k[0].out[0], pout[0]);
// Number of chunks along H: H / HCHUNK (integer division).
72 static constexpr int LCNT = H / HCHUNK;
// Split kernels: LCNT/2 rounded up, so each entry can serve up to two chunks.
73 adf::kernel split[(LCNT+1)/2];
// Sub-graph with LCNT inputs (pin[0..LCNT-1]) and one output (pout[0]) that
// merges the per-chunk streams.
// NOTE(review): the element type here is hard-coded to float_t while the
// TRANSPOSE kernels are parameterised on TT — confirm TT is always a 4-byte float.
75 ConcatStreamGraph<ConcatFloatStreamWithStall, float_t, LCNT, B*C, HCHUNK*W, H*W> concat_graph;
// Graph-level ports: one input port and one output port.
78 adf::port<input> pin[1];
79 adf::port<output> pout[1];
// Compile-time constraints on the chunking parameters:
//  - H must divide evenly into chunks of HCHUNK,
//  - at most 8 chunks are supported,
//  - one chunk (B*HCHUNK*W*C elements at 4 bytes each) must not exceed TILE_BYTES.
// NOTE(review): the size check uses a literal 4 rather than sizeof(TT) —
// presumably assumes 4-byte elements; confirm.
82 static_assert(H % HCHUNK == 0);
83 static_assert(LCNT <= 8);
84 static_assert(B*HCHUNK*W*C*4 <= TILE_BYTES);
// Create LCNT/2 two-output split kernels. Kernel i is constructed with the
// argument i*2.
// NOTE(review): i*2 is presumably the index of the first of the two chunks it
// extracts — confirm against split.h.
86 for (
int i = 0; i < LCNT/2; i++) {
87 split[i] = adf::kernel::create_object<SplitFilterFloatStreamTwice<float_t, B, H*W*C, HCHUNK*W*C, 0>>(i*2);
88 adf::source(split[i]) =
"split.cc";
89 adf::headers(split[i]) = {
"split.h"};
90 adf::runtime<ratio>(split[i]) = 0.6;
// Every split kernel is connected to the same graph input pin[0] as a stream.
92 adf::connect<adf::stream> (pin[0], split[i].in[0]);
// Per-iteration sample counts: B*H*W*C on the input, B*HCHUNK*W*C on each of
// the two outputs.
94 adf::samples_per_iteration(split[i].in[0]) = B*H*W*C;
95 adf::samples_per_iteration(split[i].out[0]) = B*HCHUNK*W*C;
96 adf::samples_per_iteration(split[i].out[1]) = B*HCHUNK*W*C;
// When LCNT is odd, the last slot of split[] gets a single-output split kernel
// constructed with the argument LCNT-1.
// NOTE(review): LCNT-1 is presumably the index of the remaining chunk —
// confirm against split.h.
98 if ((LCNT & 0x1) == 1) {
// Index of the last entry in split[], whose size is (LCNT+1)/2.
99 int i = (LCNT+1)/2 - 1;
100 split[i] = adf::kernel::create_object<SplitFilterFloatStream<float_t, B, H*W*C, HCHUNK*W*C, 0>>(LCNT-1);
101 adf::source(split[i]) =
"split.cc";
102 adf::headers(split[i]) = {
"split.h"};
103 adf::runtime<ratio>(split[i]) = 0.6;
// Connected to the same graph input pin[0] as the paired split kernels.
105 adf::connect<adf::stream> (pin[0], split[i].in[0]);
// Per-iteration sample counts: B*H*W*C on the input, B*HCHUNK*W*C on the single output.
107 adf::samples_per_iteration(split[i].in[0]) = B*H*W*C;
108 adf::samples_per_iteration(split[i].out[0]) = B*HCHUNK*W*C;
// Create one TRANSPOSE kernel per chunk, parameterised on HCHUNK instead of H.
111 for (
int i = 0; i < LCNT; i++) {
112 k[i] = adf::kernel::create_object<TRANSPOSE<TT, B, HCHUNK, W, C, PAD_W>>();
113 adf::source(k[i]) =
"transpose.cc";
114 adf::headers(k[i]) = {
"transpose.h"};
115 adf::runtime<ratio>(k[i]) = 0.6;
// Request a single buffer on the input window when the chunk size
// (B*C*HCHUNK*W elements at 4 bytes each) exceeds MAX_PARAM_BYTES.
// NOTE(review): this threshold uses W and a literal 4, while the input window
// below is sized with PAD_W and sizeof(TT) — confirm this is intended.
116 if (B*C*HCHUNK*W*4 > MAX_PARAM_BYTES)
117 adf::single_buffer(k[i].in[0]);
// Kernels 2j and 2j+1 read outputs 0 and 1 of split[j] (i/2 selects the split
// kernel, i&1 selects its output port).
119 adf::connect<adf::window<B*HCHUNK*PAD_W*C*
sizeof(TT)>> (split[i/2].out[i&0x1], k[i].in[0]);
// Each kernel's output is streamed into the matching concat_graph input,
// with B*C*HCHUNK*W samples per iteration.
120 adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
121 adf::samples_per_iteration(k[i].out[0]) = B*C*HCHUNK*W;
// Route the concatenation sub-graph's single output to the graph output port.
123 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);
// Number of chunks along H: H / HCHUNK (integer division).
146 static constexpr int LCNT = H / HCHUNK;
// Sub-graph with LCNT inputs (pin[0..LCNT-1]) and one output (pout[0]); the
// concat kernel type is supplied through the CONCAT parameter and the element
// type through TT.
151 ConcatStreamGraph<CONCAT, TT, LCNT, B*C, HCHUNK*W, H*W> concat_graph;
// Graph-level ports: one input port and one output port.
154 adf::port<input> pin[1];
155 adf::port<output> pout[1];
// Compile-time constraints on the chunking parameters:
//  - H must divide evenly into chunks of HCHUNK,
//  - at most 8 chunks are supported,
//  - one chunk (B*HCHUNK*W*C elements at 4 bytes each) must not exceed TILE_BYTES.
// NOTE(review): the size check uses a literal 4 although the element type is
// TT — presumably assumes sizeof(TT) == 4; confirm.
158 static_assert(H % HCHUNK == 0);
159 static_assert(LCNT <= 8);
160 static_assert(B*HCHUNK*W*C*4 <= TILE_BYTES);
// Feed the graph input stream into split_graph (declared outside this view).
162 adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
// Create one TRANSPOSE kernel per chunk, parameterised on HCHUNK instead of H.
164 for (
int i = 0; i < LCNT; i++) {
165 k[i] = adf::kernel::create_object<TRANSPOSE<TT, B, HCHUNK, W, C, PAD_W>>();
166 adf::source(k[i]) =
"transpose.cc";
167 adf::headers(k[i]) = {
"transpose.h"};
168 adf::runtime<ratio>(k[i]) = 0.6;
// Request a single buffer on the output window when the chunk size
// (B*C*HCHUNK*W elements at 4 bytes each) exceeds MAX_PARAM_BYTES.
// NOTE(review): the threshold uses a literal 4 while the window below uses
// sizeof(TT) — confirm TT is always 4 bytes.
169 if (B*C*HCHUNK*W*4 > MAX_PARAM_BYTES)
170 adf::single_buffer(k[i].out[0]);
// Input arrives as a packet stream from split_graph output i; the output is a
// window of B*C*HCHUNK*W elements of TT into the matching concat_graph input.
172 adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
173 adf::connect<adf::window<B*C*HCHUNK*W*
sizeof(TT)>> (k[i].out[0], concat_graph.pin[i]);
// Place k[i] at an offset of 0 columns and +1 row relative to k[i-1].
// NOTE(review): for i == 0 this reads k[-1] unless a guard (e.g. `if (i != 0)`)
// exists on a line not visible in this view — confirm.
176 adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
// Route the concatenation sub-graph's single output to the graph output port.
180 adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);