21template <
typename TT,
int LCNT,
int H,
int INP_W,
int OUT_W>
25 input_window<TT>* in0,
26 input_window<TT>* in1,
27 input_window<TT>* in2,
28 input_window<TT>* in3,
29 input_window<TT>* in4,
30 input_window<TT>* in5,
31 input_window<TT>* in6,
32 input_window<TT>* in7,
33 output_stream<TT>* out
36 input_window<TT>* in0,
37 input_window<TT>* in1,
38 input_window<TT>* in2,
39 input_window<TT>* in3,
40 input_window<TT>* in4,
41 input_window<TT>* in5,
42 input_window<TT>* in6,
43 output_stream<TT>* out
46 input_window<TT>* in0,
47 input_window<TT>* in1,
48 input_window<TT>* in2,
49 input_window<TT>* in3,
50 input_window<TT>* in4,
51 input_window<TT>* in5,
52 output_stream<TT>* out
55 input_window<TT>* in0,
56 input_window<TT>* in1,
57 input_window<TT>* in2,
58 input_window<TT>* in3,
59 input_window<TT>* in4,
60 output_stream<TT>* out
63 input_window<TT>* in0,
64 input_window<TT>* in1,
65 input_window<TT>* in2,
66 input_window<TT>* in3,
67 output_stream<TT>* out
70 input_window<TT>* in0,
71 input_window<TT>* in1,
72 input_window<TT>* in2,
73 output_stream<TT>* out
76 input_window<TT>* in0,
77 input_window<TT>* in1,
78 output_stream<TT>* out
81 input_window<TT>* in0,
82 output_stream<TT>* out
84 static void registerKernelClass() {
85 static_assert(
sizeof(TT) == 4);
87 REGISTER_FUNCTION(ConcatScalar::filter8);
88 }
else if (LCNT == 7) {
89 REGISTER_FUNCTION(ConcatScalar::filter7);
90 }
else if (LCNT == 6) {
91 REGISTER_FUNCTION(ConcatScalar::filter6);
92 }
else if (LCNT == 5) {
93 REGISTER_FUNCTION(ConcatScalar::filter5);
94 }
else if (LCNT == 4) {
95 REGISTER_FUNCTION(ConcatScalar::filter4);
96 }
else if (LCNT == 3) {
97 REGISTER_FUNCTION(ConcatScalar::filter3);
98 }
else if (LCNT == 2) {
99 REGISTER_FUNCTION(ConcatScalar::filter2);
100 }
else if (LCNT == 1) {
101 REGISTER_FUNCTION(ConcatScalar::filter1);
112template <
typename TT,
int LCNT,
int H,
int INP_W,
int OUT_W>
116 input_window<TT>* in0,
117 input_window<TT>* in1,
118 input_window<TT>* in2,
119 input_window<TT>* in3,
120 input_window<TT>* in4,
121 input_window<TT>* in5,
122 input_window<TT>* in6,
123 input_window<TT>* in7,
124 output_stream<TT>* out
127 input_window<TT>* in0,
128 input_window<TT>* in1,
129 input_window<TT>* in2,
130 input_window<TT>* in3,
131 input_window<TT>* in4,
132 input_window<TT>* in5,
133 input_window<TT>* in6,
134 output_stream<TT>* out
137 input_window<TT>* in0,
138 input_window<TT>* in1,
139 input_window<TT>* in2,
140 input_window<TT>* in3,
141 input_window<TT>* in4,
142 input_window<TT>* in5,
143 output_stream<TT>* out
146 input_window<TT>* in0,
147 input_window<TT>* in1,
148 input_window<TT>* in2,
149 input_window<TT>* in3,
150 input_window<TT>* in4,
151 output_stream<TT>* out
154 input_window<TT>* in0,
155 input_window<TT>* in1,
156 input_window<TT>* in2,
157 input_window<TT>* in3,
158 output_stream<TT>* out
161 input_window<TT>* in0,
162 input_window<TT>* in1,
163 input_window<TT>* in2,
164 output_stream<TT>* out
167 input_window<TT>* in0,
168 input_window<TT>* in1,
169 output_stream<TT>* out
172 input_window<TT>* in0,
173 output_stream<TT>* out
175 static void registerKernelClass() {
176 static_assert(INP_W%4==0 && OUT_W%4==0 && (std::is_same<TT, float>::value));
178 REGISTER_FUNCTION(ConcatFloat::filter8);
179 }
else if (LCNT == 7) {
180 REGISTER_FUNCTION(ConcatFloat::filter7);
181 }
else if (LCNT == 6) {
182 REGISTER_FUNCTION(ConcatFloat::filter6);
183 }
else if (LCNT == 5) {
184 REGISTER_FUNCTION(ConcatFloat::filter5);
185 }
else if (LCNT == 4) {
186 REGISTER_FUNCTION(ConcatFloat::filter4);
187 }
else if (LCNT == 3) {
188 REGISTER_FUNCTION(ConcatFloat::filter3);
189 }
else if (LCNT == 2) {
190 REGISTER_FUNCTION(ConcatFloat::filter2);
191 }
else if (LCNT == 1) {
192 REGISTER_FUNCTION(ConcatFloat::filter1);
203template <
typename TT,
int LCNT,
int H,
int INP_W,
int OUT_W>
207 input_window<TT>* in0,
208 input_window<TT>* in1,
209 input_window<TT>* in2,
210 input_window<TT>* in3,
211 input_window<TT>* in4,
212 input_window<TT>* in5,
213 input_window<TT>* in6,
214 input_window<TT>* in7,
215 output_stream<TT>* out
218 input_window<TT>* in0,
219 input_window<TT>* in1,
220 input_window<TT>* in2,
221 input_window<TT>* in3,
222 input_window<TT>* in4,
223 input_window<TT>* in5,
224 input_window<TT>* in6,
225 output_stream<TT>* out
228 input_window<TT>* in0,
229 input_window<TT>* in1,
230 input_window<TT>* in2,
231 input_window<TT>* in3,
232 input_window<TT>* in4,
233 input_window<TT>* in5,
234 output_stream<TT>* out
237 input_window<TT>* in0,
238 input_window<TT>* in1,
239 input_window<TT>* in2,
240 input_window<TT>* in3,
241 input_window<TT>* in4,
242 output_stream<TT>* out
245 input_window<TT>* in0,
246 input_window<TT>* in1,
247 input_window<TT>* in2,
248 input_window<TT>* in3,
249 output_stream<TT>* out
252 input_window<TT>* in0,
253 input_window<TT>* in1,
254 input_window<TT>* in2,
255 output_stream<TT>* out
258 input_window<TT>* in0,
259 input_window<TT>* in1,
260 output_stream<TT>* out
263 input_window<TT>* in0,
264 output_stream<TT>* out
266 static void registerKernelClass() {
267 static_assert(INP_W%16==0);
268 static_assert(OUT_W%16==0);
269 static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
271 REGISTER_FUNCTION(ConcatInt8::filter8);
272 }
else if (LCNT == 7) {
273 REGISTER_FUNCTION(ConcatInt8::filter7);
274 }
else if (LCNT == 6) {
275 REGISTER_FUNCTION(ConcatInt8::filter6);
276 }
else if (LCNT == 5) {
277 REGISTER_FUNCTION(ConcatInt8::filter5);
278 }
else if (LCNT == 4) {
279 REGISTER_FUNCTION(ConcatInt8::filter4);
280 }
else if (LCNT == 3) {
281 REGISTER_FUNCTION(ConcatInt8::filter3);
282 }
else if (LCNT == 2) {
283 REGISTER_FUNCTION(ConcatInt8::filter2);
284 }
else if (LCNT == 1) {
285 REGISTER_FUNCTION(ConcatInt8::filter1);
295template <
typename TT,
int H,
int INP_W1,
int INP_W2,
int OUT_W>
299 input_stream<TT>* in0,
300 input_stream<TT>* in1,
301 output_stream<TT>* out
303 static void registerKernelClass() {
304 static_assert(
sizeof(TT) == 4);
306 REGISTER_FUNCTION(ConcatFloatStream::filter);
314template <
typename TT,
int H,
int INP_W1,
int INP_W2,
int OUT_W>
318 input_stream<TT>* in0,
319 input_stream<TT>* in1,
320 output_stream<TT>* out
322 static void registerKernelClass() {
323 static_assert(
sizeof(TT) == 4);
325 REGISTER_FUNCTION(ConcatFloatStreamWithStall::filter);
334template <
typename TT>
348 ): H(H), INP_W1(INP_W1), INP_W2(INP_W2), OUT_W(OUT_W) {};
351 input_stream<TT>* in0,
352 input_stream<TT>* in1,
353 output_stream<TT>* out
355 static void registerKernelClass() {
356 static_assert(
sizeof(TT) == 4);
358 REGISTER_FUNCTION(ConcatFloatStreamSequentially::filter);
359 REGISTER_PARAMETER(H);
360 REGISTER_PARAMETER(INP_W1);
361 REGISTER_PARAMETER(INP_W2);
362 REGISTER_PARAMETER(OUT_W);
371template <
typename TT,
int LCNT,
int H,
int INP_W,
int OUT_W>
376 output_stream<TT>* out
378 static void registerKernelClass() {
379 static_assert(
sizeof(TT) == 4);
381 REGISTER_FUNCTION(ConcatFloatPktStream::filter);
390template <
typename TT,
int H,
int INP_W1,
int INP_W2,
int OUT_W>
394 input_stream<TT>* in0,
395 input_stream<TT>* in1,
396 output_stream<TT>* out
398 static void registerKernelClass() {
399 static_assert(INP_W1 % 16 == 0 && INP_W2 % 16 == 0 && OUT_W % 16 == 0);
400 static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
402 REGISTER_FUNCTION(ConcatInt8Stream::filter);
410template <
typename TT,
int LCNT,
int H,
int INP_W,
int OUT_W>
414 input_stream<TT>* in0,
415 input_stream<TT>* in1,
416 output_stream<TT>* out
418 static void registerKernelClass() {
419 static_assert(
sizeof(TT) == 4);
420 static_assert(OUT_W % INP_W == 0);
421 REGISTER_FUNCTION(ConcatTwo32bitStreams::filter);
Scalar implementation for stream concat; the cycle count for ConcatFloatPktStream<f,4,32,32,64> is not recorded.
Definition concat.h:372
Scalar implementation for stream concat, ConcatFloatStreamSequentially<f,4,32,32,64> takes ~1000 cycl...
Definition concat.h:335
Scalar implementation for stream concat.
Definition concat.h:315
Scalar implementation for stream concat, ConcatFloatStream<f,4,32,32,64> takes ~1000 cycles.
Definition concat.h:296
Vector implementation, requires INP_W%4=0, OUT_W%4=0. ConcatFloat<f,5,4,32,144> takes 715 cycles (~30...
Definition concat.h:113
Vector implementation for stream concat with int8; the cycle count for ConcatInt8Stream<f,4,32,32,64> is not recorded.
Definition concat.h:391
Vector implementation for int8_t, requires INP_W%16=0, OUT_W%16=0, ConcatInt8<f,5,...
Definition concat.h:204
Scalar implementation, ConcatScalar<f,5,4,32,144> takes 5858 cycles (~850 for output window)
Definition concat.h:22
Scalar implementation for concatenating 2 chunked streams, ConcatTwo32bitStreams<f,...
Definition concat.h:411