onnx2versal/graph__qlinearconv_8h_source.html

#ifndef __QLINEARCONV_GRAPH_H__

#define __QLINEARCONV_GRAPH_H__


#include <adf.h>

#include "qlinearconv.h"

#include "pad.h"

#include "split.h"

#include "graph_concat.h"

#include "graph_split.h"

#include "graph_utils.h"


/**
 * @brief Sets adf::heap_size on a QLinearConv kernel according to the kernel template in use.
 *
 * The kernel type is rebuilt from the template arguments and compared against two groups of
 * known implementations:
 *  - first group:  heap = C/GROUP * (KH*KW rounded up to a multiple of 16) + 1024
 *  - second group: heap = 31712
 * A kernel template that belongs to neither group leaves the heap size untouched.
 *
 * @tparam QLINEARCONV kernel class template being configured; the remaining template
 *                     parameters are forwarded to it to form the compared type
 * @param  k           kernel whose heap size is assigned
 */
template <template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
  typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
  int B, int C, int M, int KH, int KW, int GROUP>
void set_heap_size(adf::kernel k) {
  using Selected = QLINEARCONV<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>;

  // Kernels whose heap only caches CKK weights.
  const bool cachesWeights =
    std::is_same<Selected,
      QLinearConvScalarStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConvHx4Stream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConvHx4StreamScale32bit<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConvHx6x8bitStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value;

  // Kernels whose heap caches CKK weights and the input window.
  const bool cachesWeightsAndWindow =
    std::is_same<Selected,
      QLinearConvHx4PktStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConvHx4Stream_0<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConvHx4Stream_1<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConvHx4Stream_2<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConv1x1InputPackets<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConv1x1StreamInputPackets<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value ||
    std::is_same<Selected,
      QLinearConvHx8PktStream<TT,TTPARAM,INP_H,INP_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>>::value;

  if (cachesWeights) {
    // KH*KW is rounded up to the next multiple of 16 for each of the C/GROUP channels.
    adf::heap_size(k) = C/GROUP*((KH*KW+15)/16*16) + 1024;
    return;
  }

  if (cachesWeightsAndWindow) {
    adf::heap_size(k) = 31712;
  }
}


/**
 * @brief Single instance graph that stores weights and biases in the kernel.
 *
 * Builds one QLINEARCONV kernel. When any of H0/H1/W0/W1 is non-zero a PAD kernel is placed in
 * front of it and fed as a stream; otherwise pin[0] is connected to the kernel as a window.
 * The padded input size, the weights size and the output size are each checked against
 * MAX_PARAM_BYTES.
 *
 * @tparam PAD          padding kernel template, instantiated only when H0+H1+W0+W1 != 0
 * @tparam QLINEARCONV  convolution kernel template
 * @tparam H0,H1,W0,W1  padding added to INP_H (H0+H1) and to INP_W (W0+W1)
 */
template <
  template<typename, int, int, int, int, int, int, int, int> class PAD,
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
  typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
  int B, int C, int M, int KH, int KW, int GROUP,
  int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
class QLinearConvGraph : public adf::graph {

  private:
    adf::kernel k[1];
    std::vector<adf::kernel> pad;   // holds at most one pad kernel
    static constexpr int PAD_H = INP_H + H0 + H1;
    static constexpr int PAD_W = INP_W + W0 + W1;

  public:
    adf::port<input> pin[1];
    adf::port<output> pout[1];
    static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;

    /**
     * @param weights  forwarded to the kernel constructor; size must not exceed MAX_PARAM_BYTES
     * @param bias     forwarded to the kernel constructor
     * @param x_scale, w_scale, y_scale  forwarded to the kernel constructor
     * @param x_zero   forwarded to the kernel constructor and used to construct the pad kernel
     * @param w_zero, y_zero  forwarded to the kernel constructor
     */
    QLinearConvGraph(
      std::vector<TTPARAM> weights,
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      using ConvKernel = QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>;
      using PadKernel  = PAD<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>;

      constexpr int PADDED_IN_SIZE = B*C*PAD_H*PAD_W;
      constexpr int DIRECT_IN_SIZE = B*C*INP_H*INP_W_PAD;
      constexpr int OUT_SIZE       = B*M*OUT_H*OUT_W_PAD;

      static_assert(PADDED_IN_SIZE <= MAX_PARAM_BYTES);
      assert(weights.size() <= MAX_PARAM_BYTES);
      static_assert(OUT_SIZE <= MAX_PARAM_BYTES);

      k[0] = adf::kernel::create_object<ConvKernel>(
        weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
      adf::source(k[0]) = "qlinearconv.cc";
      adf::headers(k[0]) = {"qlinearconv.h"};
      adf::runtime<ratio>(k[0]) = 0.6;

      if (H0+H1+W0+W1 == 0) {
        // No padding requested: the input reaches the conv kernel directly.
        adf::connect<adf::window<DIRECT_IN_SIZE>> (pin[0], k[0].in[0]);
      } else {
        // Padding requested: pin -> pad (stream) -> conv (window).
        pad.push_back(adf::kernel::create_object<PadKernel>(x_zero));
        adf::source(pad[0]) = "pad.cc";
        adf::headers(pad[0]) = {"pad.h"};
        adf::runtime<ratio>(pad[0]) = 0.6;

        adf::connect<adf::stream> (pin[0], pad[0].in[0]);
        adf::connect<adf::stream, adf::window<PADDED_IN_SIZE>> (pad[0].out[0], k[0].in[0]);
      }

      adf::connect<adf::window<OUT_SIZE>> (k[0].out[0], pout[0]);

      // Constrain both kernel parameters to the kernel's tile; param[0] additionally gets offset 0.
      adf::location_constraint kernelTile = adf::location<adf::kernel>(k[0]);
      adf::location<adf::parameter>(k[0].param[0]) = kernelTile;
      adf::location<adf::parameter>(k[0].param[0]) = adf::offset(0);
      adf::location<adf::parameter>(k[0].param[1]) = kernelTile;
    }

};


/**
 * @brief Single instance graph with a stream output and an optional second stream input.
 *
 * Builds one QLINEARCONV kernel, optionally preceded by a PAD kernel when any of H0/H1/W0/W1 is
 * non-zero. Two constructors exist:
 *  - without `weights`: two input ports are created and pin[1] is connected as a stream to
 *    k[0].in[1] (presumably the weight stream -- confirm against the kernel definition);
 *  - with `weights`: only pin[0] is created and the weights go to the kernel constructor.
 *
 * The input window is single-buffered when B*C*PAD_H*PAD_W exceeds MAX_PARAM_BYTES, and is
 * double-buffered at offsets 0 and 16384 otherwise.
 *
 * @tparam PAD          padding kernel template, instantiated only when H0+H1+W0+W1 != 0
 * @tparam QLINEARCONV  convolution kernel template
 * @tparam H0,H1,W0,W1  padding added to INP_H (H0+H1) and to INP_W (W0+W1)
 */
template <
  template<typename, int, int, int, int, int, int, int, int> class PAD,
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
  typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
  int B, int C, int M, int KH, int KW, int GROUP,
  int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
class QLinearConvStreamGraph : public adf::graph {

  private:
    adf::kernel k[1];
    std::vector<adf::kernel> pad;   // holds at most one pad kernel
    static constexpr int PAD_H = INP_H + H0 + H1;
    static constexpr int PAD_W = INP_W + W0 + W1;

  public:
    static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;

    // Declared as std::vector<adf::port<adf::input>> to match the other graphs in this file.
    std::vector<adf::port<adf::input>> pin;
    adf::port<output> pout[1];

    /**
     * @brief Shared setup run by both constructors after k[0] has been created.
     *
     * Sets kernel source/headers/runtime ratio, buffering and heap size, wires pin[0] to the
     * kernel (through a pad kernel when padding is requested), wires the kernel output to
     * pout[0] and applies location constraints.
     *
     * @param x_zero value used to construct the pad kernel
     */
    void init_helper(TT x_zero) {
      adf::source(k[0]) = "qlinearconv.cc";
      adf::headers(k[0]) = {"qlinearconv.h"};
      adf::runtime<ratio>(k[0]) = 0.6;
      if (B*C*PAD_H*PAD_W > MAX_PARAM_BYTES)
        adf::single_buffer(k[0].in[0]);

      set_heap_size<QLINEARCONV,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[0]);

      if (H0+H1+W0+W1 != 0) {
        pad.push_back(
          adf::kernel::create_object<PAD<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
        adf::source(pad[0]) = "pad.cc";
        adf::headers(pad[0]) = {"pad.h"};
        adf::runtime<ratio>(pad[0]) = 0.6;

        adf::connect<adf::stream> (pin[0], pad[0].in[0]);
        adf::connect<adf::stream, adf::window<B*C*PAD_H*PAD_W>> (pad[0].out[0], k[0].in[0]);

        adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
        adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;

        // Pad kernel goes one row above the conv kernel.
        adf::location<adf::kernel>(pad[0]) = adf::location<adf::kernel>(k[0]) +
          adf::relative_offset({.col_offset=0, .row_offset=1});

        // Both stacks and the conv kernel's first parameter are constrained to the pad tile.
        adf::location_constraint padTile = adf::location<adf::kernel>(pad[0]);
        adf::location<adf::stack>(pad[0]) = padTile;
        adf::location<adf::stack>(k[0]) = padTile;
        adf::location<adf::parameter>(k[0].param[0]) = padTile;
      } else {
        adf::connect<adf::window<B*C*INP_H*INP_W_PAD>> (pin[0], k[0].in[0]);
      }

      adf::connect<adf::stream> (k[0].out[0], pout[0]);
      adf::samples_per_iteration(k[0].out[0]) = B*M*OUT_H*OUT_W_PAD;

      // One buffer offset when single-buffered above, two (ping/pong) otherwise.
      if (B*C*PAD_H*PAD_W > MAX_PARAM_BYTES) {
        adf::location<adf::buffer>(k[0].in[0]) = {adf::offset(0)};
      } else {
        adf::location<adf::buffer>(k[0].in[0]) = {adf::offset(0), adf::offset(16384)};
      }
    }

    /**
     * @brief Constructor without stored weights; creates pin[0] and pin[1].
     *
     * @param bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero  forwarded to the kernel
     *        constructor; x_zero is also passed to init_helper
     */
    QLinearConvStreamGraph(
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      static_assert(B*C*PAD_H*PAD_W <= TILE_BYTES);
      k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
        bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);

      adf::port<adf::input> pin0;
      adf::port<adf::input> pin1;
      pin.push_back(pin0);
      pin.push_back(pin1);

      adf::connect<adf::stream> (pin[1], k[0].in[1]); // variable samples per iteration based on kernel

      init_helper(x_zero);
    }

    /**
     * @brief Constructor with stored weights; creates pin[0] only.
     *
     * @param weights  forwarded to the kernel constructor
     * @param bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero  forwarded to the kernel
     *        constructor; x_zero is also passed to init_helper
     */
    QLinearConvStreamGraph(
      std::vector<TTPARAM> weights,
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      static_assert(B*C*PAD_H*PAD_W <= TILE_BYTES);

      k[0] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
        weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);

      adf::port<adf::input> pin0;
      pin.push_back(pin0);

      init_helper(x_zero);
    }

};


/**
 * @brief Multi-instance graph that chunks the padded input along H, maximum 8 chunks.
 *
 * Data path: pin[0] -> (optional Pad2DStreamInt8) -> SplitGraph -> LCNT QLINEARCONV kernels ->
 * ConcatStreamGraph -> pout[0]. pin[1] is broadcast as a stream to in[1] of every kernel
 * (presumably the weight stream -- confirm against the kernel definition).
 *
 * Each chunk is HCHUNK rows tall and consecutive chunks overlap by KH-STEP_H rows.
 *
 * @tparam QLINEARCONV  convolution kernel template, instantiated with HCHUNK as its input height
 * @tparam CONCAT       concat kernel template handed to ConcatStreamGraph
 * @tparam HCHUNK       rows per chunk; must satisfy HCHUNK % STEP_H == KH % STEP_H
 * @tparam H0,H1,W0,W1  padding added to INP_H (H0+H1) and to INP_W (W0+W1)
 */
template <
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
  template<typename, int, int, int, int> class CONCAT,
  int HCHUNK,
  typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
  int B, int C, int M, int KH, int KW, int GROUP,
  int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
class QLinearConvChunkHGraph : public adf::graph {

  private:
    static constexpr int PAD_H = INP_H + H0 + H1;
    static constexpr int PAD_W = INP_W + W0 + W1;

    std::vector<adf::kernel> pad;   // holds at most one pad kernel

    static constexpr int OVERLAP = KH-STEP_H;
    typedef SplitGraph<SplitInt8, TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W> mSplitGraph;
    mSplitGraph split_graph;
    static constexpr int LCNT = mSplitGraph::LCNT;

    adf::kernel k[LCNT];

    static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
    static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
    ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;

    // Placement of conv kernel i relative to split_graph.k[0]; 8 entries, matching LCNT <= 8.
    adf::relative_coordinate tileOffsets[8] = {
      {.col_offset = -1, .row_offset = 1}, // top left, clockwise
      {.col_offset = 0, .row_offset = 2},
      {.col_offset = 0, .row_offset = 1},
      {.col_offset = 1, .row_offset = 0},
      {.col_offset = 0, .row_offset = -1},
      {.col_offset = 0, .row_offset = -2},
      {.col_offset = -1, .row_offset = -1},
      {.col_offset = -1, .row_offset = 0},
    };

    // Placement of concat_graph.k1[i] relative to split_graph.k[0]; only 4 entries.
    adf::relative_coordinate concat_k1_offsets[4] = {
      {.col_offset = -1, .row_offset = 2}, // top left, clockwise
      {.col_offset = 1, .row_offset = 1},
      {.col_offset = 1, .row_offset = -1},
      {.col_offset = -1, .row_offset = -2},
    };

  public:
    adf::port<adf::input> pin[2];
    adf::port<adf::output> pout[1];

    /**
     * @param bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero  forwarded to every conv
     *        kernel constructor; x_zero is also used to construct the pad kernel
     */
    QLinearConvChunkHGraph(
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
      static_assert(LCNT <= 8);
      static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);

      if (H0+H1+W0+W1 != 0) {
        pad.push_back(
          adf::kernel::create_object<Pad2DStreamInt8<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
        adf::source(pad[0]) = "pad.cc";
        adf::headers(pad[0]) = {"pad.h"};
        adf::runtime<ratio>(pad[0]) = 0.6;

        adf::connect<adf::stream> (pin[0], pad[0].in[0]);
        adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);

        adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
        adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
        // split and pad can't be placed on same tile due to stream co-placement constraints
      } else {
        adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
      }

      for (int i = 0; i < LCNT; i++) {
        k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
          bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
        adf::source(k[i]) = "qlinearconv.cc";
        adf::headers(k[i]) = {"qlinearconv.h"};
        adf::runtime<ratio>(k[i]) = 0.6;
        if (B*C*HCHUNK*PAD_W > MAX_PARAM_BYTES)
          adf::single_buffer(k[i].in[0]);

        // Use the same input height (HCHUNK) the kernel above was instantiated with.
        set_heap_size<QLINEARCONV,TT,TTPARAM,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);

        adf::connect<adf::window<B*C*HCHUNK*PAD_W>> (split_graph.pout[i], k[i].in[0]);
        adf::connect<adf::stream>                   (pin[1], k[i].in[1]);
        adf::connect<adf::stream>                   (k[i].out[0], concat_graph.pin[i]);

        adf::location<adf::kernel>(k[i]) =
          adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(tileOffsets[i]);
        adf::location_constraint tilePos = adf::location<adf::kernel>(k[i]);
        adf::location<adf::parameter>(k[i].param[0]) = tilePos; // may bust tiles adjacent to split
        adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
      }
      adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);

      // concat_k1_offsets only has 4 entries; fail loudly instead of reading past it.
      const int k1Count = static_cast<int>(concat_graph.k1.size());
      assert(k1Count <= static_cast<int>(sizeof(concat_k1_offsets) / sizeof(concat_k1_offsets[0])));
      for (int i = 0; i < k1Count; i++) {
        adf::location<adf::kernel>(concat_graph.k1[i]) =
          adf::location<adf::kernel>(split_graph.k[0]) + adf::relative_offset(concat_k1_offsets[i]);
      }

    }

};


/**
 * @brief Multi-instance graph that chunks the padded input along H using stream split kernels.
 *
 * Data path: pin[0] -> (optional Pad2DStreamInt8) -> broadcast to (LCNT+1)/2 split kernels ->
 * LCNT QLINEARCONV kernels -> ConcatStreamGraph -> pout[0]. Each SplitFilterInt8StreamTwice
 * kernel feeds two conv kernels; when LCNT is odd the last chunk is served by a single-output
 * SplitFilterInt8Stream kernel. pin[1] is broadcast as a stream to in[1] of every conv kernel
 * (presumably the weight stream -- confirm against the kernel definition).
 *
 * @tparam QLINEARCONV  convolution kernel template, instantiated with HCHUNK as its input height
 * @tparam CONCAT       concat kernel template handed to ConcatStreamGraph
 * @tparam HCHUNK       rows per chunk; must satisfy HCHUNK % STEP_H == KH % STEP_H
 * @tparam H0,H1,W0,W1  padding added to INP_H (H0+H1) and to INP_W (W0+W1)
 */
template <
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
  template<typename, int, int, int, int> class CONCAT,
  int HCHUNK,
  typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
  int B, int C, int M, int KH, int KW, int GROUP,
  int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
class QLinearConvChunkHStreamGraph : public adf::graph {

  private:
    static constexpr int PAD_H = INP_H + H0 + H1;
    static constexpr int PAD_W = INP_W + W0 + W1;

    std::vector<adf::kernel> pad;   // holds at most one pad kernel

    static constexpr int OVERLAP = KH-STEP_H;
    static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
    adf::kernel split[(LCNT+1)/2];
    adf::kernel k[LCNT];

    static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
    static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
    ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;

  public:
    adf::port<adf::input> pin[2];
    adf::port<adf::output> pout[1];

    /**
     * @param bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero  forwarded to every conv
     *        kernel constructor; x_zero is also used to construct the pad kernel
     */
    QLinearConvChunkHStreamGraph(
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      // Chunks advance by HCHUNK-OVERLAP = HCHUNK-KH+STEP_H rows, so every chunk only starts on
      // a multiple of STEP_H when (HCHUNK-KH) is a multiple of STEP_H. This is the same
      // condition the other H-chunked graphs in this file assert; the two forms agree for
      // STEP_H == 1.
      static_assert((HCHUNK % STEP_H) == (KH % STEP_H),
                    "HCHUNK - KH must be a multiple of STEP_H");
      static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);

      // Two-output split kernels, one per pair of chunks.
      for (int i = 0; i < LCNT/2; i++) {
        split[i] = adf::kernel::create_object<SplitFilterInt8StreamTwice<TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(i*2);
        adf::source(split[i]) = "split.cc";
        adf::headers(split[i]) = {"split.h"};
        adf::runtime<ratio>(split[i]) = 0.6;

        adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
        adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
        adf::samples_per_iteration(split[i].out[1]) = B*C*HCHUNK*PAD_W;
      }
      // Odd chunk count: the last chunk gets a single-output split kernel.
      if ((LCNT & 0x1) == 1) {
        int i = (LCNT+1)/2 - 1;
        split[i] = adf::kernel::create_object<SplitFilterInt8Stream<TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W>>(LCNT-1);
        adf::source(split[i]) = "split.cc";
        adf::headers(split[i]) = {"split.h"};
        adf::runtime<ratio>(split[i]) = 0.6;

        adf::samples_per_iteration(split[i].in[0]) = B*C*PAD_H*PAD_W;
        adf::samples_per_iteration(split[i].out[0]) = B*C*HCHUNK*PAD_W;
      }

      for (int i = 0; i < LCNT; i++) {
        k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
          bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
        adf::source(k[i]) = "qlinearconv.cc";
        adf::headers(k[i]) = {"qlinearconv.h"};
        adf::runtime<ratio>(k[i]) = 0.6;
        if (B*C*HCHUNK*PAD_W > MAX_PARAM_BYTES)
          adf::single_buffer(k[i].in[0]);

        // Use the same input height (HCHUNK) the kernel above was instantiated with.
        set_heap_size<QLINEARCONV,TT,TTPARAM,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);

        // Chunk i is output (i & 1) of split kernel i/2.
        adf::connect<adf::window<B*C*HCHUNK*PAD_W>> (split[i/2].out[i&0x1], k[i].in[0]);
        adf::connect<adf::stream>                   (pin[1], k[i].in[1]);
        adf::connect<adf::stream>                   (k[i].out[0], concat_graph.pin[i]);

        if ((i & 0x1) != 0) {
          // Odd kernel sits one column right of its even partner; their split kernel sits one
          // row below the odd kernel.
          adf::location<adf::kernel>(k[i]) =
            adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=1, .row_offset=0});
          adf::location<adf::kernel>(split[i/2]) =
            adf::location<adf::kernel>(k[i]) + adf::relative_offset({.col_offset=0, .row_offset=-1});

          adf::location_constraint sTilePos = adf::location<adf::kernel>(split[i/2]);
          adf::location<adf::stack>(split[i/2]) = sTilePos;
          adf::location<adf::stack>(k[i]) = sTilePos;
          adf::location<adf::parameter>(k[i].param[0]) = sTilePos;
          adf::location<adf::parameter>(k[i].param[0]) = adf::offset(0);
        }

        // NOTE(review): kTilePos is never read; kept because it is unclear from this file
        // whether calling adf::location<adf::kernel>() has side effects.
        adf::location_constraint kTilePos = adf::location<adf::kernel>(k[i]);
      }
      adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);

      if (H0+H1+W0+W1 != 0) {
        pad.push_back(
          adf::kernel::create_object<Pad2DStreamInt8<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
        adf::source(pad[0]) = "pad.cc";
        adf::headers(pad[0]) = {"pad.h"};
        adf::runtime<ratio>(pad[0]) = 0.6;

        adf::connect<adf::stream> (pin[0], pad[0].in[0]);
        for (int i = 0; i < (LCNT+1)/2; i++)
          adf::connect<adf::stream> (pad[0].out[0], split[i].in[0]);

        adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
        adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
        // split and pad can't be placed on same tile due to stream co-placement constraints
      } else {
        for (int i = 0; i < (LCNT+1)/2; i++)
          adf::connect<adf::stream> (pin[0], split[i].in[0]);
      }

      // Each first-level concat kernel sits one row above even conv kernel k[2*i] and hosts
      // that kernel's stack and first parameter.
      const int k1Count = static_cast<int>(concat_graph.k1.size());
      for (int i = 0; i < k1Count; i++) {
        adf::location<adf::kernel>(concat_graph.k1[i]) =
          adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});

        adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
        adf::location<adf::parameter>(k[i*2].param[0]) = cTilePos;
        adf::location<adf::parameter>(k[i*2].param[0]) = adf::offset(0);
        adf::location<adf::stack>(k[i*2]) = cTilePos;
        adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;
      }
    }

};


/**
 * @brief Multi-instance graph that chunks the padded input along H using a packet-stream split.
 *
 * Data path: pin[0] -> (optional Pad2DStreamInt8) -> SplitFilterPktStreamGraph -> LCNT
 * QLINEARCONV kernels (pktstream input) -> ConcatStreamGraph -> pout[0].
 *
 * Two constructors exist:
 *  - without `weights`: pin[0] and pin[1] are created and pin[1] is broadcast as a stream to
 *    in[1] of every kernel (presumably the weight stream -- confirm against the kernel);
 *  - with `weights`: only pin[0] is created and the weights go to every kernel constructor.
 *
 * @tparam QLINEARCONV  convolution kernel template, instantiated with HCHUNK as its input height
 * @tparam CONCAT       concat kernel template handed to ConcatStreamGraph
 * @tparam HCHUNK       rows per chunk; must satisfy HCHUNK % STEP_H == KH % STEP_H
 * @tparam H0,H1,W0,W1  padding added to INP_H (H0+H1) and to INP_W (W0+W1)
 */
template <
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV,
  template<typename, int, int, int, int> class CONCAT,
  int HCHUNK,
  typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
  int B, int C, int M, int KH, int KW, int GROUP,
  int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
class QLinearConvChunkHPktStreamGraph : public adf::graph {

  private:
    static constexpr int PAD_H = INP_H + H0 + H1;
    static constexpr int PAD_W = INP_W + W0 + W1;

    std::vector<adf::kernel> pad;   // holds at most one pad kernel

    static constexpr int OVERLAP = KH-STEP_H;
    typedef SplitFilterPktStreamGraph<SplitFilterInt8PktStream, TT, B*C, PAD_H*PAD_W, HCHUNK*PAD_W, OVERLAP*PAD_W> mSplitGraph;
    mSplitGraph split_graph;

    static constexpr int LCNT = (PAD_H - HCHUNK) / (HCHUNK - OVERLAP) + 1;
    adf::kernel k[LCNT];

    static constexpr int HCHUNK_OUT = (HCHUNK - KH) / STEP_H + 1;
    static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;
    ConcatStreamGraph<CONCAT, TT, LCNT, B*M, HCHUNK_OUT*OUT_W_PAD, OUT_H*OUT_W_PAD> concat_graph;

  public:
    std::vector<adf::port<adf::input>> pin;
    adf::port<adf::output> pout[1];

    /**
     * @brief Shared setup run by both constructors after all k[i] have been created.
     *
     * Sets kernel source/headers/runtime ratio and heap size, wires split -> conv -> concat ->
     * pout[0], wires pin[0] (through a pad kernel when padding is requested) into the split
     * graph, and applies location constraints.
     *
     * @param x_zero value used to construct the pad kernel
     */
    void init_helper(TT x_zero) {
      // The split kernel is placed relative to k[1], so at least two chunks are required.
      static_assert(LCNT >= 2, "placement below references k[1]");

      for (int i = 0; i < LCNT; i++) {
        adf::source(k[i]) = "qlinearconv.cc";
        adf::headers(k[i]) = {"qlinearconv.h"};
        adf::runtime<ratio>(k[i]) = 0.6;

        set_heap_size<QLINEARCONV,TT,TTPARAM,HCHUNK,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,C,M,KH,KW,GROUP>(k[i]);

        adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
        adf::connect<adf::stream> (k[i].out[0], concat_graph.pin[i]);
        adf::samples_per_iteration(k[i].out[0]) = B*M*HCHUNK_OUT*OUT_W_PAD;
      }

      adf::connect<adf::stream> (concat_graph.pout[0], pout[0]);

      if (H0+H1+W0+W1 != 0) {
        pad.push_back(
          adf::kernel::create_object<Pad2DStreamInt8<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
        adf::source(pad[0]) = "pad.cc";
        adf::headers(pad[0]) = {"pad.h"};
        adf::runtime<ratio>(pad[0]) = 0.6;

        adf::connect<adf::stream> (pin[0], pad[0].in[0]);
        adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);

        adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
        adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
        // split and pad can't be placed on same tile due to stream co-placement constraints
      } else {
        adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
      }

      // location constraints
      for (int i = 0; i < LCNT; i++) {
        // odd kernels: two rows above the previous kernel
        if ((i&0x1) == 1)
          adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=0, .row_offset=2});
        // kernel 4: one column left of and two rows above kernel 3
        if (i == 4)
          adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=-1, .row_offset=2});
        // kernels 2 and 6: one column right of kernel i-2
        if (i == 2 || i == 6)
          adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-2]) + adf::relative_offset({.col_offset=1, .row_offset=0});
        adf::location<adf::stack>(k[i]) = adf::location<adf::kernel>(k[i]);
      }

      const int k1Count = static_cast<int>(concat_graph.k1.size());
      for (int i = 0; i < k1Count; i++) {
        adf::location_constraint cTilePos = adf::location<adf::kernel>(concat_graph.k1[i]);
        adf::location<adf::stack>(concat_graph.k1[i]) = cTilePos;

        // Concat kernel i sits one row above conv kernel 2*i.
        adf::location<adf::kernel>(concat_graph.k1[i]) =
          adf::location<adf::kernel>(k[i*2]) + adf::relative_offset({.col_offset=0, .row_offset=1});

        // The concat tile hosts param[0] of k[2*i] for the first two concat kernels and of
        // k[2*i+1] for the rest. With an odd LCNT the latter index can fall outside k[], so
        // the constraint is skipped instead of indexing past the array.
        // NOTE(review): confirm the intended parameter placement for odd LCNT.
        const int paramOwner = (i < 2) ? i*2 : i*2+1;
        if (paramOwner < LCNT)
          adf::location<adf::parameter>(k[paramOwner].param[0]) = cTilePos;
      }
      adf::location<adf::kernel>(split_graph.k[0]) = adf::location<adf::kernel>(k[1]) + adf::relative_offset({.col_offset=0, .row_offset=1});
    }

    /**
     * @brief Constructor without stored weights; creates pin[0] and pin[1].
     *
     * @param bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero  forwarded to every conv
     *        kernel constructor; x_zero is also passed to init_helper
     */
    QLinearConvChunkHPktStreamGraph(
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
      static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);

      adf::port<adf::input> pin0;
      adf::port<adf::input> pin1;
      pin.push_back(pin0);
      pin.push_back(pin1);

      for (int i = 0; i < LCNT; i++) {
        k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
          bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
        adf::connect<adf::stream>    (pin[1], k[i].in[1]);
      }
      init_helper(x_zero);
    }

    /**
     * @brief Constructor with stored weights; creates pin[0] only.
     *
     * @param weights  forwarded to every conv kernel constructor
     * @param bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero  forwarded to every conv
     *        kernel constructor; x_zero is also passed to init_helper
     */
    QLinearConvChunkHPktStreamGraph(
      std::vector<TTPARAM> weights,
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      static_assert((HCHUNK % STEP_H) == (KH % STEP_H));
      static_assert(B*C*HCHUNK*PAD_W <= TILE_BYTES);

      adf::port<adf::input> pin0;
      pin.push_back(pin0);

      for (int i = 0; i < LCNT; i++) {
        k[i] = adf::kernel::create_object<QLINEARCONV<TT, TTPARAM, HCHUNK, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, C, M, KH, KW, GROUP>>(
          weights, bias, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
      }
      init_helper(x_zero);
    }

};


/**
 * @brief Multi-instance graph that chunks the padded input along the C dimension.
 *
 * Data path: pin[0] -> (optional Pad2DStreamInt8) -> SplitFilterPktStreamGraph -> LCNT = C/CCHUNK
 * kernels chained by cascade -> pout[0]. Kernel 0 is a QLINEARCONV0 (receives the bias),
 * kernels 1..LCNT-2 are QLINEARCONV1, and kernel LCNT-1 is a QLINEARCONV2 (receives the scales
 * and zero points and drives pout[0]).
 *
 * Two constructors exist:
 *  - with `weights`: the weights are sliced per chunk and given to each kernel constructor;
 *    only pin[0] is created.
 *  - without `weights`: pin[0] is the data input and pin[1+i] is connected as a stream to
 *    in[1] of kernel i (presumably its weight stream -- confirm against the kernel), for
 *    LCNT+1 ports in total.
 *
 * @tparam QLINEARCONV0,QLINEARCONV1,QLINEARCONV2  first / middle / last kernel templates
 * @tparam CONCAT       not referenced inside this class
 * @tparam CCHUNK       channels per chunk; C must be a multiple of it and C/CCHUNK >= 3
 * @tparam H0,H1,W0,W1  padding added to INP_H (H0+H1) and to INP_W (W0+W1)
 */
template <
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV0,
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV1,
  template<typename, typename, int, int, int, int, int, int, int, int, int, int, int, int> class QLINEARCONV2,
  template<typename, int, int, int, int> class CONCAT,
  int CCHUNK,
  typename TT, typename TTPARAM, int INP_H, int INP_W, int INP_W_PAD, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W,
  int B, int C, int M, int KH, int KW, int GROUP,
  int H0 = 0, int H1 = 0, int W0 = 0, int W1 = 0>
class QLinearConvChunkCGraph : public adf::graph {

  private:
    static constexpr int PAD_H = INP_H + H0 + H1;
    static constexpr int PAD_W = INP_W + W0 + W1;

    std::vector<adf::kernel> pad;   // holds at most one pad kernel

    typedef SplitFilterPktStreamGraph<SplitFilterInt8PktStream, TT, B, C*PAD_H*PAD_W, CCHUNK*PAD_H*PAD_W, 0> mSplitGraph;
    mSplitGraph split_graph;

    static constexpr int OUT_H = (PAD_H - KH) / STEP_H + 1;

  public:
    static constexpr int LCNT = C / CCHUNK;
    adf::kernel k[LCNT];

    std::vector<adf::port<adf::input>> pin;
    adf::port<adf::output> pout[1];

    /**
     * @brief Shared setup run by both constructors after all k[i] have been created.
     *
     * Sets kernel source/headers/runtime ratio, places each kernel one column to the right of
     * the previous one, and wires pin[0] (through a pad kernel when padding is requested) into
     * the split graph.
     *
     * @param x_zero value used to construct the pad kernel
     */
    void init_helper(TT x_zero) {
      static_assert(LCNT >= 3);
      static_assert(C % CCHUNK == 0);
      static_assert(B*CCHUNK*PAD_H*PAD_W <= TILE_BYTES);

      for (int i = 0; i < LCNT; i++) {
        adf::source(k[i]) = "qlinearconv.cc";
        adf::headers(k[i]) = {"qlinearconv.h"};
        adf::runtime<ratio>(k[i]) = 0.6;

        if (i != 0) {
          adf::location<adf::kernel>(k[i]) = adf::location<adf::kernel>(k[i-1]) + adf::relative_offset({.col_offset=1});
        }
      }

      if (H0+H1+W0+W1 != 0) {
        pad.push_back(
          adf::kernel::create_object<Pad2DStreamInt8<TT, B*C, INP_H, INP_W, INP_W_PAD, H0, H1, W0, W1>>(x_zero));
        adf::source(pad[0]) = "pad.cc";
        adf::headers(pad[0]) = {"pad.h"};
        adf::runtime<ratio>(pad[0]) = 0.6;

        adf::connect<adf::stream> (pin[0], pad[0].in[0]);
        adf::connect<adf::stream> (pad[0].out[0], split_graph.pin[0]);

        adf::samples_per_iteration(pad[0].in[0]) = B*C*INP_H*INP_W_PAD;
        adf::samples_per_iteration(pad[0].out[0]) = B*C*PAD_H*PAD_W;
        // split and pad can't be placed on same tile due to stream co-placement constraints
      } else {
        adf::connect<adf::stream> (pin[0], split_graph.pin[0]);
      }
    }

    /**
     * @brief Constructor with stored weights; creates pin[0] only.
     *
     * For chunk i, wChunk concatenates, for every m in [0, M), the i-th 1/LCNT slice of the
     * m-th 1/M slice of `weights`.
     *
     * @param weights  full weight vector, sliced per chunk as described above
     * @param bias     passed to kernel 0 only
     * @param x_scale, w_scale, y_scale, x_zero, y_zero  passed to the last kernel only
     * @param w_zero   passed to every kernel
     */
    QLinearConvChunkCGraph(
      std::vector<TTPARAM> weights,
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      assert(weights.size() / LCNT <= TILE_BYTES); // weight size may vary based on padding done for given kernel

      adf::port<adf::input> pin0;
      pin.push_back(pin0);

      for (int i = 0; i < LCNT; i++) {
        std::vector<TTPARAM> wChunk; // build wChunk
        wChunk.reserve(weights.size() / LCNT);
        for (int m = 0; m < M; m++) {
          wChunk.insert(wChunk.end(),
            weights.begin() + m*weights.size()/M + i*weights.size()/M/LCNT,
            weights.begin() + m*weights.size()/M + (i+1)*weights.size()/M/LCNT);
        }

        if (i == 0) {
          k[i] = adf::kernel::create_object<QLINEARCONV0<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
            wChunk, bias, w_zero);
        } else if (i == LCNT-1) {
          k[i] = adf::kernel::create_object<QLINEARCONV2<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
            wChunk, x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
          adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[1]);
          adf::connect<adf::stream>  (k[i].out[0], pout[0]);
        } else {
          k[i] = adf::kernel::create_object<QLINEARCONV1<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
            wChunk, w_zero);
          adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[1]);
        }

        adf::single_buffer(k[i].in[0]);
        adf::connect<adf::window<B*CCHUNK*PAD_H*PAD_W>> (split_graph.pout[i], k[i].in[0]);

        // Input buffer is constrained to the kernel's own tile at offset 0.
        adf::location<adf::buffer>(k[i].in[0]) = adf::location<adf::kernel>(k[i]);
        adf::location<adf::buffer>(k[i].in[0]) = {adf::offset(0)};
      }

      init_helper(x_zero);
    }

    /**
     * @brief Constructor without stored weights; creates pin[0] plus one stream port per kernel.
     *
     * @param bias     passed to kernel 0 only
     * @param x_scale, w_scale, y_scale, x_zero, y_zero  passed to the last kernel only
     * @param w_zero   passed to every kernel
     */
    QLinearConvChunkCGraph(
      std::vector<int32_t> bias,
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    ) {
      // pin[0] is the data input consumed by init_helper; pin[1+i] belongs to kernel i.
      // The data port has to exist before the loop, otherwise pin[1+i] below is one past the
      // end of the vector on every iteration.
      pin.reserve(LCNT + 1);
      adf::port<adf::input> dataPin;
      pin.push_back(dataPin);

      for (int i = 0; i < LCNT; i++) {

        adf::port<adf::input> pin0;
        pin.push_back(pin0);

        // Heap size is set on the kernel that was just created (k[i]).
        if (i == 0) {
          k[i] = adf::kernel::create_object<QLINEARCONV0<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
            bias, w_zero);
          set_heap_size<QLINEARCONV0,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[i]);
        } else if (i == LCNT-1) {
          k[i] = adf::kernel::create_object<QLINEARCONV2<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
            x_scale, w_scale, y_scale, x_zero, w_zero, y_zero);
          adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[2]);
          adf::connect<adf::stream>  (k[i].out[0], pout[0]);
          set_heap_size<QLINEARCONV2,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[i]);
        } else {
          k[i] = adf::kernel::create_object<QLINEARCONV1<TT, TTPARAM, PAD_H, PAD_W, OUT_W, OUT_W_PAD, STEP_H, STEP_W, B, CCHUNK, M, KH, KW, GROUP>>(
            w_zero);
          adf::connect<adf::cascade> (k[i-1].out[0], k[i].in[2]);
          set_heap_size<QLINEARCONV1,TT,TTPARAM,PAD_H,PAD_W,OUT_W,OUT_W_PAD,STEP_H,STEP_W,B,CCHUNK,M,KH,KW,GROUP>(k[i]);
        }

        adf::connect<adf::stream>    (pin[1+i], k[i].in[1]);
        adf::connect<adf::pktstream> (split_graph.pout[i], k[i].in[0]);
      }

      init_helper(x_zero);
    }

};


#endif // __QLINEARCONV_GRAPH_H__

Pad2DStreamInt8
Vector implementation for Int8 Pad2D. Pad2DStreamInt8<a,2,30,30,32,1,1,1,1> total = 1885 for v64int16.
Definition pad.h:51

QLinearConv1x1InputPackets
Vector implementation for 1x1 QLinearConv, stores weights requires data to be reshaped from (M,...
Definition qlinearconv.h:1077

QLinearConv1x1StreamInputPackets
Vector implementation for 1x1 QLinearConv, streams weights requires data to be reshaped from (M,...
Definition qlinearconv.h:1140

QLinearConvChunkCGraph
Multiinstance graph that stores weights and biases, chunks BCHW by C dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:709

QLinearConvChunkHGraph
Multiinstance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:301

QLinearConvChunkHPktStreamGraph
Multiinstance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:557

QLinearConvChunkHStreamGraph
Multiinstance graph that stores weights and biases, chunks BCHW by H dimension, maximum 8 chunks.
Definition graph_qlinearconv.h:421

QLinearConvGraph
Single instance graph that stores weights and biases. Max size = 16384 and 4096 bytes respectively.
Definition graph_qlinearconv.h:109

QLinearConvHx4PktStream
Vector implementation for Hx4 QLinearConv, padding with y_zero, requires data to be arranged in (M,...
Definition qlinearconv.h:450

QLinearConvHx4StreamScale32bit
Vector implementation for Hx4 QLinearConv using 32bit scale for precision, requires data to be arrang...
Definition qlinearconv.h:386

QLinearConvHx4Stream
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:320

QLinearConvHx6x8bitStream
Vector implementation for Hx4 QLinearConv using int8xint8 MACs, requires data to be arranged in [a,...
Definition qlinearconv.h:828

QLinearConvHx8PktStream
Vector implementation for Hx8 QLinearConv, requires bias to be shifted, i.e. tbias - tw....
Definition qlinearconv.h:947

QLinearConvScalarStream
Scalar implementation streaming weights, requires weights stream to be padded from MxCxKxK to MxCx16,...
Definition qlinearconv.h:266

QLinearConvStreamGraph
Single instance graph that streams weights and biases, significantly slower.
Definition graph_qlinearconv.h:181

SplitFilterPktStreamGraph
Graph wrapper for two stream split.
Definition graph_split.h:185

SplitGraph
Graph wrapper for arbitrary split kernel implementation and lanes.
Definition graph_split.h:37