// onnx2versal/qlinearconv_8h_source.html

#ifndef QLINEARCONV_H_

#define QLINEARCONV_H_


#include <adf.h>

#include <assert.h>


template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>


class QLinearConvScalar {


  private:

    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;

    static constexpr int C_PER_M = C / GROUP;


    alignas(32) TTPARAM (&weights)[M*C_PER_M*KH*KW];

    alignas(32) int32_t (&bias)[M];

    float x_scale;

    float w_scale;

    float y_scale;

    TT x_zero;

    TTPARAM w_zero;

    TT y_zero;


    float scale;


  public:

    QLinearConvScalar (

      TTPARAM (&w)[M*C*KH*KW],

      int32_t (&b)[M],

      float x_scale,

      float w_scale,

      float y_scale,

      TT x_zero,

      TTPARAM w_zero,

      TT y_zero

    ): weights(w), bias(b), x_scale(x_scale), w_scale(w_scale), y_scale(y_scale), x_zero(x_zero), w_zero(w_zero), y_zero(y_zero) {

      scale = x_scale*w_scale/y_scale;

    };


        void filter(

            input_window<TT>* in,

            output_window<TT>* out

        );


        static void registerKernelClass() {

      static_assert((std::is_same<TT, int8_t>::value));

            REGISTER_FUNCTION(QLinearConvScalar::filter);

      REGISTER_PARAMETER(weights);

      REGISTER_PARAMETER(bias);

        }

};


/**
 * @brief Kernel class QLinearConv5x5: window input, window output, weights and
 *        bias held by reference.
 *
 * registerKernelClass() restricts it to TT == int8_t, KH == KW == 5,
 * GROUP == 1 and INP_W / OUT_W_PAD multiples of 16. The constructor and
 * filter() are declared here and defined elsewhere.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv5x5 {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;

    // Weight storage uses 16 entries per (m, c, kh) row.
    alignas(32) TTPARAM (&weights)[M * C * KH * 16];
    alignas(32) int32_t (&bias)[M];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter(); presumably a fixed-point form of the
    // combined scale (shift amount + 16-bit multiplier) — confirm in the
    // constructor definition.
    int scalebits;
    int16_t scale;

  public:
    /** Binds the weight/bias arrays and takes the quantization parameters. */
    QLinearConv5x5(
        TTPARAM (&w)[M * C * KH * 16],
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        output_window<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value);
      static_assert(KH == 5);
      static_assert(KW == 5);
      static_assert(GROUP == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      REGISTER_FUNCTION(QLinearConv5x5::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConv5x5Scale32bit: same member layout as a
 *        window-in / window-out convolution kernel, but with a 32-bit `scale`.
 *
 * registerKernelClass() requires TT == int8_t, GROUP == 1 and INP_W /
 * OUT_W_PAD multiples of 16.
 * NOTE(review): despite the "5x5" in the name there is no static_assert on
 * KH or KW in this class — confirm whether that is intentional.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv5x5Scale32bit {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;

    // Weight storage uses 16 entries per (m, c, kh) row.
    alignas(32) TTPARAM (&weights)[M * C * KH * 16];
    alignas(32) int32_t (&bias)[M];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter(); `scale` is 32 bits wide here.
    int scalebits;
    int32_t scale;

  public:
    /** Binds the weight/bias arrays and takes the quantization parameters. */
    QLinearConv5x5Scale32bit(
        TTPARAM (&w)[M * C * KH * 16],
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        output_window<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value);
      static_assert(GROUP == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      REGISTER_FUNCTION(QLinearConv5x5Scale32bit::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConv3x3: window input, window output, weights and
 *        bias held by reference.
 *
 * registerKernelClass() restricts it to TT == int8_t, KH == KW == 3,
 * GROUP == 1 and INP_W / OUT_W_PAD multiples of 16.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv3x3 {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // Weight storage uses 16 entries per (m, c) pair.
    alignas(32) TTPARAM (&weights)[M * C * 16];
    alignas(32) int32_t (&bias)[M];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the weight/bias arrays and takes the quantization parameters. */
    QLinearConv3x3(
        TTPARAM (&w)[M * C * 16],
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        output_window<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value);
      static_assert(KH == 3);
      static_assert(KW == 3);
      static_assert(GROUP == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      REGISTER_FUNCTION(QLinearConv3x3::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvScalarStream: window input, weights arriving
 *        on a stream, stream output; only the bias is held by reference.
 *
 * The class owns a CKK_ROW_SIZE buffer (`ckk_row`) of TTPARAM. The constructor
 * precomputes x_scale * w_scale / y_scale.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvScalarStream {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Combined factor x_scale * w_scale / y_scale.
    float scale;

  public:
    /**
     * Binds the bias array and stores the quantization parameters.
     * `scale` is declared last, so it is also initialized last; its
     * initializer reads the constructor parameters, not the members.
     */
    QLinearConvScalarStream(
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero)
        : bias(b),
          x_scale(x_scale),
          w_scale(w_scale),
          y_scale(y_scale),
          x_zero(x_zero),
          w_zero(w_zero),
          y_zero(y_zero),
          scale(x_scale * w_scale / y_scale) {}

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        input_stream<TTPARAM>* weights,
        output_stream<TT>* out);

    /** Checks that TT is int8_t or uint8_t, then registers filter() and bias. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      REGISTER_FUNCTION(QLinearConvScalarStream::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx4Stream: window input, streamed weights,
 *        stream output; bias held by reference.
 *
 * registerKernelClass() requires TT in {int8_t, uint8_t}, KW <= 4, INP_W and
 * OUT_W_PAD multiples of 16, and both strides in {1, 2, 4}.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4Stream {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET =
        (STEP_W == 1) ? 0x43322110
                      : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // A v32 data vector limits strides 2 and 4 to 8 computed values; KW <= 4.
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the bias array and takes the quantization parameters. */
    QLinearConvHx4Stream(
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        input_stream<TTPARAM>* weights,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 4);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4Stream::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx4StreamScale32bit: window input, streamed
 *        weights, stream output; bias held by reference; 32-bit `scale`.
 *
 * registerKernelClass() requires TT == int8_t, KW <= 4, INP_W and OUT_W_PAD
 * multiples of 16, and both strides in {1, 2, 4}.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4StreamScale32bit {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET =
        (STEP_W == 1) ? 0x43322110
                      : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // A v32 data vector limits strides 2 and 4 to 8 computed values; KW <= 4.
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter(); `scale` is 32 bits wide here.
    int scalebits;
    int32_t scale;

  public:
    /** Binds the bias array and takes the quantization parameters. */
    QLinearConvHx4StreamScale32bit(
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        input_stream<TTPARAM>* weights,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value);
      static_assert(KW <= 4);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4StreamScale32bit::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx4PktStream: packet-stream input, streamed
 *        weights, stream output; bias held by reference.
 *
 * The class owns an INP_SIZE buffer (`in`) of TT next to the `ckk_row` weight
 * buffer. registerKernelClass() requires TT in {int8_t, uint8_t}, KW <= 4,
 * INP_W and OUT_W_PAD multiples of 16, and both strides in {1, 2, 4}.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4PktStream {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);
    // Element count of the locally held input buffer.
    static constexpr int INP_SIZE = B * C * INP_H * INP_W;

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET =
        (STEP_W == 1) ? 0x43322110
                      : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // A v32 data vector limits strides 2 and 4 to 8 computed values; KW <= 4.
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    alignas(32) TT in[INP_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the bias array and takes the quantization parameters. */
    QLinearConvHx4PktStream(
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_pktstream* in_s,
        input_stream<TTPARAM>* weights,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 4);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4PktStream::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx4_0: window input, acc48 stream output;
 *        weights and bias held by reference.
 *
 * Only the weight zero point is stored; there are no scale members here.
 * The "_0" suffix and the acc48 output suggest this is the first stage of a
 * chained kernel — NOTE(review): confirm against the graph that wires it.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4_0 {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET =
        (STEP_W == 1) ? 0x43322110
                      : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // A v32 data vector limits strides 2 and 4 to 8 computed values; KW <= 4.
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE];
    alignas(32) int32_t (&bias)[M];
    TTPARAM w_zero;

  public:
    /** Binds the weight and bias arrays and stores the weight zero point. */
    QLinearConvHx4_0(
        TTPARAM (&w)[M * CKK_ROW_SIZE],
        int32_t (&b)[M],
        TTPARAM w_zero)
        : weights(w), bias(b), w_zero(w_zero) {}

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        output_stream<acc48>* cout);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 4);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4_0::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx4_1: window input plus an acc48 stream in,
 *        acc48 stream out; weights held by reference, no bias.
 *
 * The acc48 input and output suggest this is a middle stage of a chained
 * kernel — NOTE(review): confirm against the graph that wires it.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4_1 {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET =
        (STEP_W == 1) ? 0x43322110
                      : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // A v32 data vector limits strides 2 and 4 to 8 computed values; KW <= 4.
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE];
    TTPARAM w_zero;

  public:
    /** Binds the weight array and stores the weight zero point. */
    QLinearConvHx4_1(
        TTPARAM (&w)[M * CKK_ROW_SIZE],
        TTPARAM w_zero)
        : weights(w), w_zero(w_zero) {}

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        input_stream<acc48>* cin,
        output_stream<acc48>* cout);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 4);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4_1::filter);
      REGISTER_PARAMETER(weights);
    }
};


/**
 * @brief Kernel class QLinearConvHx4_2: window input plus an acc48 stream in,
 *        TT stream out; weights held by reference, no bias.
 *
 * Unlike the acc48-output variants it carries the full set of scale and
 * zero-point members. NOTE(review): presumably the final stage of a chained
 * kernel — confirm against the graph that wires it.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4_2 {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET =
        (STEP_W == 1) ? 0x43322110
                      : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // A v32 data vector limits strides 2 and 4 to 8 computed values; KW <= 4.
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the weight array and takes the quantization parameters. */
    QLinearConvHx4_2(
        TTPARAM (&w)[M * CKK_ROW_SIZE],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        input_stream<acc48>* cin,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 4);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4_2::filter);
      REGISTER_PARAMETER(weights);
    }
};


/**
 * @brief Kernel class QLinearConvHx4Stream_0: packet-stream input, streamed
 *        weights, acc48 stream output; bias held by reference.
 *
 * The constructor only receives the bias array and the weight zero point.
 * The remaining scale / zero-point members are declared but never set by it,
 * so they carry default member initializers to give them a defined value.
 * NOTE(review): they look unused for this stage — confirm in filter() before
 * removing them.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4Stream_0 {

  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
    // Element count of the locally held input buffer.
    static constexpr int INP_SIZE = B*C*INP_H*INP_W;

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // v32 data limits strides 2, 4 to compute 8 values, note KW <= 4
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    alignas(32) TT in[INP_SIZE];
    // Not set by the constructor: zero-initialized so no read is indeterminate.
    float x_scale = 0.0f;
    float w_scale = 0.0f;
    float y_scale = 0.0f;
    TT x_zero = 0;
    TTPARAM w_zero;
    TT y_zero = 0;

    // precomputation (not set by the constructor; zero-initialized)
    int scalebits = 0;
    int16_t scale = 0;

  public:
    /**
     * @param b       bias array, one int32 per M; bound by reference
     * @param w_zero  weight zero point
     */
    QLinearConvHx4Stream_0 (
      int32_t (&b)[M],
      TTPARAM w_zero
    ): bias(b), w_zero(w_zero) {}

    /** Kernel entry point; defined outside this header section. */
    void filter(
      input_pktstream* in_s,
      input_stream<TTPARAM>* weights,
      output_stream<acc48>* cout
    );

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
      static_assert(KW<=4);
      static_assert(INP_W%16==0);
      static_assert(OUT_W_PAD%16==0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4Stream_0::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx4Stream_1: packet-stream input, streamed
 *        weights, acc48 stream in and acc48 stream out; no bias, no
 *        registered parameters.
 *
 * The constructor only receives the weight zero point. The remaining scale /
 * zero-point members are declared but never set by it, so they carry default
 * member initializers to give them a defined value.
 * NOTE(review): they look unused for this stage — confirm in filter() before
 * removing them.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4Stream_1 {

  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M*((KH*KW+15)/16*16);
    // Element count of the locally held input buffer.
    static constexpr int INP_SIZE = B*C*INP_H*INP_W;

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // v32 data limits strides 2, 4 to compute 8 values, note KW <= 4
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    alignas(32) TT in[INP_SIZE];
    // Not set by the constructor: zero-initialized so no read is indeterminate.
    float x_scale = 0.0f;
    float w_scale = 0.0f;
    float y_scale = 0.0f;
    TT x_zero = 0;
    TTPARAM w_zero;
    TT y_zero = 0;

    // precomputation (not set by the constructor; zero-initialized)
    int scalebits = 0;
    int16_t scale = 0;

  public:
    /** @param w_zero  weight zero point */
    QLinearConvHx4Stream_1 (
      TTPARAM w_zero
    ): w_zero(w_zero) {}

    /** Kernel entry point; defined outside this header section. */
    void filter(
      input_pktstream* in_s,
      input_stream<TTPARAM>* weights,
      input_stream<acc48>* cin,
      output_stream<acc48>* cout
    );

    /** Compile-time configuration checks, then kernel registration. */
    static void registerKernelClass() {
      static_assert((std::is_same<TT, int8_t>::value) || (std::is_same<TT, uint8_t>::value));
      static_assert(KW<=4);
      static_assert(INP_W%16==0);
      static_assert(OUT_W_PAD%16==0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4Stream_1::filter);
    }
};


/**
 * @brief Kernel class QLinearConvHx4Stream_2: packet-stream input, streamed
 *        weights, acc48 stream in, TT stream out; no bias, no registered
 *        parameters.
 *
 * The constructor takes the full set of scales and zero points; it is
 * declared here and defined elsewhere.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx4Stream_2 {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);
    // Element count of the locally held input buffer.
    static constexpr int INP_SIZE = B * C * INP_H * INP_W;

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET =
        (STEP_W == 1) ? 0x43322110
                      : ((STEP_W == 2) ? 0x76543210 : 0xeca86420);
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // A v32 data vector limits strides 2 and 4 to 8 computed values; KW <= 4.
    static constexpr int W_LOOP_STEP    = (STEP_W == 1) ? 16 : 8;
    static constexpr int W_LOOP_IN_STEP = (STEP_W != 4) ? 16 : 32;

    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    alignas(32) TT in[INP_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Takes the quantization parameters. */
    QLinearConvHx4Stream_2(
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_pktstream* in_s,
        input_stream<TTPARAM>* weights,
        input_stream<acc48>* cin,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 4);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2 || STEP_H == 4);
      static_assert(STEP_W == 1 || STEP_W == 2 || STEP_W == 4);
      REGISTER_FUNCTION(QLinearConvHx4Stream_2::filter);
    }
};


/**
 * @brief Kernel class QLinearConvHx6x8bitStream: window input, streamed
 *        weights, stream output; bias held by reference.
 *
 * registerKernelClass() requires TT == int8_t, KW <= 6, INP_W and OUT_W_PAD
 * multiples of 16, and both strides equal to 1.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx6x8bitStream {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // 16 entries per kernel row, KH rows, for each channel of the group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * KH * 16;

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the bias array and takes the quantization parameters. */
    QLinearConvHx6x8bitStream(
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        input_stream<TTPARAM>* weights,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value);
      static_assert(KW <= 6);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1);
      static_assert(STEP_W == 1);
      REGISTER_FUNCTION(QLinearConvHx6x8bitStream::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx8: window input, stream output; weights
 *        and bias held by reference.
 *
 * registerKernelClass() requires TT in {int8_t, uint8_t}, KW <= 8, INP_W and
 * OUT_W_PAD multiples of 16, and both strides in {1, 2}.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx8 {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // KH*KW rounded up to a multiple of 16, times the channels per group.
    static constexpr int CKK_ROW_SIZE = C_PER_M * ((KH * KW + 15) / 16 * 16);

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // 16 when STEP_W == 1, otherwise 8.
    // NOTE(review): the comment previously attached here spoke of strides
    // 2, 4 and "KW <= 4" (v32 data limiting those strides to 8 values); this
    // class asserts KW <= 8 and strides 1 or 2, so that wording looks stale.
    static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;

    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE];
    alignas(32) int32_t (&bias)[M];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the weight/bias arrays and takes the quantization parameters. */
    QLinearConvHx8(
        TTPARAM (&w)[M * CKK_ROW_SIZE],
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 8);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);
      REGISTER_FUNCTION(QLinearConvHx8::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConvHx8PktStream: packet-stream input, stream
 *        output; weights and bias held by reference.
 *
 * The class owns an INP_SIZE buffer (`in`) of TT. registerKernelClass()
 * requires TT in {int8_t, uint8_t}, KW <= 8, INP_W and OUT_W_PAD multiples
 * of 16, both strides in {1, 2}, and GROUP == 1 (CKK_ROW_SIZE is computed
 * from C directly).
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConvHx8PktStream {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    // KH*KW rounded up to a multiple of 16, times C.
    static constexpr int CKK_ROW_SIZE = C * ((KH * KW + 15) / 16 * 16);
    // Element count of the locally held input buffer.
    static constexpr int INP_SIZE = B * C * INP_H * INP_W;

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x2110 : 0x3210;

    // 16 when STEP_W == 1, otherwise 8.
    // NOTE(review): the comment previously attached here spoke of strides
    // 2, 4 and "KW <= 4" (v32 data limiting those strides to 8 values); this
    // class asserts KW <= 8 and strides 1 or 2, so that wording looks stale.
    static constexpr int W_LOOP_STEP = (STEP_W == 1) ? 16 : 8;

    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE];
    alignas(32) int32_t (&bias)[M];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;
    alignas(32) TT in[INP_SIZE];

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the weight/bias arrays and takes the quantization parameters. */
    QLinearConvHx8PktStream(
        TTPARAM (&w)[M * CKK_ROW_SIZE],
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /**
     * Kernel entry point. Note that the parameter `in` has the same name as
     * the `in` member buffer and hides it inside the function body.
     */
    void filter(
        input_pktstream* in,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KW <= 8);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);
      static_assert(GROUP == 1);
      REGISTER_FUNCTION(QLinearConvHx8PktStream::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConv1x1Stream: window input, streamed weights,
 *        stream output; bias held by reference.
 *
 * registerKernelClass() requires TT in {int8_t, uint8_t}, KH == KW == 1,
 * INP_W and OUT_W_PAD multiples of 16, and both strides in {1, 2}.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv1x1Stream {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M = C / GROUP;
    // Channels per group rounded up to a multiple of 16.
    static constexpr int CKK_ROW_SIZE = (C_PER_M + 15) / 16 * 16;
    // (C_PER_M mod 16) minus one, truncated toward zero to an even value.
    static constexpr int LAST_C = (C_PER_M % 16 - 1) / 2 * 2;

    // Constants chosen by horizontal stride; presumably the offset/square
    // selectors passed to the MAC intrinsics in filter() — confirm there.
    static constexpr unsigned int MAC_ZOFFSET = (STEP_W == 1) ? 0x43322110 : 0x76543210;
    static constexpr unsigned int MAC_ZSQUARE = (STEP_W == 1) ? 0x3120 : 0x3210;

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the bias array and takes the quantization parameters. */
    QLinearConv1x1Stream(
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_window<TT>* in,
        input_stream<TTPARAM>* weights,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KH == 1);
      static_assert(KW == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);
      REGISTER_FUNCTION(QLinearConv1x1Stream::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConv1x1InputPackets: packet-stream input, stream
 *        output; weights and bias held by reference.
 *
 * The class owns an INP_SIZE buffer (`in`) of TT. registerKernelClass()
 * requires TT in {int8_t, uint8_t}, KH == KW == 1, INP_W and OUT_W_PAD
 * multiples of 16, and both strides in {1, 2}.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv1x1InputPackets {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    // Element count of the locally held input buffer.
    static constexpr int INP_SIZE = B * C * INP_H * INP_W;
    static constexpr int C_PER_M = C / GROUP;
    // Channels per group rounded up to a multiple of 16.
    static constexpr int CKK_ROW_SIZE = (C_PER_M + 15) / 16 * 16;
    // (C_PER_M mod 16) minus one, truncated toward zero to an even value.
    static constexpr int LAST_C = (C_PER_M % 16 - 1) / 2 * 2;

    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE];
    alignas(32) int32_t (&bias)[M];
    alignas(32) TT in[INP_SIZE];

    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the weight/bias arrays and takes the quantization parameters. */
    QLinearConv1x1InputPackets(
        TTPARAM (&w)[M * CKK_ROW_SIZE],
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_pktstream* in_s,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KH == 1);
      static_assert(KW == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);
      REGISTER_FUNCTION(QLinearConv1x1InputPackets::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * @brief Kernel class QLinearConv1x1StreamInputPackets: packet-stream input,
 *        streamed weights, stream output; bias held by reference.
 *
 * The class owns an INP_SIZE buffer (`in`) of TT and a CKK_ROW_SIZE buffer
 * (`ckk_row`) of TTPARAM. registerKernelClass() requires TT in
 * {int8_t, uint8_t}, KH == KW == 1, INP_W and OUT_W_PAD multiples of 16,
 * and both strides in {1, 2}.
 *
 * Template parameters (meanings inferred from their names; TODO confirm):
 *   TT / TTPARAM data / weight element types; INP_H, INP_W input size;
 *   OUT_W, OUT_W_PAD output width / padded width; STEP_H, STEP_W strides;
 *   B batch; C, M input / output channels; KH, KW kernel size; GROUP groups.
 */
template <typename TT, typename TTPARAM, int INP_H, int INP_W, int OUT_W, int OUT_W_PAD, int STEP_H, int STEP_W, int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv1x1StreamInputPackets {
  private:
    static constexpr int OUT_H = (INP_H - KH) / STEP_H + 1;
    // Element count of the locally held input buffer.
    static constexpr int INP_SIZE = B * C * INP_H * INP_W;
    static constexpr int C_PER_M = C / GROUP;
    // Channels per group rounded up to a multiple of 16.
    static constexpr int CKK_ROW_SIZE = (C_PER_M + 15) / 16 * 16;
    // (C_PER_M mod 16) minus one, truncated toward zero to an even value.
    static constexpr int LAST_C = (C_PER_M % 16 - 1) / 2 * 2;

    alignas(32) int32_t (&bias)[M];
    alignas(32) TTPARAM ckk_row[CKK_ROW_SIZE];
    alignas(32) TT in[INP_SIZE];

    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // Values prepared ahead of filter().
    int scalebits;
    int16_t scale;

  public:
    /** Binds the bias array and takes the quantization parameters. */
    QLinearConv1x1StreamInputPackets(
        int32_t (&b)[M],
        float x_scale,
        float w_scale,
        float y_scale,
        TT x_zero,
        TTPARAM w_zero,
        TT y_zero);

    /** Kernel entry point. */
    void filter(
        input_pktstream* in_s,
        input_stream<TTPARAM>* weights,
        output_stream<TT>* out);

    /** Compile-time configuration checks, then kernel/parameter registration. */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KH == 1);
      static_assert(KW == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);
      REGISTER_FUNCTION(QLinearConv1x1StreamInputPackets::filter);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * 1x1 QLinearConv kernel class that reads an input window and writes
 * acc48 accumulators to an output stream. It binds weights and bias as
 * kernel parameters and keeps only the weight zero point; it holds no
 * scale values.
 *
 * NOTE(review): the _0 suffix, the acc48 `cout` stream and the sibling
 * QLinearConv1x1_1 / QLinearConv1x1_2 classes suggest this is the first
 * stage of a chained computation -- confirm against the graph definition.
 *
 * Compile-time requirements (asserted when registerKernelClass() is
 * instantiated):
 *   - TT is int8_t or uint8_t
 *   - KH == 1 and KW == 1
 *   - INP_W and OUT_W_PAD are multiples of 16
 *   - STEP_H and STEP_W are each 1 or 2
 *
 * @tparam TT      element type of the input window
 * @tparam TTPARAM element type of the weights and of the weight zero point
 */
template <typename TT, typename TTPARAM,
          int INP_H, int INP_W, int OUT_W, int OUT_W_PAD,
          int STEP_H, int STEP_W,
          int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv1x1_0 {
  private:
    // ---- derived compile-time sizes ----
    static constexpr int OUT_H        = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M      = C / GROUP;                  // C divided by GROUP
    static constexpr int CKK_ROW_SIZE = ((C_PER_M + 15) / 16) * 16; // C_PER_M rounded up to a multiple of 16
    static constexpr int LAST_C       = (C_PER_M % 16) / 2;         // half of (C_PER_M mod 16), integer division

    // ---- kernel parameters (references to externally owned arrays) ----
    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE]; // M rows of CKK_ROW_SIZE weights
    alignas(32) int32_t (&bias)[M];                   // one bias value per row of weights

    TTPARAM w_zero; // weight zero point

  public:
    /**
     * Binds the weight and bias arrays and stores the weight zero point.
     *
     * @param w       weight array, M * CKK_ROW_SIZE elements
     * @param b       bias array, M elements
     * @param w_zero  weight zero point
     */
    QLinearConv1x1_0(
      TTPARAM (&w)[M * CKK_ROW_SIZE],
      int32_t (&b)[M],
      TTPARAM w_zero
    )
      : weights(w),
        bias(b),
        w_zero(w_zero)
    {}

    /**
     * Kernel entry point (defined out of line).
     *
     * @param in    input window of TT
     * @param cout  output stream of acc48
     */
    void filter(
      input_window<TT>* in,
      output_stream<acc48>* cout
    );

    /**
     * Validates the template arguments and registers filter() together
     * with the `weights` and `bias` parameters.
     */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KH == 1);
      static_assert(KW == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);

      REGISTER_FUNCTION(QLinearConv1x1_0::filter);
      REGISTER_PARAMETER(weights);
      REGISTER_PARAMETER(bias);
    }
};


/**
 * 1x1 QLinearConv kernel class that reads an input window plus an acc48
 * input stream and writes acc48 accumulators to an output stream. It binds
 * only weights as a kernel parameter and keeps the weight zero point; it
 * holds neither bias nor scale values.
 *
 * NOTE(review): the _1 suffix and the acc48 `cin` / `cout` streams suggest
 * this is an intermediate stage between QLinearConv1x1_0 and
 * QLinearConv1x1_2 -- confirm against the graph definition.
 *
 * Compile-time requirements (asserted when registerKernelClass() is
 * instantiated):
 *   - TT is int8_t or uint8_t
 *   - KH == 1 and KW == 1
 *   - INP_W and OUT_W_PAD are multiples of 16
 *   - STEP_H and STEP_W are each 1 or 2
 *
 * @tparam TT      element type of the input window
 * @tparam TTPARAM element type of the weights and of the weight zero point
 */
template <typename TT, typename TTPARAM,
          int INP_H, int INP_W, int OUT_W, int OUT_W_PAD,
          int STEP_H, int STEP_W,
          int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv1x1_1 {
  private:
    // ---- derived compile-time sizes ----
    static constexpr int OUT_H        = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M      = C / GROUP;                  // C divided by GROUP
    static constexpr int CKK_ROW_SIZE = ((C_PER_M + 15) / 16) * 16; // C_PER_M rounded up to a multiple of 16
    static constexpr int LAST_C       = (C_PER_M % 16) / 2;         // half of (C_PER_M mod 16), integer division

    // ---- kernel parameter (reference to an externally owned array) ----
    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE]; // M rows of CKK_ROW_SIZE weights

    TTPARAM w_zero; // weight zero point

  public:
    /**
     * Binds the weight array and stores the weight zero point.
     *
     * @param w       weight array, M * CKK_ROW_SIZE elements
     * @param w_zero  weight zero point
     */
    QLinearConv1x1_1(
      TTPARAM (&w)[M * CKK_ROW_SIZE],
      TTPARAM w_zero
    )
      : weights(w),
        w_zero(w_zero)
    {}

    /**
     * Kernel entry point (defined out of line).
     *
     * @param in    input window of TT
     * @param cin   input stream of acc48
     * @param cout  output stream of acc48
     */
    void filter(
      input_window<TT>* in,
      input_stream<acc48>* cin,
      output_stream<acc48>* cout
    );

    /**
     * Validates the template arguments and registers filter() together
     * with the `weights` parameter.
     */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KH == 1);
      static_assert(KW == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);

      REGISTER_FUNCTION(QLinearConv1x1_1::filter);
      REGISTER_PARAMETER(weights);
    }
};


/**
 * 1x1 QLinearConv kernel class that reads an input window plus an acc48
 * input stream and writes TT values to an output stream. It binds weights
 * as a kernel parameter and carries the full set of scale / zero-point
 * values; it holds no bias.
 *
 * Only declarations live here: the constructor and filter() are defined
 * out of line.
 *
 * NOTE(review): the _2 suffix, the acc48 `cin` stream and the TT `out`
 * stream suggest this is the final, requantizing stage after
 * QLinearConv1x1_0 / QLinearConv1x1_1 -- confirm against the graph
 * definition.
 *
 * Compile-time requirements (asserted when registerKernelClass() is
 * instantiated):
 *   - TT is int8_t or uint8_t
 *   - KH == 1 and KW == 1
 *   - INP_W and OUT_W_PAD are multiples of 16
 *   - STEP_H and STEP_W are each 1 or 2
 *
 * @tparam TT      element type of the input window, the output stream and
 *                 the x/y zero points
 * @tparam TTPARAM element type of the weights and of the weight zero point
 */
template <typename TT, typename TTPARAM,
          int INP_H, int INP_W, int OUT_W, int OUT_W_PAD,
          int STEP_H, int STEP_W,
          int B, int C, int M, int KH, int KW, int GROUP>
class QLinearConv1x1_2 {
  private:
    // ---- derived compile-time sizes ----
    static constexpr int OUT_H        = (INP_H - KH) / STEP_H + 1;
    static constexpr int C_PER_M      = C / GROUP;                  // C divided by GROUP
    static constexpr int CKK_ROW_SIZE = ((C_PER_M + 15) / 16) * 16; // C_PER_M rounded up to a multiple of 16
    static constexpr int LAST_C       = (C_PER_M % 16) / 2;         // half of (C_PER_M mod 16), integer division

    // ---- kernel parameter (reference to an externally owned array) ----
    alignas(32) TTPARAM (&weights)[M * CKK_ROW_SIZE]; // M rows of CKK_ROW_SIZE weights

    // ---- quantization parameters ----
    float x_scale;
    float w_scale;
    float y_scale;
    TT x_zero;
    TTPARAM w_zero;
    TT y_zero;

    // precomputation
    // NOTE(review): presumably derived from the float scales in the
    // out-of-line constructor -- confirm in the implementation file.
    int scalebits;
    int16_t scale;

  public:
    /**
     * Constructor (defined out of line).
     *
     * @param w        weight array, M * CKK_ROW_SIZE elements
     * @param x_scale  input scale
     * @param w_scale  weight scale
     * @param y_scale  output scale
     * @param x_zero   input zero point
     * @param w_zero   weight zero point
     * @param y_zero   output zero point
     */
    QLinearConv1x1_2(
      TTPARAM (&w)[M * CKK_ROW_SIZE],
      float x_scale,
      float w_scale,
      float y_scale,
      TT x_zero,
      TTPARAM w_zero,
      TT y_zero
    );

    /**
     * Kernel entry point (defined out of line).
     *
     * @param in   input window of TT
     * @param cin  input stream of acc48
     * @param out  output stream of TT
     */
    void filter(
      input_window<TT>* in,
      input_stream<acc48>* cin,
      output_stream<TT>* out
    );

    /**
     * Validates the template arguments and registers filter() together
     * with the `weights` parameter.
     */
    static void registerKernelClass() {
      static_assert(std::is_same<TT, int8_t>::value || std::is_same<TT, uint8_t>::value);
      static_assert(KH == 1);
      static_assert(KW == 1);
      static_assert(INP_W % 16 == 0);
      static_assert(OUT_W_PAD % 16 == 0);
      static_assert(STEP_H == 1 || STEP_H == 2);
      static_assert(STEP_W == 1 || STEP_W == 2);

      REGISTER_FUNCTION(QLinearConv1x1_2::filter);
      REGISTER_PARAMETER(weights);
    }
};

#endif // QLINEARCONV_H_

QLinearConv1x1InputPackets
Vector implementation for 1x1 QLinearConv; stores weights, requires data to be reshaped from (M,...
Definition qlinearconv.h:1077

QLinearConv1x1StreamInputPackets
Vector implementation for 1x1 QLinearConv; streams weights, requires data to be reshaped from (M,...
Definition qlinearconv.h:1140

QLinearConv1x1Stream
Vector implementation for 1x1 QLinearConv, requires data to be reshaped from (M,C,...
Definition qlinearconv.h:1015

QLinearConv3x3
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:207

QLinearConv5x5Scale32bit
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:153

QLinearConv5x5
Vector implementation for QLinearConv 5x5, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:96

QLinearConv5x5::filter
void filter(input_window< TT > *in, output_window< TT > *out)
Definition qlinearconv.cc:134

QLinearConvHx4PktStream
Vector implementation for Hx4 QLinearConv, padding with y_zero, requires data to be arranged in (M,...
Definition qlinearconv.h:450

QLinearConvHx4StreamScale32bit
Vector implementation for Hx4 QLinearConv using 32bit scale for precision, requires data to be arrang...
Definition qlinearconv.h:386

QLinearConvHx4Stream
Vector implementation for Hx4 QLinearConv, requires data to be arranged in [a,b,c,...
Definition qlinearconv.h:320

QLinearConvHx6x8bitStream
Vector implementation for Hx4 QLinearConv using int8xint8 MACs, requires data to be arranged in [a,...
Definition qlinearconv.h:828

QLinearConvHx8PktStream
Vector implementation for Hx8 QLinearConv, requires bias to be shifted, i.e. tbias - tw....
Definition qlinearconv.h:947

QLinearConvHx8
Vector implementation for Hx8 QLinearConv, requires bias to be shifted, i.e. tbias - tw....
Definition qlinearconv.h:884

QLinearConvScalarStream
Scalar implementation streaming weights, requires weights stream to be padded from MxCxKxK to MxCx16,...
Definition qlinearconv.h:266

QLinearConvScalar
Scalar implementation, QLinearConvScalar<30,32,28,32,1,1,1,1,6,5> total = 1282213,...
Definition qlinearconv.h:43