Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaya Khudia <dskhudia@fb.com>2019-09-11 21:47:58 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2019-09-11 21:52:06 +0300
commit415035019ccbca2b11b62f1503fdd61e8bc59b10 (patch)
treea970118ee111cb6531ddc7527f720fd94c5fa0cf
parent685b1855d739868839cd8f774d987c1a4599b138 (diff)
API changes to take unquantized bias for depthwise conv
Summary: Changing interface for on-the-fly bias quantization. Also adding code to quantize bias on the fly. Reviewed By: jianyuh. Differential Revision: D17099709. fbshipit-source-id: 5cca79189c00710e703044350260a9fcaca77bb3
-rw-r--r--bench/DepthwiseBenchmark.cc1
-rw-r--r--include/fbgemm/FbgemmI8DepthwiseAvx2.h116
-rw-r--r--src/FbgemmConv.cc4
-rw-r--r--src/FbgemmI8DepthwiseAvx2.cc957
-rw-r--r--test/I8DepthwiseTest.cc1
5 files changed, 923 insertions, 156 deletions
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 4f36e6c..6b2f8b8 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -267,6 +267,7 @@ int main() {
col_offsets.data(),
bias.data(),
false, /* fuse_relu */
+ 1.0f, /* act_scale * w_scale */
tid,
num_threads);
}
diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h
index 98c4ed7..19946cf 100644
--- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h
+++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h
@@ -50,12 +50,39 @@ using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>;
using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>;
using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>;
+/** To be removed. Keeping it just to make sure we don't change C2 files and
+ * fbgemm files in a single diff
+ *
+ */
+FBGEMM_API void depthwise_3x3_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ std::int32_t A_zero_point,
+ const std::uint8_t* A,
+ std::int32_t B_zero_point,
+ const Packed3x3ConvMatrix& Bp,
+ float C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
+ const std::int32_t* bias,
+ bool fuse_relu = false,
+ int thread_id = 0,
+ int num_threads = 1);
+
/**
- * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with
- * requantization.
+ * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8
+ * This version is fused with requantization.
*
* @col_offsets nullptr if col_offsets are folded into bias
+ * @act_times_w_scale Only used if BIAS_TYPE is float, i.e., bias is
+ * unquantized.
*/
+template <typename BIAS_TYPE = std::int32_t>
FBGEMM_API void depthwise_3x3_pad_1(
int N,
int H,
@@ -71,8 +98,9 @@ FBGEMM_API void depthwise_3x3_pad_1(
std::int32_t C_zero_point,
std::uint8_t* C,
const std::int32_t* col_offsets,
- const std::int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu = false,
+ float act_times_w_scale = 1.0f,
int thread_id = 0,
int num_threads = 1);
@@ -82,6 +110,31 @@ FBGEMM_API void depthwise_3x3_pad_1(
*
* @col_offsets nullptr if col_offsets are folded into bias
*/
+template <typename BIAS_TYPE = std::int32_t>
+FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ std::int32_t A_zero_point,
+ const std::uint8_t* A,
+ const std::int32_t* B_zero_point,
+ const Packed3x3ConvMatrix& Bp,
+ const float* C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
+ const BIAS_TYPE* bias,
+ bool fuse_relu = false,
+ const float* act_times_w_scale = nullptr,
+ int thread_id = 0,
+ int num_threads = 1);
+
+/** To be removed. Keeping it just to make sure we don't change C2 files and
+ * fbgemm files in a single diff
+ */
FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
int N,
int H,
@@ -102,9 +155,35 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
int thread_id = 0,
int num_threads = 1);
+/** To be removed. Keeping it just to make sure we don't change C2 files and
+ * fbgemm files in a single diff
+ *
+ */
+FBGEMM_API void depthwise_3x3x3_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ std::int32_t A_zero_point,
+ const std::uint8_t* A,
+ std::int32_t B_zero_point,
+ const Packed3x3x3ConvMatrix& Bp,
+ float C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
+ const std::int32_t* bias,
+ bool fuse_relu = false,
+ int thread_id = 0,
+ int num_threads = 1);
/**
* @col_offsets nullptr if col_offsets are folded into bias
*/
+template <typename BIAS_TYPE = std::int32_t>
FBGEMM_API void depthwise_3x3x3_pad_1(
int N,
int T,
@@ -122,6 +201,33 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
std::int32_t C_zero_point,
std::uint8_t* C,
const std::int32_t* col_offsets,
+ const BIAS_TYPE* bias,
+ bool fuse_relu = false,
+ float act_times_w_scale = 1.0f,
+ int thread_id = 0,
+ int num_threads = 1);
+
+/** To be removed. Keeping it just to make sure we don't change C2 files and
+ * fbgemm files in a single diff
+ *
+ */
+FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ std::int32_t A_zero_point,
+ const std::uint8_t* A,
+ const std::int32_t* B_zero_point,
+ const Packed3x3x3ConvMatrix& Bp,
+ const float* C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
const std::int32_t* bias,
bool fuse_relu = false,
int thread_id = 0,
@@ -130,6 +236,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
/**
* @col_offsets nullptr if col_offsets are folded into bias
*/
+template <typename BIAS_TYPE = std::int32_t>
FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
int N,
int T,
@@ -147,8 +254,9 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
std::int32_t C_zero_point,
std::uint8_t* C,
const std::int32_t* col_offsets,
- const std::int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu = false,
+ const float* act_times_w_scale = nullptr,
int thread_id = 0,
int num_threads = 1);
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
index 164411d..6a1e55b 100644
--- a/src/FbgemmConv.cc
+++ b/src/FbgemmConv.cc
@@ -115,6 +115,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
+ 1.0f, // act_scale * weight_scale
thread_id,
num_threads);
} else if (
@@ -140,6 +141,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
+ nullptr, // act_scale * weight_scale
thread_id,
num_threads);
} else {
@@ -167,6 +169,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
+ 1.0f, // act_scale * weight_scale
thread_id,
num_threads);
} else if (
@@ -191,6 +194,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
+ nullptr, // act_scale * weight_scale
thread_id,
num_threads);
} else {
diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc
index 7454ef4..aa7b90e 100644
--- a/src/FbgemmI8DepthwiseAvx2.cc
+++ b/src/FbgemmI8DepthwiseAvx2.cc
@@ -10,6 +10,7 @@
#include <cassert>
#include <cmath> // for lrintf and sqrt
#include <tuple> // for tie
+#include <type_traits> // for is_same
#include <immintrin.h>
@@ -578,7 +579,8 @@ template <
bool HAS_BIAS,
bool PER_CHANNEL_QUANTIZATION,
bool A_SYMMETRIC,
- bool B_SYMMETRIC>
+ bool B_SYMMETRIC,
+ typename BIAS_TYPE>
static inline __attribute__((always_inline)) void requantize_(
int32_t A_zero_point,
const float* C_multiplier,
@@ -588,10 +590,16 @@ static inline __attribute__((always_inline)) void requantize_(
int n,
const int32_t* row_offsets,
const int32_t* col_offsets,
- const int32_t* bias) {
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale = nullptr) {
__m256 multiplier_v = _mm256_setzero_ps();
+ // Broadcasted reciprocal of act_times_w_scale
+ __m256 act_times_w_rcp_v = _mm256_setzero_ps();
if (!PER_CHANNEL_QUANTIZATION) {
multiplier_v = _mm256_set1_ps(*C_multiplier);
+ if (is_same<BIAS_TYPE, float>::value) {
+ act_times_w_rcp_v = _mm256_set1_ps(1.0f / (*act_times_w_scale));
+ }
}
__m256i min_v = _mm256_set1_epi8(static_cast<uint8_t>(0));
@@ -673,39 +681,95 @@ static inline __attribute__((always_inline)) void requantize_(
w_v = _mm256_sub_epi32(w_v, col_off_v);
}
+ // convert to float
+ __m256 xf_v, yf_v, zf_v, wf_v;
if (HAS_BIAS) { // static if
- x_v = _mm256_add_epi32(
- x_v, _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias + j)));
- y_v = _mm256_add_epi32(
- y_v,
- _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(bias + j + VLEN)));
- z_v = _mm256_add_epi32(
- z_v,
- _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(bias + j + 2 * VLEN)));
- w_v = _mm256_add_epi32(
- w_v,
- _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(bias + j + 3 * VLEN)));
+ if (is_same<BIAS_TYPE, float>::value) {
+ __m256 x_bias_v, y_bias_v, z_bias_v, w_bias_v;
+ if (PER_CHANNEL_QUANTIZATION) {
+ x_bias_v = _mm256_div_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 0 * VLEN)),
+ _mm256_loadu_ps(act_times_w_scale + j + 0 * VLEN));
+ y_bias_v = _mm256_div_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 1 * VLEN)),
+ _mm256_loadu_ps(act_times_w_scale + j + 1 * VLEN));
+ z_bias_v = _mm256_div_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 2 * VLEN)),
+ _mm256_loadu_ps(act_times_w_scale + j + 2 * VLEN));
+ w_bias_v = _mm256_div_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 3 * VLEN)),
+ _mm256_loadu_ps(act_times_w_scale + j + 3 * VLEN));
+ } else {
+ x_bias_v = _mm256_mul_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 0 * VLEN)),
+ act_times_w_rcp_v);
+ y_bias_v = _mm256_mul_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 1 * VLEN)),
+ act_times_w_rcp_v);
+ z_bias_v = _mm256_mul_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 2 * VLEN)),
+ act_times_w_rcp_v);
+ w_bias_v = _mm256_mul_ps(
+ _mm256_loadu_ps(
+ reinterpret_cast<const float*>(bias + j + 3 * VLEN)),
+ act_times_w_rcp_v);
+ }
+ xf_v = _mm256_add_ps(_mm256_cvtepi32_ps(x_v), x_bias_v);
+ yf_v = _mm256_add_ps(_mm256_cvtepi32_ps(y_v), y_bias_v);
+ zf_v = _mm256_add_ps(_mm256_cvtepi32_ps(z_v), z_bias_v);
+ wf_v = _mm256_add_ps(_mm256_cvtepi32_ps(w_v), w_bias_v);
+ } else {
+ x_v = _mm256_add_epi32(
+ x_v,
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(bias + j + 0 * VLEN)));
+ y_v = _mm256_add_epi32(
+ y_v,
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(bias + j + 1 * VLEN)));
+ z_v = _mm256_add_epi32(
+ z_v,
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(bias + j + 2 * VLEN)));
+ w_v = _mm256_add_epi32(
+ w_v,
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(bias + j + 3 * VLEN)));
+ xf_v = _mm256_cvtepi32_ps(x_v);
+ yf_v = _mm256_cvtepi32_ps(y_v);
+ zf_v = _mm256_cvtepi32_ps(z_v);
+ wf_v = _mm256_cvtepi32_ps(w_v);
+ }
+ } else {
+ xf_v = _mm256_cvtepi32_ps(x_v);
+ yf_v = _mm256_cvtepi32_ps(y_v);
+ zf_v = _mm256_cvtepi32_ps(z_v);
+ wf_v = _mm256_cvtepi32_ps(w_v);
}
if (PER_CHANNEL_QUANTIZATION) {
- multiplier_v = _mm256_loadu_ps(C_multiplier + j);
+ multiplier_v = _mm256_loadu_ps(C_multiplier + j + 0 * VLEN);
}
- __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ __m256 x_scaled_v = _mm256_mul_ps(xf_v, multiplier_v);
if (PER_CHANNEL_QUANTIZATION) {
- multiplier_v = _mm256_loadu_ps(C_multiplier + j + VLEN);
+ multiplier_v = _mm256_loadu_ps(C_multiplier + j + 1 * VLEN);
}
- __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v);
+ __m256 y_scaled_v = _mm256_mul_ps(yf_v, multiplier_v);
if (PER_CHANNEL_QUANTIZATION) {
multiplier_v = _mm256_loadu_ps(C_multiplier + j + 2 * VLEN);
}
- __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v);
+ __m256 z_scaled_v = _mm256_mul_ps(zf_v, multiplier_v);
if (PER_CHANNEL_QUANTIZATION) {
multiplier_v = _mm256_loadu_ps(C_multiplier + j + 3 * VLEN);
}
- __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v);
+ __m256 w_scaled_v = _mm256_mul_ps(wf_v, multiplier_v);
__m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v);
__m256i y_rounded_v = _mm256_cvtps_epi32(y_scaled_v);
@@ -745,15 +809,35 @@ static inline __attribute__((always_inline)) void requantize_(
x_v = _mm256_sub_epi32(x_v, col_off_v);
}
+ // Convert to float
+ __m256 xf_v;
if (HAS_BIAS) { // static if
- x_v = _mm256_add_epi32(
- x_v, _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias + j)));
+ if (is_same<BIAS_TYPE, float>::value) {
+ __m256 x_bias_v;
+ if (PER_CHANNEL_QUANTIZATION) {
+ x_bias_v = _mm256_div_ps(
+ _mm256_loadu_ps(reinterpret_cast<const float*>(bias + j)),
+ _mm256_loadu_ps(act_times_w_scale + j));
+ } else {
+ x_bias_v = _mm256_mul_ps(
+ _mm256_loadu_ps(reinterpret_cast<const float*>(bias + j)),
+ act_times_w_rcp_v);
+ }
+ xf_v = _mm256_add_ps(_mm256_cvtepi32_ps(x_v), x_bias_v);
+ } else {
+ x_v = _mm256_add_epi32(
+ x_v,
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias + j)));
+ xf_v = _mm256_cvtepi32_ps(x_v);
+ }
+ } else {
+ xf_v = _mm256_cvtepi32_ps(x_v);
}
if (PER_CHANNEL_QUANTIZATION) {
multiplier_v = _mm256_loadu_ps(C_multiplier + j);
}
- __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ __m256 x_scaled_v = _mm256_mul_ps(xf_v, multiplier_v);
__m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v);
__m256i x_packed_v = _mm256_adds_epi16(
@@ -779,11 +863,20 @@ static inline __attribute__((always_inline)) void requantize_(
if (!A_SYMMETRIC) {
raw -= A_zero_point * col_offsets[j];
}
+ float raw_f;
if (HAS_BIAS) { // static if
- raw += bias[j];
+ if (is_same<BIAS_TYPE, float>::value) {
+ raw_f = raw;
+ raw_f += bias[j] / act_times_w_scale[PER_CHANNEL_QUANTIZATION ? j : 0];
+ } else {
+ raw += bias[j];
+ raw_f = raw;
+ }
+ } else {
+ raw_f = raw;
}
- float ab = raw * C_multiplier[PER_CHANNEL_QUANTIZATION ? j : 0];
+ float ab = raw_f * C_multiplier[PER_CHANNEL_QUANTIZATION ? j : 0];
long rounded = lrintf(ab) + C_zero_point;
C_uint8[j] = std::max(
@@ -1165,7 +1258,12 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
}
}
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC>
+template <
+ bool FUSE_RELU,
+ bool HAS_BIAS,
+ bool A_SYMMETRIC,
+ bool B_SYMMETRIC,
+ typename BIAS_TYPE>
static inline __attribute__((always_inline)) void depthwise_3x3_kernel_(
int H,
int W,
@@ -1184,7 +1282,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_(
uint8_t* C_uint8,
int32_t* row_offsets,
const int32_t* col_offsets,
- const int32_t* bias) {
+ const BIAS_TYPE* bias,
+ float act_times_w_scale) {
constexpr int S = 3;
constexpr int PAD_T = 1, PAD_L = 1, PAD_R = 1;
int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
@@ -1229,7 +1328,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_(
HAS_BIAS,
false, /*PER_CHAN_QUANT*/
A_SYMMETRIC,
- B_SYMMETRIC>(
+ B_SYMMETRIC,
+ BIAS_TYPE>(
A_zero_point,
&C_multiplier,
C_zero_point,
@@ -1238,10 +1338,16 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_(
K,
row_offsets,
col_offsets,
- bias);
+ bias,
+ &act_times_w_scale);
}
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC>
+template <
+ bool FUSE_RELU,
+ bool HAS_BIAS,
+ bool A_SYMMETRIC,
+ bool B_SYMMETRIC,
+ typename BIAS_TYPE>
static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_(
int T,
int H,
@@ -1263,7 +1369,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_(
uint8_t* C_uint8,
int32_t* row_offsets,
const int32_t* col_offsets,
- const int32_t* bias) {
+ const BIAS_TYPE* bias,
+ float act_times_w_scale) {
constexpr int R = 3, S = 3;
constexpr int PAD_P = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1;
int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1;
@@ -1323,10 +1430,11 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_(
K,
row_offsets,
col_offsets,
- bias);
+ bias,
+ &act_times_w_scale);
}
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC>
+template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
static inline __attribute__((always_inline)) void
depthwise_3x3_per_channel_quantization_kernel_(
int H,
@@ -1346,7 +1454,8 @@ depthwise_3x3_per_channel_quantization_kernel_(
uint8_t* C_uint8,
int32_t* row_offsets,
const int32_t* col_offsets,
- const int32_t* bias) {
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale) {
constexpr int S = 3;
constexpr int PAD_T = 1, PAD_L = 1, PAD_R = 1;
int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
@@ -1397,7 +1506,8 @@ depthwise_3x3_per_channel_quantization_kernel_(
HAS_BIAS,
true, /*PER_CHAN_QUANT*/
A_SYMMETRIC,
- false /*B_SYMM*/>(
+ false, /*B_SYMM*/
+ BIAS_TYPE>(
A_zero_point,
C_multiplier,
C_zero_point,
@@ -1406,10 +1516,11 @@ depthwise_3x3_per_channel_quantization_kernel_(
K,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC>
+template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
static inline __attribute__((always_inline)) void
depthwise_3x3x3_per_channel_quantization_kernel_(
int T,
@@ -1432,7 +1543,8 @@ depthwise_3x3x3_per_channel_quantization_kernel_(
uint8_t* C_uint8,
int32_t* row_offsets,
const int32_t* col_offsets,
- const int32_t* bias) {
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale) {
constexpr int R = 3, S = 3;
constexpr int PAD_P = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1;
int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1;
@@ -1497,7 +1609,8 @@ depthwise_3x3x3_per_channel_quantization_kernel_(
K,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
static pair<int, int> closest_factors_(int n) {
@@ -1512,7 +1625,12 @@ static pair<int, int> closest_factors_(int n) {
// This implemntation should be general enough to handle not just 3x3 but other
// filter shapes by parameterizing with R and S but restricting it to just 3x3
// for now.
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC>
+template <
+ bool FUSE_RELU,
+ bool HAS_BIAS,
+ bool A_SYMMETRIC,
+ bool B_SYMMETRIC,
+ typename BIAS_TYPE>
static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
int N,
int H,
@@ -1529,7 +1647,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
int32_t* C_int32,
uint8_t* C_uint8,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
assert(K % 8 == 0);
@@ -1588,7 +1707,12 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
if (h_begin == 0) {
if (w_begin == 0) {
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1606,11 +1730,17 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) {
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1628,12 +1758,18 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
if (w_end == W_OUT) {
w = W_OUT - 1;
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1651,14 +1787,20 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
}
for (h = std::max(1, h_begin); h < std::min(H - 1, h_end); ++h) {
if (w_begin == 0) {
w = 0;
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1676,11 +1818,17 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) {
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1698,12 +1846,18 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
if (w_end == W_OUT) {
w = W_OUT - 1;
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1721,7 +1875,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
}
@@ -1729,7 +1884,12 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
h = H_OUT - 1;
w = 0;
if (w_begin == 0) {
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1747,11 +1907,17 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) {
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1769,12 +1935,18 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
if (w_end == W_OUT) {
w = W_OUT - 1;
- depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>(
+ depthwise_3x3_kernel_<
+ FUSE_RELU,
+ HAS_BIAS,
+ A_SYMMETRIC,
+ B_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -1792,13 +1964,19 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
}
} // for each n
};
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC>
+template <
+ bool FUSE_RELU,
+ bool HAS_BIAS,
+ bool A_SYMMETRIC,
+ bool B_SYMMETRIC,
+ typename BIAS_TYPE>
static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_(
int N,
int T,
@@ -1817,7 +1995,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_(
int32_t* C_int32,
uint8_t* C_uint8,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
assert(K % 8 == 0);
@@ -1901,14 +2080,15 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
} // w
} // h
} // t
} // for each n
};
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC>
+template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
static inline __attribute__((always_inline)) void
depthwise_3x3_per_channel_quantization_pad_1_(
int N,
@@ -1926,7 +2106,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
int32_t* C_int32,
uint8_t* C_uint8,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
assert(K % 8 == 0);
@@ -1988,7 +2169,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2006,14 +2188,16 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) {
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2031,7 +2215,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
if (w_end == W_OUT) {
@@ -2039,7 +2224,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2057,7 +2243,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
}
@@ -2067,7 +2254,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2085,14 +2273,16 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) {
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2110,7 +2300,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
if (w_end == W_OUT) {
@@ -2118,7 +2309,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2136,7 +2328,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
}
@@ -2147,7 +2340,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2165,14 +2359,16 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) {
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2190,7 +2386,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
if (w_end == W_OUT) {
@@ -2198,7 +2395,8 @@ depthwise_3x3_per_channel_quantization_pad_1_(
depthwise_3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
H,
W,
K,
@@ -2216,13 +2414,14 @@ depthwise_3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
}
}
} // for each n
};
-template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC>
+template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
static inline __attribute__((always_inline)) void
depthwise_3x3x3_per_channel_quantization_pad_1_(
int N,
@@ -2242,7 +2441,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
int32_t* C_int32,
uint8_t* C_uint8,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
assert(K % 8 == 0);
@@ -2304,7 +2504,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
depthwise_3x3x3_per_channel_quantization_kernel_<
FUSE_RELU,
HAS_BIAS,
- A_SYMMETRIC>(
+ A_SYMMETRIC,
+ BIAS_TYPE>(
T,
H,
W,
@@ -2325,7 +2526,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
C_uint8_base,
row_offsets,
col_offsets,
- bias);
+ bias,
+ act_times_w_scale);
} // w
} // h
} // t
@@ -2333,7 +2535,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
};
// Dispatch A_SYMMETRIC and B_SYMMETRIC
-template <bool FUSE_RELU, bool HAS_BIAS>
+template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE>
static void depthwise_3x3_pad_1_(
int N,
int H,
@@ -2349,7 +2551,8 @@ static void depthwise_3x3_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
int32_t C_int32_temp[(K + 31) / 32 * 32];
@@ -2359,7 +2562,8 @@ static void depthwise_3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
true /*A_symmetric*/,
- true /*B_symmetric*/>(
+ true /*B_symmetric*/,
+ BIAS_TYPE>(
N,
H,
W,
@@ -2376,6 +2580,7 @@ static void depthwise_3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
@@ -2383,7 +2588,8 @@ static void depthwise_3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
true /*A_symmetric*/,
- false /*B_symmetric*/>(
+ false /*B_symmetric*/,
+ BIAS_TYPE>(
N,
H,
W,
@@ -2400,6 +2606,7 @@ static void depthwise_3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
@@ -2409,7 +2616,8 @@ static void depthwise_3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
false /*A_symmetric*/,
- true /*B_symmetric*/>(
+ true /*B_symmetric*/,
+ BIAS_TYPE>(
N,
H,
W,
@@ -2426,6 +2634,7 @@ static void depthwise_3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
@@ -2433,7 +2642,8 @@ static void depthwise_3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
false /*A_symmetric*/,
- false /*B_symmetric*/>(
+ false /*B_symmetric*/,
+ BIAS_TYPE>(
N,
H,
W,
@@ -2450,6 +2660,7 @@ static void depthwise_3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
@@ -2457,7 +2668,7 @@ static void depthwise_3x3_pad_1_(
}
// Dispatch HAS_BIAS
-template <bool FUSE_RELU>
+template <bool FUSE_RELU, typename BIAS_TYPE>
static void depthwise_3x3_pad_1_(
int N,
int H,
@@ -2473,11 +2684,12 @@ static void depthwise_3x3_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
if (bias) {
- depthwise_3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/>(
+ depthwise_3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/, BIAS_TYPE>(
N,
H,
W,
@@ -2493,10 +2705,11 @@ static void depthwise_3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/>(
+ depthwise_3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/, BIAS_TYPE>(
N,
H,
W,
@@ -2512,6 +2725,7 @@ static void depthwise_3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
@@ -2519,6 +2733,7 @@ static void depthwise_3x3_pad_1_(
// Dispatch input shape and FUSE_RELU
// assumption: W > 3 and H > 3
+template <typename BIAS_TYPE>
void depthwise_3x3_pad_1(
int N,
int H,
@@ -2534,8 +2749,9 @@ void depthwise_3x3_pad_1(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
if (stride_h == 0 || stride_w == 0 || num_threads == 0) {
@@ -2548,7 +2764,7 @@ void depthwise_3x3_pad_1(
}
if (fuse_relu) {
if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) {
- depthwise_3x3_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2564,10 +2780,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) {
- depthwise_3x3_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2583,10 +2800,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (1 == stride_h && 1 == stride_w) {
- depthwise_3x3_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2602,10 +2820,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (2 == stride_h && 2 == stride_w) {
- depthwise_3x3_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2621,10 +2840,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2640,12 +2860,13 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
} else {
if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) {
- depthwise_3x3_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2661,10 +2882,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) {
- depthwise_3x3_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2680,10 +2902,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (1 == stride_h && 1 == stride_w) {
- depthwise_3x3_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2699,10 +2922,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (2 == stride_h && 2 == stride_w) {
- depthwise_3x3_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2718,10 +2942,11 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>(
N,
H,
W,
@@ -2737,6 +2962,7 @@ void depthwise_3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
@@ -2744,7 +2970,7 @@ void depthwise_3x3_pad_1(
}
// Dispatch A_SYMMETRIC and B_SYMMETRIC
-template <bool FUSE_RELU, bool HAS_BIAS>
+template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE>
static void depthwise_3x3x3_pad_1_(
int N,
int T,
@@ -2762,7 +2988,8 @@ static void depthwise_3x3x3_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
int32_t C_int32_temp[(K + 31) / 32 * 32];
@@ -2772,7 +2999,8 @@ static void depthwise_3x3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
true /*A_symmetric*/,
- true /*B_symmetric*/>(
+ true /*B_symmetric*/,
+ BIAS_TYPE>(
N,
T,
H,
@@ -2791,6 +3019,7 @@ static void depthwise_3x3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
@@ -2798,7 +3027,8 @@ static void depthwise_3x3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
true /*A_symmetric*/,
- false /*B_symmetric*/>(
+ false /*B_symmetric*/,
+ BIAS_TYPE>(
N,
T,
H,
@@ -2817,6 +3047,7 @@ static void depthwise_3x3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
@@ -2826,7 +3057,8 @@ static void depthwise_3x3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
false /*A_symmetric*/,
- true /*B_symmetric*/>(
+ true /*B_symmetric*/,
+ BIAS_TYPE>(
N,
T,
H,
@@ -2845,6 +3077,7 @@ static void depthwise_3x3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
@@ -2852,7 +3085,8 @@ static void depthwise_3x3x3_pad_1_(
FUSE_RELU,
HAS_BIAS,
false /*A_symmetric*/,
- false /*B_symmetric*/>(
+ false /*B_symmetric*/,
+ BIAS_TYPE>(
N,
T,
H,
@@ -2871,6 +3105,7 @@ static void depthwise_3x3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
@@ -2878,7 +3113,7 @@ static void depthwise_3x3x3_pad_1_(
}
// Dispatch HAS_BIAS
-template <bool FUSE_RELU>
+template <bool FUSE_RELU, typename BIAS_TYPE>
static void depthwise_3x3x3_pad_1_(
int N,
int T,
@@ -2896,11 +3131,12 @@ static void depthwise_3x3x3_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
if (bias) {
- depthwise_3x3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/>(
+ depthwise_3x3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/, BIAS_TYPE>(
N,
T,
H,
@@ -2918,10 +3154,11 @@ static void depthwise_3x3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/>(
+ depthwise_3x3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/, BIAS_TYPE>(
N,
T,
H,
@@ -2939,12 +3176,14 @@ static void depthwise_3x3x3_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
}
// Dispatch FUSE_RELU
+template <typename BIAS_TYPE>
void depthwise_3x3x3_pad_1(
int N,
int T,
@@ -2962,8 +3201,9 @@ void depthwise_3x3x3_pad_1(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu,
+ float act_times_w_scale,
int thread_id,
int num_threads) {
if (stride_t == 0 || stride_h == 0 || stride_w == 0 || num_threads == 0) {
@@ -2977,7 +3217,7 @@ void depthwise_3x3x3_pad_1(
return;
}
if (fuse_relu) {
- depthwise_3x3x3_pad_1_<true /*FUSE_RELU*/>(
+ depthwise_3x3x3_pad_1_<true /*FUSE_RELU*/, BIAS_TYPE>(
N,
T,
H,
@@ -2995,10 +3235,11 @@ void depthwise_3x3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3x3_pad_1_<false /*FUSE_RELU*/>(
+ depthwise_3x3x3_pad_1_<false /*FUSE_RELU*/, BIAS_TYPE>(
N,
T,
H,
@@ -3016,13 +3257,14 @@ void depthwise_3x3x3_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
}
// Dispatch A_SYMMETRIC
-template <bool FUSE_RELU, bool HAS_BIAS>
+template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE>
static void depthwise_3x3_per_channel_quantization_pad_1_(
int N,
int H,
@@ -3038,7 +3280,8 @@ static void depthwise_3x3_per_channel_quantization_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
int32_t C_int32_temp[(K + 31) / 32 * 32];
@@ -3046,7 +3289,8 @@ static void depthwise_3x3_per_channel_quantization_pad_1_(
depthwise_3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
HAS_BIAS,
- true /*A_SYMM*/>(
+ true /*A_SYMM*/,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3063,13 +3307,15 @@ static void depthwise_3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
depthwise_3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
HAS_BIAS,
- false /*A_SYMM*/>(
+ false /*A_SYMM*/,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3086,13 +3332,14 @@ static void depthwise_3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
}
// Dispatch HAS_BIAS
-template <bool FUSE_RELU>
+template <bool FUSE_RELU, typename BIAS_TYPE>
static void depthwise_3x3_per_channel_quantization_pad_1_(
int N,
int H,
@@ -3108,13 +3355,15 @@ static void depthwise_3x3_per_channel_quantization_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
if (bias) {
depthwise_3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
- true /* HAS_BIAS */>(
+ true /* HAS_BIAS */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3130,12 +3379,14 @@ static void depthwise_3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
depthwise_3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
- false /* HAS_BIAS */>(
+ false /* HAS_BIAS */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3151,12 +3402,14 @@ static void depthwise_3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
}
// Dispatch input shape and FUSE_RELU
+template <typename BIAS_TYPE>
void depthwise_3x3_per_channel_quantization_pad_1(
int N,
int H,
@@ -3172,8 +3425,9 @@ void depthwise_3x3_per_channel_quantization_pad_1(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
if (stride_h == 0 || stride_w == 0 || num_threads == 0) {
@@ -3186,7 +3440,9 @@ void depthwise_3x3_per_channel_quantization_pad_1(
}
if (fuse_relu) {
if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ true /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3202,10 +3458,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ true /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3221,10 +3480,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (1 == stride_h && 1 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ true /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3240,10 +3502,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (2 == stride_h && 2 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ true /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3259,10 +3524,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ true /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3278,12 +3546,15 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
} else {
if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ false /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3299,10 +3570,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ false /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3318,10 +3592,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (1 == stride_h && 1 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ false /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3337,10 +3614,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else if (2 == stride_h && 2 == stride_w) {
- depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ false /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3356,10 +3636,13 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3_per_channel_quantization_pad_1_<
+ false /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
H,
W,
@@ -3375,6 +3658,7 @@ void depthwise_3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
@@ -3382,7 +3666,7 @@ void depthwise_3x3_per_channel_quantization_pad_1(
}
// Dispatch A_SYMMETRIC
-template <bool FUSE_RELU, bool HAS_BIAS>
+template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE>
static void depthwise_3x3x3_per_channel_quantization_pad_1_(
int N,
int T,
@@ -3400,7 +3684,8 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
int32_t C_int32_temp[(K + 31) / 32 * 32];
@@ -3408,7 +3693,8 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
depthwise_3x3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
HAS_BIAS,
- true /*A_SYMM*/>(
+ true /*A_SYMM*/,
+ BIAS_TYPE>(
N,
T,
H,
@@ -3427,13 +3713,15 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
depthwise_3x3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
HAS_BIAS,
- false /*A_SYMM*/>(
+ false /*A_SYMM*/,
+ BIAS_TYPE>(
N,
T,
H,
@@ -3452,13 +3740,14 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
}
// Dispatch HAS_BIAS
-template <bool FUSE_RELU>
+template <bool FUSE_RELU, typename BIAS_TYPE>
static void depthwise_3x3x3_per_channel_quantization_pad_1_(
int N,
int T,
@@ -3476,13 +3765,15 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
if (bias) {
depthwise_3x3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
- true /* HAS_BIAS */>(
+ true /* HAS_BIAS */,
+ BIAS_TYPE>(
N,
T,
H,
@@ -3500,12 +3791,14 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
depthwise_3x3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
- false /* HAS_BIAS */>(
+ false /* HAS_BIAS */,
+ BIAS_TYPE>(
N,
T,
H,
@@ -3523,12 +3816,14 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
}
// Dispatch FUSE_RELU
+template <typename BIAS_TYPE>
void depthwise_3x3x3_per_channel_quantization_pad_1(
int N,
int T,
@@ -3546,8 +3841,9 @@ void depthwise_3x3x3_per_channel_quantization_pad_1(
int32_t C_zero_point,
uint8_t* C,
const int32_t* col_offsets,
- const int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu,
+ const float* act_times_w_scale,
int thread_id,
int num_threads) {
if (stride_t == 0 || stride_h == 0 || stride_w == 0 || num_threads == 0) {
@@ -3561,7 +3857,9 @@ void depthwise_3x3x3_per_channel_quantization_pad_1(
return;
}
if (fuse_relu) {
- depthwise_3x3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>(
+ depthwise_3x3x3_per_channel_quantization_pad_1_<
+ true /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
T,
H,
@@ -3579,10 +3877,13 @@ void depthwise_3x3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
} else {
- depthwise_3x3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>(
+ depthwise_3x3x3_per_channel_quantization_pad_1_<
+ false /* FUSE_RELU */,
+ BIAS_TYPE>(
N,
T,
H,
@@ -3600,9 +3901,361 @@ void depthwise_3x3x3_per_channel_quantization_pad_1(
C,
col_offsets,
bias,
+ act_times_w_scale,
thread_id,
num_threads);
}
}
+// To be removed
+void depthwise_3x3_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ int32_t B_zero_point,
+ const Packed3x3ConvMatrix& B,
+ float C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ int thread_id,
+ int num_threads) {
+ depthwise_3x3_pad_1<std::int32_t>(
+ N,
+ H,
+ W,
+ K,
+ stride_h,
+ stride_w,
+ A_zero_point,
+ A,
+ B_zero_point,
+ B,
+ C_multiplier,
+ C_zero_point,
+ C,
+ col_offsets,
+ bias,
+ fuse_relu,
+ 1.0f,
+ thread_id,
+ num_threads);
+}
+
+// To be removed
+void depthwise_3x3_per_channel_quantization_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ const int32_t* B_zero_point,
+ const Packed3x3ConvMatrix& Bp,
+ const float* C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ int thread_id,
+ int num_threads) {
+ depthwise_3x3_per_channel_quantization_pad_1<std::int32_t>(
+ N,
+ H,
+ W,
+ K,
+ stride_h,
+ stride_w,
+ A_zero_point,
+ A,
+ B_zero_point,
+ Bp,
+ C_multiplier,
+ C_zero_point,
+ C,
+ col_offsets,
+ bias,
+ fuse_relu,
+ nullptr,
+ thread_id,
+ num_threads);
+}
+
+// To be removed
+void depthwise_3x3x3_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ int32_t B_zero_point,
+ const Packed3x3x3ConvMatrix& B,
+ float C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ int thread_id,
+ int num_threads) {
+ depthwise_3x3x3_pad_1<int32_t>(
+ N,
+ T,
+ H,
+ W,
+ K,
+ stride_t,
+ stride_h,
+ stride_w,
+ A_zero_point,
+ A,
+ B_zero_point,
+ B,
+ C_multiplier,
+ C_zero_point,
+ C,
+ col_offsets,
+ bias,
+ fuse_relu,
+ 1.0f, // act_scale * weight_scale
+ thread_id,
+ num_threads);
+}
+
+void depthwise_3x3x3_per_channel_quantization_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ const int32_t* B_zero_point,
+ const Packed3x3x3ConvMatrix& B,
+ const float* C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ int thread_id,
+ int num_threads) {
+ depthwise_3x3x3_per_channel_quantization_pad_1(
+ N,
+ T,
+ H,
+ W,
+ K,
+ stride_t,
+ stride_h,
+ stride_w,
+ A_zero_point,
+ A,
+ B_zero_point,
+ B,
+ C_multiplier,
+ C_zero_point,
+ C,
+ col_offsets,
+ bias,
+ fuse_relu,
+ nullptr, // act_scale * weight_scale
+ thread_id,
+ num_threads);
+}
+
+template void depthwise_3x3_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ int32_t B_zero_point,
+ const Packed3x3ConvMatrix& B,
+ float C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ float act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
+template void depthwise_3x3_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ int32_t B_zero_point,
+ const Packed3x3ConvMatrix& B,
+ float C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const float* bias,
+ bool fuse_relu,
+ float act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
+template void depthwise_3x3_per_channel_quantization_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ const int32_t* B_zero_point,
+ const Packed3x3ConvMatrix& Bp,
+ const float* C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ const float* act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
+template void depthwise_3x3_per_channel_quantization_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ const int32_t* B_zero_point,
+ const Packed3x3ConvMatrix& Bp,
+ const float* C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const float* bias,
+ bool fuse_relu,
+ const float* act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
+template void depthwise_3x3x3_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ int32_t B_zero_point,
+ const Packed3x3x3ConvMatrix& B,
+ float C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ float act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
+template void depthwise_3x3x3_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ int32_t B_zero_point,
+ const Packed3x3x3ConvMatrix& B,
+ float C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const float* bias,
+ bool fuse_relu,
+ float act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
+template void depthwise_3x3x3_per_channel_quantization_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ const int32_t* B_zero_point,
+ const Packed3x3x3ConvMatrix& B,
+ const float* C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const int32_t* bias,
+ bool fuse_relu,
+ const float* act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
+template void depthwise_3x3x3_per_channel_quantization_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ int32_t A_zero_point,
+ const uint8_t* A,
+ const int32_t* B_zero_point,
+ const Packed3x3x3ConvMatrix& B,
+ const float* C_multiplier,
+ int32_t C_zero_point,
+ uint8_t* C,
+ const int32_t* col_offsets,
+ const float* bias,
+ bool fuse_relu,
+ const float* act_times_w_scale,
+ int thread_id,
+ int num_threads);
+
} // namespace fbgemm
diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc
index 00c39c0..8492acb 100644
--- a/test/I8DepthwiseTest.cc
+++ b/test/I8DepthwiseTest.cc
@@ -212,6 +212,7 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) {
a_symmetric ? nullptr : col_offsets.data(),
bias.data(),
false, /* fuse_relu */
+ 1.0f, /* act_scale * w_scale */
0,
1);