Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaya S Khudia <dskhudia@fb.com>2018-10-13 01:48:13 +0300
committerDaya S Khudia <dskhudia@fb.com>2018-10-31 00:56:00 +0300
commite85b5a12254fa47ca6b56236489253a68fd32104 (patch)
treed62190c53913c65e136fb26dc89bfab38144e2c3 /src/FbgemmI8Depthwise.h
Initial commit
Diffstat (limited to 'src/FbgemmI8Depthwise.h')
-rw-r--r--src/FbgemmI8Depthwise.h105
1 file changed, 105 insertions, 0 deletions
diff --git a/src/FbgemmI8Depthwise.h b/src/FbgemmI8Depthwise.h
new file mode 100644
index 0000000..bc62c84
--- /dev/null
+++ b/src/FbgemmI8Depthwise.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <cstdint>
+
+namespace fbgemm2
+{
+
+// Holds a pre-packed (layout-transformed) copy of a depth-wise convolution
+// filter, for consumption by the depthwise_* routines declared below.
+// KERNEL_PROD is the product of all kernels.
+// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3.
+template <int KERNEL_PROD>
+class PackedDepthWiseConvMatrix
+{
+ public:
+  // Packs the source filter into an internal buffer.
+  // K: number of channels.
+  // smat in RSG layout
+  //   (presumably R/S = kernel spatial dims, G = groups/channels —
+  //    the packing routine is defined elsewhere; confirm against the .cc).
+  PackedDepthWiseConvMatrix(int K, const std::int8_t *smat);
+  // Virtual: releases pmat_ (allocation/ownership implemented out of line).
+  virtual ~PackedDepthWiseConvMatrix();
+
+  // Read-only access to the packed filter buffer; owned by this object,
+  // valid only for the object's lifetime.
+  const std::int8_t* PackedMat() const {
+    return pmat_;
+  }
+
+ private:
+  int K_;              // number of channels
+  std::int8_t* pmat_;  // packed filter data
+}; // PackedDepthWiseConvMatrix
+
+// Convenience instantiations for 3x3 (2D) and 3x3x3 (3D) depth-wise filters.
+using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>;
+using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>;
+
+/**
+ * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8
+ * NOTE(review): the signature takes stride_h/stride_w, so stride does not
+ * appear to be fixed at 1 — confirm which strides the kernel supports and
+ * update this comment.
+ * Writes raw 32-bit accumulators; see the overload below for the
+ * requantizing (uint8-output) version.
+ * @params N Batch size
+ * @params H, W Input height/width
+ * @params K Number of channels (a multiple of 8)
+ * @params stride_h, stride_w Convolution strides
+ * @params A_zero_point Zero point of the quantized input A
+ * @params A The input image in NHWK layout
+ * @params Bp The pre-packed filter
+ * @params C Output buffer of 32-bit accumulators
+ * @params thread_id, num_threads Partition work among threads
+ */
+void depthwise_3x3_pad_1(
+    int N,
+    int H,
+    int W,
+    int K,
+    int stride_h,
+    int stride_w,
+    std::int32_t A_zero_point, const std::uint8_t* A,
+    const Packed3x3ConvMatrix& Bp,
+    std::int32_t* C,
+    int thread_id = 0, int num_threads = 1);
+
+/**
+ * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8
+ * This version is fused with requantization.
+ * NOTE(review): signature takes stride_h/stride_w, so stride is likely not
+ * fixed at 1 — confirm and update this comment.
+ * @params B_zero_point Zero point of the quantized filter (single value)
+ * @params C_multiplier Requantization scale (single value)
+ * @params C_zero_point Zero point of the quantized uint8 output
+ * @params C Output buffer of requantized uint8 values
+ * @params col_offsets Per-channel offsets, presumably for zero-point
+ *         correction — confirm against the implementation
+ * @params bias Per-channel bias (int32); nullability not visible here —
+ *         TODO confirm whether nullptr is accepted
+ * @params fuse_relu If true, a ReLU is fused into requantization
+ */
+void depthwise_3x3_pad_1(
+    int N, int H, int W, int K,
+    int stride_h, int stride_w,
+    std::int32_t A_zero_point, const std::uint8_t* A,
+    std::int32_t B_zero_point, const Packed3x3ConvMatrix& Bp,
+    float C_multiplier, std::int32_t C_zero_point,
+    std::uint8_t* C,
+    const std::int32_t* col_offsets, const std::int32_t* bias,
+    int thread_id = 0, int num_threads = 1, bool fuse_relu = false);
+
+/**
+ * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8
+ * This version is fused with requantization and uses per-channel quantization.
+ * NOTE(review): signature takes stride_h/stride_w, so stride is likely not
+ * fixed at 1 — confirm and update this comment.
+ * Unlike the overload above, B_zero_point and C_multiplier are pointers:
+ * one value per channel (presumably length K — TODO confirm).
+ * @params B_zero_point Per-channel zero points of the quantized filter
+ * @params C_multiplier Per-channel requantization scales
+ * @params C_zero_point Zero point of the quantized uint8 output (shared)
+ * @params col_offsets, bias Per-channel int32 arrays, as in the overload above
+ */
+void depthwise_3x3_per_channel_quantization_pad_1(
+    int N, int H, int W, int K,
+    int stride_h, int stride_w,
+    std::int32_t A_zero_point, const std::uint8_t* A,
+    const std::int32_t *B_zero_point, const Packed3x3ConvMatrix& Bp,
+    const float *C_multiplier, std::int32_t C_zero_point,
+    std::uint8_t* C,
+    const std::int32_t* col_offsets, const std::int32_t* bias,
+    int thread_id = 0, int num_threads = 1);
+
+/**
+ * Depth-wise 3x3x3 (3D) convolution, the T-dimension analogue of
+ * depthwise_3x3_pad_1 above. Writes raw 32-bit accumulators.
+ * Presumably pad=1 and K a multiple of 8 as in the 2D version — this
+ * declaration carries no documentation of its own; TODO confirm.
+ * @params T Temporal/depth extent of the input (NTHWK-style layout assumed
+ *         by analogy with NHWK above — confirm)
+ * @params stride_t Stride along the T dimension
+ */
+void depthwise_3x3x3_pad_1(
+    int N,
+    int T,
+    int H,
+    int W,
+    int K,
+    int stride_t,
+    int stride_h,
+    int stride_w,
+    std::int32_t A_zero_point, const std::uint8_t* A,
+    const Packed3x3x3ConvMatrix& Bp,
+    std::int32_t* C,
+    int thread_id = 0, int num_threads = 1);
+
+/**
+ * Depth-wise 3x3x3 (3D) convolution fused with requantization; 3D analogue
+ * of the requantizing depthwise_3x3_pad_1 overload above.
+ * Parameters mirror that overload, with T/stride_t added for the third
+ * dimension.
+ * NOTE(review): fuse_relu precedes thread_id/num_threads here, but follows
+ * them in the 2D overload — inconsistent default-argument ordering between
+ * the two APIs; callers passing positional trailing args must be careful.
+ */
+void depthwise_3x3x3_pad_1(
+    int N, int T, int H, int W, int K,
+    int stride_t, int stride_h, int stride_w,
+    std::int32_t A_zero_point, const std::uint8_t* A,
+    std::int32_t B_zero_point, const Packed3x3x3ConvMatrix& Bp,
+    float C_multiplier, std::int32_t C_zero_point,
+    std::uint8_t* C,
+    const std::int32_t* col_offsets, const std::int32_t* bias,
+    bool fuse_relu = false, int thread_id = 0, int num_threads = 1);
+
+} // namespace fbgemm2