diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-09-25 21:46:49 +0300 |
---|---|---|
committer | Young Jin Kim <youki@microsoft.com> | 2019-09-25 21:46:49 +0300 |
commit | 604620b78663d2bed318efba0ceb6d3ebadd14fb (patch) | |
tree | 8c84921ecd5d49f95aa24891d3bc8ecc03a3eb5f | |
parent | d02815ffedbc46a3f8af1a3884efefd83668a401 (diff) |
All functions are running well on Windows
-rw-r--r-- | src/GenerateKernelU8S8S32ACC16.cc | 13 | ||||
-rw-r--r-- | src/GenerateKernelU8S8S32ACC16Avx512.cc | 13 | ||||
-rw-r--r-- | src/GenerateKernelU8S8S32ACC32.cc | 17 | ||||
-rw-r--r-- | src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc | 17 | ||||
-rw-r--r-- | src/GroupwiseConvAcc32Avx2.cc | 17 | ||||
-rw-r--r-- | src/PackDepthwiseConvMatrixAvx2.cc | 4 | ||||
-rw-r--r-- | test/GConvTest.cc | 8 | ||||
-rw-r--r-- | test/Im2ColFusedRequantizeTest.cc | 5 | ||||
-rw-r--r-- | test/PackedRequantizeAcc16Test.cc | 8 | ||||
-rw-r--r-- | test/PackedRequantizeTest.cc | 8 | ||||
-rw-r--r-- | test/RequantizeOnlyTest.cc | 4 | ||||
-rw-r--r-- | test/TestUtils.h | 9 | ||||
-rw-r--r-- | test/UniConvTest.cc | 5 |
13 files changed, 92 insertions, 36 deletions
diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc index cbd5877..205af14 100644 --- a/src/GenerateKernelU8S8S32ACC16.cc +++ b/src/GenerateKernelU8S8S32ACC16.cc @@ -105,7 +105,11 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs< extractDest128, CRegs(i * leadingDimCReg + j), idx); a->vpmovsxwd(extractDest256, extractDest128); x86::Mem destAddr = x86::dword_ptr( +#ifdef _MSC_VER + a->gpz(9), C_Offset, 0, (j * 2 + idx) * 8 * sizeof(int32_t)); +#else a->zcx(), C_Offset, 0, (j * 2 + idx) * 8 * sizeof(int32_t)); +#endif if (accum) { a->vpaddd(extractDest256, extractDest256, destAddr); } @@ -191,12 +195,21 @@ CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::getOrCreate<inst_set_t::avx2>( //"nc must be equal to the number of register blocks"); // arguments to the function created +#ifdef _MSC_VER + x86::Gp buffer_A = a->zcx(); + x86::Gp buffer_B = a->zdx(); + x86::Gp B_pf = a->gpz(8); + x86::Gp CBase = a->gpz(9); + x86::Gp kSize = a->zdi(); + x86::Gp ldcReg = a->zsi(); +#else x86::Gp buffer_A = a->zdi(); x86::Gp buffer_B = a->zsi(); x86::Gp B_pf = a->zdx(); x86::Gp CBase = a->zcx(); x86::Gp kSize = a->gpz(8); x86::Gp ldcReg = a->gpz(9); +#endif asmjit::FuncDetail func; func.init( diff --git a/src/GenerateKernelU8S8S32ACC16Avx512.cc b/src/GenerateKernelU8S8S32ACC16Avx512.cc index 512c8ba..819f33b 100644 --- a/src/GenerateKernelU8S8S32ACC16Avx512.cc +++ b/src/GenerateKernelU8S8S32ACC16Avx512.cc @@ -112,7 +112,11 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs< extractDest256, CRegs(i * leadingDimCReg + j), idx); a->vpmovsxwd(extractDest512, extractDest256); x86::Mem destAddr = x86::dword_ptr( +#ifdef _MSC_VER + a->gpz(9), C_Offset, 0, (j * 2 + idx) * 16 * sizeof(int32_t)); +#else a->zcx(), C_Offset, 0, (j * 2 + idx) * 16 * sizeof(int32_t)); +#endif if (accum) { a->vpaddd(extractDest512, extractDest512, destAddr); } @@ -203,12 +207,21 @@ CodeGenBase<uint8_t, int8_t, int32_t, 
int16_t>::getOrCreate<inst_set_t::avx512>( int mRegBlocksRem = mc % mRegBlockSize; // arguments to the function created +#ifdef _MSC_VER + x86::Gp buffer_A = a->zcx(); + x86::Gp buffer_B = a->zdx(); + x86::Gp B_pf = a->gpz(8); + x86::Gp CBase = a->gpz(9); + x86::Gp kSize = a->zdi(); + x86::Gp ldcReg = a->zsi(); +#else x86::Gp buffer_A = a->zdi(); x86::Gp buffer_B = a->zsi(); x86::Gp B_pf = a->zdx(); x86::Gp CBase = a->zcx(); x86::Gp kSize = a->gpz(8); x86::Gp ldcReg = a->gpz(9); +#endif asmjit::FuncDetail func; func.init( diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc index 226e974..dc9c534 100644 --- a/src/GenerateKernelU8S8S32ACC32.cc +++ b/src/GenerateKernelU8S8S32ACC32.cc @@ -107,10 +107,18 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs< a->vpaddd( CRegs(i * leadingDimCReg + j), CRegs(i * leadingDimCReg + j), +#ifdef _MSC_VER + x86::dword_ptr(a->gpz(9), C_Offset, 0, j * 8 * sizeof(int32_t))); +#else x86::dword_ptr(a->zcx(), C_Offset, 0, j * 8 * sizeof(int32_t))); +#endif } a->vmovups( +#ifdef _MSC_VER + x86::dword_ptr(a->gpz(9), C_Offset, 0, j * 8 * sizeof(int32_t)), +#else x86::dword_ptr(a->zcx(), C_Offset, 0, j * 8 * sizeof(int32_t)), +#endif CRegs(i * leadingDimCReg + j)); } } @@ -188,12 +196,21 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx2>( int mRegBlocksRem = mc % mRegBlockSize; // arguments to the function created +#ifdef _MSC_VER + x86::Gp buffer_A = a->zcx(); + x86::Gp buffer_B = a->zdx(); + x86::Gp B_pf = a->gpz(8); + x86::Gp CBase = a->gpz(9); + x86::Gp kSize = a->zdi(); + x86::Gp ldcReg = a->zsi(); +#else x86::Gp buffer_A = a->zdi(); x86::Gp buffer_B = a->zsi(); x86::Gp B_pf = a->zdx(); x86::Gp CBase = a->zcx(); x86::Gp kSize = a->gpz(8); x86::Gp ldcReg = a->gpz(9); +#endif asmjit::FuncDetail func; func.init( diff --git a/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc b/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc index 1d23e90..bd8be1f 100644 --- 
a/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc +++ b/src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc @@ -98,10 +98,18 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs< a->vpaddd( CRegs(i * leadingDimCReg + j), CRegs(i * leadingDimCReg + j), +#ifdef _MSC_VER + x86::dword_ptr(a->gpz(9), C_Offset, 0, j * 16 * sizeof(int32_t))); +#else x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t))); +#endif } a->vmovups( +#ifdef _MSC_VER + x86::dword_ptr(a->gpz(9), C_Offset, 0, j * 16 * sizeof(int32_t)), +#else x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)), +#endif CRegs(i * leadingDimCReg + j)); } } @@ -190,12 +198,21 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate< int mRegBlocksRem = mc % mRegBlockSize; // arguments to the function created +#ifdef _MSC_VER + x86::Gp buffer_A = a->zcx(); + x86::Gp buffer_B = a->zdx(); + x86::Gp B_pf = a->gpz(8); + x86::Gp CBase = a->gpz(9); + x86::Gp kSize = a->zdi(); + x86::Gp ldcReg = a->zsi(); +#else x86::Gp buffer_A = a->zdi(); x86::Gp buffer_B = a->zsi(); x86::Gp B_pf = a->zdx(); x86::Gp CBase = a->zcx(); x86::Gp kSize = a->gpz(8); x86::Gp ldcReg = a->gpz(9); +#endif asmjit::FuncDetail func; func.init( diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index d1e0fdd..396e792 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -1010,12 +1010,21 @@ jit_conv_kernel_fp GenConvKernel<2, int32_t>::getOrCreate<inst_set_t::avx2>( #endif // arguments to the function created +#ifdef _MSC_VER + in_acts_R_ = a->zcx(); + wghts_R_ = a->zdx(); + out_acts_R_ = a->gpz(8); + a_zero_pt_R_ = a->gpz(9); + H_R_ = a->zdi(); + W_R_ = a->zsi(); +#else in_acts_R_ = a->zdi(); wghts_R_ = a->zsi(); out_acts_R_ = a->zdx(); a_zero_pt_R_ = a->zcx(); H_R_ = a->gpz(8); W_R_ = a->gpz(9); +#endif row_offset_R_ = a->gpz(10); // register for temporary use @@ -1492,11 +1501,19 @@ GenConvKernel<2, int32_t>::getOrCreateRowOffset<inst_set_t::avx2>( #endif // arguments to 
the function created +#ifdef _MSC_VER + in_acts_R_ = a->zcx(); + a_zero_pt_R_ = a->zdx(); + H_R_ = a->gpz(8); + W_R_ = a->gpz(9); + row_offset_R_ = a->zdi(); +#else in_acts_R_ = a->zdi(); a_zero_pt_R_ = a->zsi(); H_R_ = a->zdx(); W_R_ = a->zcx(); row_offset_R_ = a->gpz(8); +#endif // register for temporary use scratchReg1_ = a->gpz(12); diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc index ab2e1f2..04c08f3 100644 --- a/src/PackDepthwiseConvMatrixAvx2.cc +++ b/src/PackDepthwiseConvMatrixAvx2.cc @@ -205,7 +205,11 @@ void PackedDepthWiseConvMatrix::unpack(int8_t* unpacked_data) { } PackedDepthWiseConvMatrix::~PackedDepthWiseConvMatrix() { +#ifdef _MSC_VER + _aligned_free(pmat_); +#else free(pmat_); +#endif } } // namespace fbgemm diff --git a/test/GConvTest.cc b/test/GConvTest.cc index 8c1fb82..982208b 100644 --- a/test/GConvTest.cc +++ b/test/GConvTest.cc @@ -25,14 +25,6 @@ using namespace std; using namespace fbgemm; -vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, - matrix_op_t::Transpose}; - -vector<QuantizationGranularity> qGranularityVals{ - QuantizationGranularity::TENSOR, - QuantizationGranularity::GROUP, - QuantizationGranularity::OUT_CHANNEL}; - namespace { class fbgemmGConvAcc32Test : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t>> {}; diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index b14303f..56df3c8 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -24,11 +24,6 @@ using namespace std; using namespace fbgemm; -vector<QuantizationGranularity> qGranularityVals{ - QuantizationGranularity::TENSOR, - QuantizationGranularity::GROUP, - QuantizationGranularity::OUT_CHANNEL}; - namespace { class fbgemmIm2colTest : public testing::TestWithParam<tuple<QuantizationGranularity, bool>> {}; diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 62b1303..8978150 100644 --- 
a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -26,14 +26,6 @@ using namespace std; using namespace fbgemm; -vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, - matrix_op_t::Transpose}; - -vector<QuantizationGranularity> qGranularityVals{ - QuantizationGranularity::TENSOR, - QuantizationGranularity::GROUP, - QuantizationGranularity::OUT_CHANNEL}; - namespace { class fbgemmu8s8acc16WithQuantGranularityTest : public testing::TestWithParam< diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index 5338243..15e7d55 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -25,14 +25,6 @@ using namespace std; using namespace fbgemm; -vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, - matrix_op_t::Transpose}; - -vector<QuantizationGranularity> qGranularityVals{ - QuantizationGranularity::TENSOR, - QuantizationGranularity::GROUP, - QuantizationGranularity::OUT_CHANNEL}; - namespace { class fbgemmu8s8acc32WithQuantGranularityTest : public testing::TestWithParam< diff --git a/test/RequantizeOnlyTest.cc b/test/RequantizeOnlyTest.cc index 2f73d49..94e8e7d 100644 --- a/test/RequantizeOnlyTest.cc +++ b/test/RequantizeOnlyTest.cc @@ -20,7 +20,7 @@ using namespace std; using namespace fbgemm; -vector<QuantizationGranularity> qGranularityVals{ +vector<QuantizationGranularity> qGranularityValsLocal{ QuantizationGranularity::TENSOR, QuantizationGranularity::OUT_CHANNEL}; @@ -42,7 +42,7 @@ INSTANTIATE_TEST_CASE_P( {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 20, 32}), // number of // cols ::testing::Bool(), // fuse relu - ::testing::ValuesIn(qGranularityVals))); // requantization granularity + ::testing::ValuesIn(qGranularityValsLocal))); // requantization granularity /** * Test for float bias diff --git a/test/TestUtils.h b/test/TestUtils.h index 2cb7b88..d320ae2 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -7,9 +7,18 @@ #pragma once #include <cmath> #include <vector> +#include 
"fbgemm/Fbgemm.h" namespace fbgemm { +static std::vector<matrix_op_t> transposeVals = { matrix_op_t::NoTranspose, + matrix_op_t::Transpose }; + +static std::vector<QuantizationGranularity> qGranularityVals = { + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL }; + /* * @brief Check and validate the buffers for reference and FBGEMM result. */ diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc index cead3a6..e9c7ba5 100644 --- a/test/UniConvTest.cc +++ b/test/UniConvTest.cc @@ -20,11 +20,6 @@ using namespace std; using namespace fbgemm; -vector<QuantizationGranularity> qGranularityVals{ - QuantizationGranularity::TENSOR, - QuantizationGranularity::GROUP, - QuantizationGranularity::OUT_CHANNEL}; - // clang-format off static vector<conv_param_t<>> GetShapes_() { vector<conv_param_t<>> shapes = { |