diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-09-26 19:17:42 +0300 |
---|---|---|
committer | Young Jin Kim <youki@microsoft.com> | 2019-09-26 19:17:42 +0300 |
commit | 21f93c950b8b27918cd59c8f3139fb41ad1bd2c6 (patch) | |
tree | 4ee82a52176dcd658db70612e2d8c045cac66298 | |
parent | 13347764fc33d3220856185c4758059a8e43cd14 (diff) |
Linux memory fix
-rw-r--r-- | src/PackDepthwiseConvMatrixAvx2.cc | 22 |
1 files changed, 7 insertions, 15 deletions
diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc index 840a654..a84c469 100644 --- a/src/PackDepthwiseConvMatrixAvx2.cc +++ b/src/PackDepthwiseConvMatrixAvx2.cc @@ -46,15 +46,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // Allocate packed arrays int kernel_prod_aligned = (kernel_prod + 1) / 2 * 2; - //pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc(64, ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t))); -#ifdef _MSC_VER - pmat_ = (int8_t*)_aligned_malloc(((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t), 64); -#else - posix_memalign( - (void**)&pmat_, - 64, - ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t)); -#endif + pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc(64, ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t))); // Pack input matrix // The layout is optimized to use vpmaddubsw efficiently (see @@ -109,7 +101,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero for (int k1 = 0; k1 < K; k1 += 32) { - __m256i* b_v = new __m256i[kernel_prod]; + __m256i* b_v = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod * sizeof(__m256i), 64)); int remainder = K - k1; if (remainder < 32) { __m256i mask_v = _mm256_loadu_si256( @@ -126,7 +118,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 2 SIMD registers - __m256i* b_interleaved_epi16 = new __m256i[kernel_prod_aligned]; + __m256i* b_interleaved_epi16 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64)); __m256i zero_v = _mm256_setzero_si256(); for (int i = 0; i < kernel_prod_aligned / 2; ++i) { if (2 * i + 1 >= kernel_prod) { @@ -142,7 +134,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 4 SIMD registers - __m256i* b_interleaved_epi32 = new __m256i[kernel_prod_aligned]; + __m256i* b_interleaved_epi32 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64)); for (int i = 0; i < kernel_prod_aligned / 4; ++i) { b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16( b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]); @@ -164,9 +156,9 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( b_interleaved_epi32[i]); } - delete[] b_v; - delete[] b_interleaved_epi16; - delete[] b_interleaved_epi32; + FREE(b_v); + FREE(b_interleaved_epi16); + FREE(b_interleaved_epi32); } FREE(smat_transposed); } |