diff options
author | Zafar Takhirov <zaf@fb.com> | 2020-04-11 01:36:53 +0300 |
---|---|---|
committer | Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com> | 2020-04-11 01:39:59 +0300 |
commit | 41e97f33032e4a6a01b648f15c2971ceebe7030c (patch) | |
tree | e321a0eb13a61da50555741ba6962f2bce143db2 | |
parent | 15e3b9c3ada9b027931211f03c434a9b73ca7740 (diff) |
Revert D20956364: Move `transpose_simd` to TransposeUtils.cc
Differential Revision:
D20956364
Original commit changeset: 5ec1c8261a81
fbshipit-source-id: fa1875d9b9d0837c9115f6a624056984c92b2aeb
-rw-r--r-- | BUILD.bazel | 30 | ||||
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | defs.bzl | 15 | ||||
-rw-r--r-- | src/FbgemmBfloat16Convert.cc | 15 | ||||
-rw-r--r-- | src/FbgemmFloat16Convert.cc | 24 | ||||
-rw-r--r-- | src/RefImplementations.cc | 40 | ||||
-rw-r--r-- | src/TransposeUtils.cc | 52 | ||||
-rw-r--r-- | src/Utils.cc | 39 |
8 files changed, 98 insertions, 119 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index 4f75644..6173642 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,19 +1,13 @@ load("@bazel_skylib//lib:paths.bzl", "paths") load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") -load("defs.bzl", "get_fbgemm_avx2_srcs", "get_fbgemm_avx512_srcs", "get_fbgemm_base_srcs", "get_fbgemm_generic_srcs", "get_fbgemm_public_headers", "get_fbgemm_tests") +load("defs.bzl", "get_fbgemm_generic_srcs", "get_fbgemm_avx2_srcs", "get_fbgemm_avx512_srcs", "get_fbgemm_public_headers") cc_library( - name = "fbgemm_base", - srcs = get_fbgemm_base_srcs() + glob(["src/*.h"]), - includes = [ - "src", - ], - deps = [ - ":fbgemm_headers", - "@cpuinfo", - "@asmjit", + name = "fbgemm_src_headers", + hdrs = [ + "src/RefImplementations.h", ], - linkstatic = 1, + include_prefix = "fbgemm", ) cc_library( @@ -26,7 +20,10 @@ cc_library( deps = [ ":fbgemm_avx2", ":fbgemm_avx512", - ":fbgemm_base", + ":fbgemm_headers", + ":fbgemm_src_headers", + "@asmjit", + "@cpuinfo", ], linkstatic = 1, ) @@ -42,7 +39,8 @@ cc_library( "-masm=intel", ], deps = [ - ":fbgemm_base", + ":fbgemm_headers", + "@asmjit", ], linkstatic = 1, ) @@ -60,7 +58,7 @@ cc_library( "-masm=intel", ], deps = [ - ":fbgemm_base", + ":fbgemm_headers", ], linkstatic = 1, ) @@ -99,12 +97,12 @@ cc_library( [ cc_test( name = paths.split_extension(paths.basename(filename))[0], - size = "medium", + size = "small", srcs = [ filename, ], deps = [ ":test_utils", ], - ) for filename in get_fbgemm_tests() + ) for filename in ["test/GConvTest.cc", "test/TransposeTest.cc", "test/QuantUtilsTest.cc", "test/I64Test.cc"] ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 946855c..ce27196 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,7 @@ else() endif() # Define file lists -get_filelist("get_fbgemm_generic_srcs(with_base=True)" FBGEMM_GENERIC_SRCS) +get_filelist("get_fbgemm_generic_srcs()" FBGEMM_GENERIC_SRCS) get_filelist("get_fbgemm_avx2_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX2_SRCS) get_filelist("get_fbgemm_avx512_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX512_SRCS) get_filelist("get_fbgemm_public_headers()" FBGEMM_PUBLIC_HEADERS) @@ -1,11 +1,4 @@ -def get_fbgemm_base_srcs(): - return [ - "src/GenerateI8Depthwise.cc", - "src/RefImplementations.cc", - "src/Utils.cc", - ] - -def get_fbgemm_generic_srcs(with_base = False): +def get_fbgemm_generic_srcs(): return [ "src/EmbeddingSpMDM.cc", "src/EmbeddingSpMDMNBit.cc", @@ -18,6 +11,7 @@ def get_fbgemm_generic_srcs(with_base = False): "src/FbgemmFloat16Convert.cc", "src/FbgemmI64.cc", "src/FbgemmI8Spmdm.cc", + "src/GenerateI8Depthwise.cc", "src/GenerateKernelU8S8S32ACC16.cc", "src/GenerateKernelU8S8S32ACC16Avx512.cc", # Acc16 AVX512 JIT code gen "src/GenerateKernelU8S8S32ACC16Avx512VNNI.cc", @@ -34,10 +28,11 @@ def get_fbgemm_generic_srcs(with_base = False): "src/PackWeightMatrixForGConv.cc", "src/PackWeightsForConv.cc", "src/QuantUtils.cc", + "src/RefImplementations.cc", "src/RowWiseSparseAdagradFused.cc", "src/SparseAdagrad.cc", - "src/TransposeUtils.cc", - ] + (get_fbgemm_base_srcs() if with_base else []) + "src/Utils.cc", + ] def get_fbgemm_public_headers(): return [ diff --git a/src/FbgemmBfloat16Convert.cc b/src/FbgemmBfloat16Convert.cc index b0b6591..1a528d3 100644 --- a/src/FbgemmBfloat16Convert.cc +++ b/src/FbgemmBfloat16Convert.cc @@ -42,6 +42,21 @@ using namespace std; namespace fbgemm { +void FloatToBfloat16_ref(const float* src, bfloat16* dst, int size) { + for (int i = 0; i < size; i++) { + // Add 2^15 and right shift 16 to do round-nearest + dst[i] = (*reinterpret_cast<const uint32_t*>(src + i) + (1 << 15)) >> 16; + } +} + +void Bfloat16ToFloat_ref(const bfloat16* src, float* dst, int size) { + for (int i = 0; i < size; i++) { + uint32_t val_fp32 = + static_cast<uint32_t>(reinterpret_cast<const uint16_t*>(src)[i]) << 16; + reinterpret_cast<uint32_t*>(dst)[i] = val_fp32; + } +} + void FloatToBfloat16_simd(const float* src, bfloat16* dst, int size) { // Run time CPU detection if (cpuinfo_initialize()) { diff --git a/src/FbgemmFloat16Convert.cc b/src/FbgemmFloat16Convert.cc index a714dff..79e40a4 100644 --- a/src/FbgemmFloat16Convert.cc +++ b/src/FbgemmFloat16Convert.cc @@ -32,6 +32,30 @@ using namespace std; namespace fbgemm { +void FloatToFloat16_ref( + const float* src, + float16* dst, + int size, + bool do_clip) { + constexpr float FP16_MAX = 65504.f; + if (do_clip) { + for (int i = 0; i < size; i++) { + float cur_src = std::max(-FP16_MAX, std::min(src[i], FP16_MAX)); + dst[i] = cpu_float2half_rn(cur_src); + } + } else { + for (int i = 0; i < size; i++) { + dst[i] = cpu_float2half_rn(src[i]); + } + } +} + +void Float16ToFloat_ref(const float16* src, float* dst, int size) { + for (int i = 0; i < size; i++) { + dst[i] = cpu_half2float(src[i]); + } +} + void FloatToFloat16_simd( const float* src, float16* dst, diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index 86663ed..d0809db 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -8,7 +8,6 @@ #include "./RefImplementations.h" #include "fbgemm/FbgemmBuild.h" -#include "fbgemm/FbgemmConvert.h" #include "fbgemm/Types.h" #include <algorithm> @@ -20,45 +19,6 @@ using namespace std; namespace fbgemm { -void FloatToFloat16_ref( - const float* src, - float16* dst, - int size, - bool do_clip) { - constexpr float FP16_MAX = 65504.f; - if (do_clip) { - for (int i = 0; i < size; i++) { - float cur_src = std::max(-FP16_MAX, std::min(src[i], FP16_MAX)); - dst[i] = cpu_float2half_rn(cur_src); - } - } else { - for (int i = 0; i < size; i++) { - dst[i] = cpu_float2half_rn(src[i]); - } - } -} - -void Float16ToFloat_ref(const float16* src, float* dst, int size) { - for (int i = 0; i < size; i++) { - dst[i] = cpu_half2float(src[i]); - } -} - -void FloatToBfloat16_ref(const float* src, bfloat16* dst, int size) { - for (int i = 0; i < size; i++) { - // Add 2^15 and right shift 16 to do round-nearest - dst[i] = (*reinterpret_cast<const uint32_t*>(src + i) + (1 << 15)) >> 16; - } -} - -void Bfloat16ToFloat_ref(const bfloat16* src, float* dst, int size) { - for (int i = 0; i < size; i++) { - uint32_t val_fp32 = - static_cast<uint32_t>(reinterpret_cast<const uint16_t*>(src)[i]) << 16; - reinterpret_cast<uint32_t*>(dst)[i] = val_fp32; - } -} - void requantize_u8acc32_ref( int M, int N, diff --git a/src/TransposeUtils.cc b/src/TransposeUtils.cc deleted file mode 100644 index 27afa62..0000000 --- a/src/TransposeUtils.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -#define FBGEMM_EXPORTS -#include "./TransposeUtils.h" -#include "fbgemm/Utils.h" -#include <cstring> - -namespace fbgemm { - -void transpose_ref( - int M, - int N, - const float* src, - int ld_src, - float* dst, - int ld_dst) { - for (int j = 0; j < N; j++) { - for (int i = 0; i < M; i++) { - dst[i + j * ld_dst] = src[i * ld_src + j]; - } - } // for each output row -} - -void transpose_simd( - int M, - int N, - const float* src, - int ld_src, - float* dst, - int ld_dst) { - if ((M == 1 && ld_dst == 1) || (N == 1 && ld_src == 1)) { - if (dst != src) { - memcpy(dst, src, M * N * sizeof(float)); - } - return; - } - static const auto iset = fbgemmInstructionSet(); - // Run time CPU detection - if (isZmm(iset)) { - internal::transpose_avx512(M, N, src, ld_src, dst, ld_dst); - } else if (isYmm(iset)) { - internal::transpose_avx2(M, N, src, ld_src, dst, ld_dst); - } else { - transpose_ref(M, N, src, ld_src, dst, ld_dst); - } -} - -} // namespace fbgemm diff --git a/src/Utils.cc b/src/Utils.cc index 51fa2a8..266da6f 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -18,6 +18,7 @@ #include <stdexcept> #include <unordered_map> #include <unordered_set> +#include "./TransposeUtils.h" namespace fbgemm { @@ -169,6 +170,44 @@ template void printMatrix<int32_t>( size_t ld, std::string name); +void transpose_ref( + int M, + int N, + const float* src, + int ld_src, + float* dst, + int ld_dst) { + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + dst[i + j * ld_dst] = src[i * ld_src + j]; + } + } // for each output row +} + +void transpose_simd( + int M, + int N, + const float* src, + int ld_src, + float* dst, + int ld_dst) { + if ((M == 1 && ld_dst == 1) || (N == 1 && ld_src == 1)) { + if (dst != src) { + memcpy(dst, src, M * N * sizeof(float)); + } + return; + } + static const auto iset = fbgemmInstructionSet(); + // Run time CPU detection + if (isZmm(iset)) { + internal::transpose_avx512(M, N, src, ld_src, dst, ld_dst); + } else if (isYmm(iset)) { + internal::transpose_avx2(M, N, src, ld_src, dst, ld_dst); + } else { + transpose_ref(M, N, src, ld_src, dst, ld_dst); + } +} + namespace { inst_set_t g_forced_isa = inst_set_t::anyarch; bool g_Avx512_Ymm_enabled = false; |