diff options
author | Jianyu Huang <jianyuhuang@fb.com> | 2020-04-10 08:55:34 +0300 |
---|---|---|
committer | Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com> | 2020-04-10 08:59:35 +0300 |
commit | 20b6cf14e2b547987234e39d922bc22ec541c3b4 (patch) | |
tree | 5459828c84d0d8dffdfe5a5b69f86557824abd4b | |
parent | f1d333089f9442e48af73abe73148516a3bab110 (diff) |
Move FakeFP16 back to internal to remove dependency on MKL (#36297)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/36297
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/343
We moved FakeFP16 back to closed source and kept the `RoundToFloat16` function in "fbgemm/FbgemmConvert.h".
This is because FakeFP16 introduced a dependency on MKL in the FBGEMM core. Also, it doesn't seem to be needed for open source, as it is not used anywhere.
Reviewed By: jspark1105
Differential Revision: D20937962
fbshipit-source-id: 9487a9fd2282b6df2f754c22bea36f2255a5c791
-rw-r--r-- | CMakeLists.txt | 4 | ||||
-rw-r--r-- | bench/BenchUtils.cc | 6 | ||||
-rw-r--r-- | bench/FakeFP16Benchmark.cc | 315 | ||||
-rw-r--r-- | defs.bzl | 8 | ||||
-rw-r--r-- | include/fbgemm/FbgemmConvert.h | 11 | ||||
-rw-r--r-- | include/fbgemm/FbgemmFakeFP16.h | 59 | ||||
-rw-r--r-- | src/FbgemmFakeFP16.cc | 214 | ||||
-rw-r--r-- | src/FbgemmFloat16Convert.cc | 34 | ||||
-rw-r--r-- | test/CMakeLists.txt | 7 | ||||
-rw-r--r-- | test/FakeFP16Test.cc | 169 |
10 files changed, 54 insertions, 773 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 01185a1..ce27196 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,10 +68,10 @@ else() endif() # Define file lists -get_filelist("get_fbgemm_generic_srcs(with_fp16=True)" FBGEMM_GENERIC_SRCS) +get_filelist("get_fbgemm_generic_srcs()" FBGEMM_GENERIC_SRCS) get_filelist("get_fbgemm_avx2_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX2_SRCS) get_filelist("get_fbgemm_avx512_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX512_SRCS) -get_filelist("get_fbgemm_public_headers(with_fp16=True)" FBGEMM_PUBLIC_HEADERS) +get_filelist("get_fbgemm_public_headers()" FBGEMM_PUBLIC_HEADERS) add_library(fbgemm_generic OBJECT ${FBGEMM_GENERIC_SRCS}) add_library(fbgemm_avx2 OBJECT ${FBGEMM_AVX2_SRCS}) diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc index 1b99096..1c16860 100644 --- a/bench/BenchUtils.cc +++ b/bench/BenchUtils.cc @@ -6,8 +6,8 @@ */ #include "./BenchUtils.h" -#include <cstring> #include <algorithm> +#include <cstring> #include <random> #include <type_traits> @@ -43,8 +43,8 @@ randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high); template void randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high); template void randFill<int>(aligned_vector<int>& vec, int low, int high); -//template void -//randFill<int64_t>(aligned_vector<int64_t>& vec, int64_t low, int64_t high); +// template void +// randFill<int64_t>(aligned_vector<int64_t>& vec, int64_t low, int64_t high); template <> void randFill(aligned_vector<int64_t>& vec, int64_t low, int64_t high) { std::uniform_int_distribution<int64_t> dis(low, high); diff --git a/bench/FakeFP16Benchmark.cc b/bench/FakeFP16Benchmark.cc deleted file mode 100644 index 079d87d..0000000 --- a/bench/FakeFP16Benchmark.cc +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ -#include <chrono> -#include <cmath> -#include <random> - -#ifdef USE_MKL -#include <mkl.h> -#endif - -#ifdef USE_BLAS -#if __APPLE__ -// not sure whether need to differentiate TARGET_OS_MAC or TARGET_OS_IPHONE, -// etc. -#include <Accelerate/Accelerate.h> -#else -#include <cblas.h> -#endif -#endif - -#ifdef _OPENMP -#include <omp.h> -#endif - -#include "bench/AlignedVec.h" -#include "bench/BenchUtils.h" -#include "fbgemm/FbgemmFakeFP16.h" - -#include <iomanip> -#include <iostream> - -using namespace std; -using namespace fbgemm; - -void performance_test() { - // cache flush - bool flush = true; - std::vector<char> llc; - if (flush) { - llc.resize(64L * 1024L * 1024L, 1.0); - } - - float alpha = 1.f, beta = 1.f; - matrix_op_t atrans = matrix_op_t::NoTranspose; - matrix_op_t btrans = matrix_op_t::Transpose; - -#define dataset 1 - -#if dataset == 1 - const int NITER = (flush) ? 10 : 100; - std::vector<std::vector<int>> shapes; - for (auto m = 1; m < 120; m++) { - // shapes.push_back({m, 128, 512}); - // shapes.push_back({m, 512, 512}); - shapes.push_back({m, 1024, 1024}); - } -#else - flush = false; - constexpr int NITER = 1; - std::vector<std::vector<int>> shapes; - std::random_device r; - std::default_random_engine generator(r()); - std::uniform_int_distribution<int> dm(1, 100); - std::uniform_int_distribution<int> dnk(1, 1024); - for (int i = 0; i < 1000; i++) { - int m = dm(generator); - int n = dnk(generator); - int k = dnk(generator); - shapes.push_back({m, n, k}); - } -#endif - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - cout << "WARNING: the timer may be inaccurate when used by multiple threads." 
- << endl; - cout << "M, " - << "N, " - << "K, " - << "Malloc (ms), " - << "A fp16->fp32 (ms), " - << "B fp16->fp32 (ms), " - << "C fp16->fp32 (ms), " - << "GEMM fp32 (ms), " - << "C fp32->fp16 (ms), " - << "Total (ms), " - << "GOPS, " - << "GB/s" << endl; -#else - cout << setw(7) << "M, " << setw(7) << "N, " << setw(7) << "K, " << setw(32) - << "Type, " << setw(18) << "GOPS, " << setw(5) << "GB/s" << endl; -#endif - - std::string type; - double gflops, gbs, ttot; - for (auto s : shapes) { - int m = s[0]; - int n = s[1]; - int k = s[2]; - - // initialize with small numbers - aligned_vector<float> A_ref(m * k); - aligned_vector<float> B_ref(k * n); - randFill(A_ref, 0.0f, 1.0f); - randFill(B_ref, 0.0f, 1.0f); - - aligned_vector<float16> A_float16(m * k); - aligned_vector<float16> B_float16(k * n); - aligned_vector<float16> C_float16(m * n); - - FloatToFloat16_ref(A_ref.data(), A_float16.data(), m * k); - FloatToFloat16_ref(B_ref.data(), B_float16.data(), k * n); - - aligned_vector<float> C_ref(m * n, NAN); - aligned_vector<float> C_fb(C_ref); - - if (beta != 0.0f) { - randFill(C_ref, 0.0f, 1.0f); - FloatToFloat16_ref(C_ref.data(), C_float16.data(), m * n); - C_fb = C_ref; - } - - double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER; - double nbytes = (4.0 * (double)m * (double)k + 2.0 * (double)k * (double)n + - 4.0 * (double)m * (double)n) * - NITER; - - // warm up MKL and fbgemm - // check correctness at the same time - for (auto w = 0; w < 3; w++) { -#if defined(USE_MKL) || defined(USE_BLAS) - cblas_sgemm( - CblasRowMajor, - atrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans, - btrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans, - m, - n, - k, - alpha, - A_ref.data(), - k, - B_ref.data(), - (btrans == matrix_op_t::NoTranspose) ? 
n : k, - beta, - C_ref.data(), - n); -#endif - - fbgemmFakeFP16( - atrans, - btrans, - m, - n, - k, - A_float16.data(), - B_float16.data(), - beta, - C_float16.data()); - Float16ToFloat_ref(C_float16.data(), C_fb.data(), m * n); - -#if defined(USE_MKL) || defined(USE_BLAS) - // Compare results - for (auto i = 0; i < C_ref.size(); i++) { - if (std::fabs(C_ref[i] - C_fb[i]) / C_ref[i] > k * 1.0 / 128) { - fprintf( - stderr, - "Error: too high diff between fp32 ref %f and float16 %f\n", - C_ref[i], - C_fb[i]); - return; - } - } -#endif - } - - chrono::time_point<chrono::system_clock> t_begin, t_end; -#if defined(USE_MKL) || defined(USE_BLAS) - // Gold via MKL sgemm -#if defined(USE_MKL) - type = "MKL_FP32"; -#else - type = "BLAS_FP32"; -#endif - ttot = 0; - for (auto it = -3; it < NITER; it++) { - if (flush) { - for (auto i = 0; i < llc.size(); i++) { - llc[i]++; - } - } - t_begin = chrono::system_clock::now(); - cblas_sgemm( - CblasRowMajor, - atrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans, - btrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans, - m, - n, - k, - alpha, - A_ref.data(), - k, - B_ref.data(), - (btrans == matrix_op_t::NoTranspose) ? 
n : k, - beta, - C_ref.data(), - n); - t_end = chrono::system_clock::now(); - if (it >= 0) { - double dt = chrono::duration<double>(t_end - t_begin).count(); - ttot += dt; - } - } - gflops = nflops / ttot / 1e9; - gbs = nbytes / ttot / 1e9; - - cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k << ", " - << type.c_str() << ", " << setw(5) << fixed << setw(5) - << setprecision(3) << gflops << ", " << setprecision(3) << gbs << endl; - ((volatile char*)(llc.data())); -#endif - - type = "FBGEMM_Float16"; - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - double total_malloc_time = 0.0; - double total_A_fp16_to_fp32_time = 0.0; - double total_B_fp16_to_fp32_time = 0.0; - double total_C_fp16_to_fp32_time = 0.0; - double total_computing_time = 0.0; - double total_C_fp32_to_fp16_time = 0.0; - double total_run_time = 0.0; -#endif - cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k << ", " - << type.c_str() << ", "; - - ttot = 0; - for (auto it = -3; it < NITER; it++) { -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - malloc_time = 0.0; - A_fp16_to_fp32_time = 0.0; - B_fp16_to_fp32_time = 0.0; - C_fp16_to_fp32_time = 0.0; - computing_time = 0.0; - C_fp32_to_fp16_time = 0.0; - run_time = 0.0; -#endif - - if (flush) { - for (auto i = 0; i < llc.size(); i++) { - llc[i]++; - } - } - - t_begin = chrono::system_clock::now(); - fbgemmFakeFP16( - atrans, - btrans, - m, - n, - k, - A_float16.data(), - B_float16.data(), - beta, - C_float16.data()); - t_end = chrono::system_clock::now(); - Float16ToFloat_ref(C_float16.data(), C_fb.data(), m * n); - if (it >= 0) { - double dt = chrono::duration<double>(t_end - t_begin).count(); - ttot += dt; - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - total_malloc_time += malloc_time; - total_A_fp16_to_fp32_time += A_fp16_to_fp32_time; - total_B_fp16_to_fp32_time += B_fp16_to_fp32_time; - total_C_fp16_to_fp32_time += C_fp16_to_fp32_time; - total_computing_time += computing_time; - total_C_fp32_to_fp16_time += C_fp32_to_fp16_time; - 
total_run_time += run_time; -#endif - } - } - gflops = nflops / ttot / 1e9; - gbs = nbytes / ttot / 1e9; - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - cout << fixed << total_malloc_time / (double)NITER / 1e6 << ", " - << total_A_fp16_to_fp32_time / (double)NITER / 1e6 << ", " - << total_B_fp16_to_fp32_time / (double)NITER / 1e6 << ", " - << total_C_fp16_to_fp32_time / (double)NITER / 1e6 << ", " - << total_computing_time / (double)NITER / 1e6 << ", " - << total_C_fp32_to_fp16_time / (double)NITER / 1e6 << ", "; -#endif - - cout << setw(5) << fixed << setw(5) << setprecision(3) - << ttot / NITER * 1000 << ", " << setprecision(3) << gflops << ", " - << setprecision(3) << gbs << endl; - cout << endl; - - ((volatile char*)(llc.data())); - } -} - -int main(int /*argc*/, char** /*argv*/) { -#ifdef _OPENMP - // Use 1 thread unless OMP_NUM_THREADS is explicit set. - const char* val = getenv("OMP_NUM_THREADS"); - if (val == nullptr || !*val) { - omp_set_num_threads(1); - } -#endif - performance_test(); -} @@ -1,4 +1,4 @@ -def get_fbgemm_generic_srcs(with_fp16 = False): +def get_fbgemm_generic_srcs(): return [ "src/EmbeddingSpMDM.cc", "src/EmbeddingSpMDMNBit.cc", @@ -32,9 +32,9 @@ def get_fbgemm_generic_srcs(with_fp16 = False): "src/RowWiseSparseAdagradFused.cc", "src/SparseAdagrad.cc", "src/Utils.cc", - ] + (["src/FbgemmFakeFP16.cc"] if with_fp16 else []) + ] -def get_fbgemm_public_headers(with_fp16 = False): +def get_fbgemm_public_headers(): return [ "include/fbgemm/Fbgemm.h", "include/fbgemm/FbgemmBuild.h", @@ -52,7 +52,7 @@ def get_fbgemm_public_headers(with_fp16 = False): "include/fbgemm/ConvUtils.h", "include/fbgemm/Types.h", "include/fbgemm/FbgemmI8Spmdm.h", - ] + (["include/fbgemm/FbgemmFakeFP16.h"] if with_fp16 else []) + ] def get_fbgemm_avx2_srcs(msvc = False): return [ diff --git a/include/fbgemm/FbgemmConvert.h b/include/fbgemm/FbgemmConvert.h index 2df5004..76eda02 100644 --- a/include/fbgemm/FbgemmConvert.h +++ b/include/fbgemm/FbgemmConvert.h @@ -136,4 +136,15 @@ 
FBGEMM_API void Float16ToFloat_avx2(const float16* src, float* dst, int size); */ FBGEMM_API void Float16ToFloat_avx512(const float16* src, float* dst, int size); +/** + * @brief Transform all entries in a matrix from fp32 to float16 and back to + * fp32. + */ +FBGEMM_API void RoundToFloat16( + const float* input, + float* output, + int len, + bool clamp = false, + bool clamp_denorms = false); + }; // namespace fbgemm diff --git a/include/fbgemm/FbgemmFakeFP16.h b/include/fbgemm/FbgemmFakeFP16.h deleted file mode 100644 index 3a9dc07..0000000 --- a/include/fbgemm/FbgemmFakeFP16.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -#pragma once - -#include "fbgemm/FbgemmConvert.h" -#include "fbgemm/Types.h" -#include "fbgemm/Utils.h" - -// Turning on this option will print out time breakdown of each stage (e.g., -// pre-processing fp32 -> fp16 conversion, the main GEMM kernel, post-processing -// fp16 -> fp32 conversion). Please note that currently this option won't report -// accurate timing if multiple threads are used. 
#define -// FBGEMM_MEASURE_TIME_BREAKDOWN - -// #define FBGEMM_MEASURE_TIME_BREAKDOWN - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN -#include <chrono> -#include <iostream> -extern double malloc_time; -extern double A_fp16_to_fp32_time; -extern double B_fp16_to_fp32_time; -extern double C_fp16_to_fp32_time; -extern double computing_time; -extern double C_fp32_to_fp16_time; -extern double run_time; -#endif - -namespace fbgemm { - -// typedef uint16_t float16; - -/** - * @ Transform all entries in a matrix from fp32 to float16 and back to fp32 - * - */ -FBGEMM_API void RoundToFloat16( - const float* input, - float* output, - int len, - bool clamp = false, - bool clamp_denorms = false); - -FBGEMM_API void fbgemmFakeFP16( - const matrix_op_t transa, - const matrix_op_t transb, - int m, - int n, - int k, - const float16* A_float16, - const float16* B_float16, - float beta, - float16* C_float16); - -}; // namespace fbgemm diff --git a/src/FbgemmFakeFP16.cc b/src/FbgemmFakeFP16.cc deleted file mode 100644 index 161653c..0000000 --- a/src/FbgemmFakeFP16.cc +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ -#include "fbgemm/FbgemmFakeFP16.h" -#include "fbgemm/FbgemmConvert.h" - -#include "src/RefImplementations.h" - -#ifdef USE_MKL -#include <mkl.h> -#endif - -#ifdef USE_BLAS -#include <cblas.h> -#endif - -#include <cpuinfo.h> -#include <memory> -#include <utility> -#include <vector> - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN -double malloc_time = 0.0; -double A_fp16_to_fp32_time = 0.0; -double B_fp16_to_fp32_time = 0.0; -double C_fp16_to_fp32_time = 0.0; -double computing_time = 0.0; -double C_fp32_to_fp16_time = 0.0; -double run_time = 0.0; -#endif - -using namespace std; - -namespace fbgemm { - -void RoundToFloat16( - const float* input, - float* output, - int size, - bool clamp, - bool clamp_denorms) { - std::vector<fbgemm::float16> data_fp16(size); - fbgemm::FloatToFloat16_simd(input, &(data_fp16[0]), size); - fbgemm::Float16ToFloat_simd(&(data_fp16[0]), output, size); - - if (clamp) { - // TODO: Use intrinsics to optimize clamping performance. - for (int i = 0; i < size; ++i) { - output[i] = std::max(std::min(output[i], 65504.0f), -65504.0f); - } - } - - if (clamp_denorms) { - union epsilon_t { - float f; - uint32_t i; - }; - - union epsilon_t epsilon; - epsilon.i = 0x38800000u; // 1 / 16384 - - for (int i = 0; i < size; ++i) { - if (std::abs(output[i]) < epsilon.f) { - output[i] = 0.0; - } - } - } -} - -void fbgemmFakeFP16( - const matrix_op_t transa, - const matrix_op_t transb, - int m, - int n, - int k, - const float16* A_float16, - const float16* B_float16, - float beta, - float16* C_float16) { -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start, - t_start, t_end; - double dt; - t_start = std::chrono::high_resolution_clock::now(); - t_very_start = std::chrono::high_resolution_clock::now(); -#endif - - // float16 -> fp32 - float* A_fp32 = - static_cast<float*>(fbgemmAlignedAlloc(64, m * k * sizeof(float))); - float* B_fp32 = - static_cast<float*>(fbgemmAlignedAlloc(64, k * n * sizeof(float))); - 
float* C_fp32 = - static_cast<float*>(fbgemmAlignedAlloc(64, m * n * sizeof(float))); - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - malloc_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); -#endif - - int lda = (transa == matrix_op_t::NoTranspose ? k : m); - int ldb = (transb == matrix_op_t::NoTranspose ? n : k); - int ldc = n; - - // Float16ToFloat_ref(A_float16, A_fp32, m * k); - Float16ToFloat_simd(A_float16, A_fp32, m * k); - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - A_fp16_to_fp32_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); -#endif - - // Float16ToFloat_ref(B_float16, B_fp32, k * n); - Float16ToFloat_simd(B_float16, B_fp32, k * n); - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - B_fp16_to_fp32_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); -#endif - - if (beta != 0.0f) { - // Float16ToFloat_ref(C_float16, C_fp32, m * n); - Float16ToFloat_simd(C_float16, C_fp32, m * n); - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - C_fp16_to_fp32_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); -#endif - } - - // fp32 GEMM - float alpha = 1.f; -#if defined(USE_MKL) || defined(USE_BLAS) - cblas_sgemm( - CblasRowMajor, - transa == matrix_op_t::NoTranspose ? CblasNoTrans : CblasTrans, - transb == matrix_op_t::NoTranspose ? 
CblasNoTrans : CblasTrans, - m, - n, - k, - alpha, - A_fp32, - lda, - B_fp32, - ldb, - beta, - C_fp32, - ldc); -#else - cblas_sgemm_ref( - transa, - transb, - m, - n, - k, - alpha, - A_fp32, - lda, - B_fp32, - ldb, - beta, - C_fp32, - ldc); -#endif - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - computing_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); -#endif - - // fp32 -> float16 - // FloatToFloat16_ref(C_fp32, C_float16, m * n); - FloatToFloat16_simd(C_fp32, C_float16, m * n); - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - C_fp32_to_fp16_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); -#endif - - free(A_fp32); - free(B_fp32); - free(C_fp32); - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = - std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_very_start) - .count(); - run_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); -#endif -} -} // namespace fbgemm diff --git a/src/FbgemmFloat16Convert.cc b/src/FbgemmFloat16Convert.cc index 0373e62..79e40a4 100644 --- a/src/FbgemmFloat16Convert.cc +++ b/src/FbgemmFloat16Convert.cc @@ -92,4 +92,38 @@ void Float16ToFloat_simd(const float16* src, float* dst, int size) { } } +void RoundToFloat16( + const float* input, + float* output, + int size, + bool clamp, + bool clamp_denorms) { + std::vector<fbgemm::float16> data_fp16(size); + FloatToFloat16_simd(input, &(data_fp16[0]), size); + Float16ToFloat_simd(&(data_fp16[0]), output, size); + + if (clamp) { + // TODO: Use intrinsics to optimize clamping performance. 
+ for (int i = 0; i < size; ++i) { + output[i] = std::max(std::min(output[i], 65504.0f), -65504.0f); + } + } + + if (clamp_denorms) { + union epsilon_t { + float f; + uint32_t i; + }; + + union epsilon_t epsilon; + epsilon.i = 0x38800000u; // 1 / 16384 + + for (int i = 0; i < size; ++i) { + if (std::abs(output[i]) < epsilon.f) { + output[i] = 0.0; + } + } + } +} + } // namespace fbgemm diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b8ee9fe..336b0e8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -70,13 +70,6 @@ endmacro() file(GLOB TEST_LIST "*Test.cc") -# Temporary exclude Float16Test.cc on windows platform until -# we figure out how to repro and debug the issue -if(MSVC) - file(GLOB BLACKLIST_MSVC "*FakeFP16Test.cc") - list(REMOVE_ITEM TEST_LIST "${BLACKLIST_MSVC}") -endif() - foreach(TEST_FILE ${TEST_LIST}) get_filename_component(TEST_NAME "${TEST_FILE}" NAME_WE) get_filename_component(TEST_FILE_ONLY "${TEST_FILE}" NAME) diff --git a/test/FakeFP16Test.cc b/test/FakeFP16Test.cc deleted file mode 100644 index bccb327..0000000 --- a/test/FakeFP16Test.cc +++ /dev/null @@ -1,169 +0,0 @@ -#include <gtest/gtest.h> -#include <cmath> -#include <random> - -#ifdef USE_MKL -#include <mkl.h> -#endif - -#ifdef USE_BLAS -#include <cblas.h> -#endif - -#ifdef _OPENMP -#include <omp.h> -#endif - -#include "bench/BenchUtils.h" -#include "fbgemm/FbgemmFakeFP16.h" -#include "src/RefImplementations.h" - -using namespace std; -using namespace fbgemm; - -namespace { -// The template parameter is transpose of A and B -class FBGemmFloat16Test - : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {}; -}; // namespace - -INSTANTIATE_TEST_CASE_P( - InstantiationName, - FBGemmFloat16Test, - ::testing::Values( - pair<matrix_op_t, matrix_op_t>( - matrix_op_t::NoTranspose, - matrix_op_t::NoTranspose), - pair<matrix_op_t, matrix_op_t>( - matrix_op_t::NoTranspose, - matrix_op_t::Transpose), - pair<matrix_op_t, matrix_op_t>( - matrix_op_t::Transpose, - 
matrix_op_t::NoTranspose), - pair<matrix_op_t, matrix_op_t>( - matrix_op_t::Transpose, - matrix_op_t::Transpose))); - -TEST_P(FBGemmFloat16Test, Test) { -#ifdef _OPENMP - // Use 1 thread unless OMP_NUM_THREADS is explicit set. - const char* val = getenv("OMP_NUM_THREADS"); - if (val == nullptr || !*val) { - omp_set_num_threads(1); - } -#endif - vector<vector<int>> shapes; - random_device r; - default_random_engine generator(r()); - uniform_int_distribution<int> dm(1, 256); - uniform_int_distribution<int> dnk(1, 1024); - for (int i = 0; i < 5; i++) { - int m = dm(generator); - int n = dnk(generator); - int k = dnk(generator); - shapes.push_back({m, n, k}); - } - - float alpha = 1.f, beta = 0.f; - matrix_op_t atrans, btrans; - tie(atrans, btrans) = GetParam(); - - for (auto s : shapes) { - int m = s[0]; - int n = s[1]; - int k = s[2]; - - cerr << "m = " << m << " n = " << n << " k = " << k; - if (atrans == matrix_op_t::Transpose) { - cerr << " A_transposed"; - } - if (btrans == matrix_op_t::Transpose) { - cerr << " B_transposed"; - } - cerr << endl; - - // initialize with small numbers - aligned_vector<float> A_ref(m * k); - aligned_vector<float> B_ref(k * n); - randFill(A_ref, 0.0f, 4.0f); - randFill(B_ref, 0.0f, 4.0f); - // randFill(A_ref, -1.0f, 1.0f); - // randFill(B_ref, -1.0f, 1.0f); - - aligned_vector<float16> A_float16(m * k); - aligned_vector<float16> B_float16(k * n); - aligned_vector<float16> C_float16(m * n); - - aligned_vector<float> C_ref(m * n, NAN); - aligned_vector<float> C_fb(C_ref); - - FloatToFloat16_ref(A_ref.data(), A_float16.data(), m * k); - FloatToFloat16_ref(B_ref.data(), B_float16.data(), k * n); - - if (beta != 0.0f) { - randFill(C_ref, 0.0f, 4.0f); - FloatToFloat16_ref(C_ref.data(), C_float16.data(), m * n); - C_fb = C_ref; - } - - // Gold via reference sgemm -#if defined(USE_MKL) || defined(USE_BLAS) - cblas_sgemm( - CblasRowMajor, - atrans == matrix_op_t::NoTranspose ? CblasNoTrans : CblasTrans, - btrans == matrix_op_t::NoTranspose ? 
CblasNoTrans : CblasTrans, - m, - n, - k, - alpha, - A_ref.data(), - atrans == matrix_op_t::NoTranspose ? k : m, - B_ref.data(), - btrans == matrix_op_t::NoTranspose ? n : k, - beta, - C_ref.data(), - n); -#else - cblas_sgemm_ref( - atrans, - btrans, - m, - n, - k, - alpha, - A_ref.data(), - atrans == matrix_op_t::NoTranspose ? k : m, - B_ref.data(), - btrans == matrix_op_t::NoTranspose ? n : k, - beta, - C_ref.data(), - n); -#endif - - // fbgemm float16 - fbgemmFakeFP16( - atrans, - btrans, - m, - n, - k, - A_float16.data(), - B_float16.data(), - beta, - C_float16.data()); - Float16ToFloat_ref(C_float16.data(), C_fb.data(), m * n); - - // correctness check - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - float expected = C_ref[i * n + j]; - float actual = C_fb[i * n + j]; - if (actual != 0) { - EXPECT_LE(fabs(expected - actual) / actual, k * 1.0 / 1024 * 4) - << "GEMM results differ at (" << i << ", " << j << "). ref " - << expected << " FBGemm " << actual; - } - } - } - } -} |