Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJianyu Huang <jianyuhuang@fb.com>2020-04-10 08:55:34 +0300
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>2020-04-10 08:59:35 +0300
commit20b6cf14e2b547987234e39d922bc22ec541c3b4 (patch)
tree5459828c84d0d8dffdfe5a5b69f86557824abd4b
parentf1d333089f9442e48af73abe73148516a3bab110 (diff)
Move FakeFP16 back to internal to remove dependency on MKL (#36297)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/36297 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/343 We moved FakeFP16 back to closed source and kept the `RoundToFloat16` function in "fbgemm/FbgemmConvert.h". This is because FakeFP16 introduced a dependency on MKL in the FBGEMM core. Also, it doesn't seem to be needed for open source, as it is not used anywhere. Reviewed By: jspark1105 Differential Revision: D20937962 fbshipit-source-id: 9487a9fd2282b6df2f754c22bea36f2255a5c791
-rw-r--r--CMakeLists.txt4
-rw-r--r--bench/BenchUtils.cc6
-rw-r--r--bench/FakeFP16Benchmark.cc315
-rw-r--r--defs.bzl8
-rw-r--r--include/fbgemm/FbgemmConvert.h11
-rw-r--r--include/fbgemm/FbgemmFakeFP16.h59
-rw-r--r--src/FbgemmFakeFP16.cc214
-rw-r--r--src/FbgemmFloat16Convert.cc34
-rw-r--r--test/CMakeLists.txt7
-rw-r--r--test/FakeFP16Test.cc169
10 files changed, 54 insertions, 773 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 01185a1..ce27196 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,10 +68,10 @@ else()
endif()
# Define file lists
-get_filelist("get_fbgemm_generic_srcs(with_fp16=True)" FBGEMM_GENERIC_SRCS)
+get_filelist("get_fbgemm_generic_srcs()" FBGEMM_GENERIC_SRCS)
get_filelist("get_fbgemm_avx2_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX2_SRCS)
get_filelist("get_fbgemm_avx512_srcs(msvc=${MSVC_BOOL})" FBGEMM_AVX512_SRCS)
-get_filelist("get_fbgemm_public_headers(with_fp16=True)" FBGEMM_PUBLIC_HEADERS)
+get_filelist("get_fbgemm_public_headers()" FBGEMM_PUBLIC_HEADERS)
add_library(fbgemm_generic OBJECT ${FBGEMM_GENERIC_SRCS})
add_library(fbgemm_avx2 OBJECT ${FBGEMM_AVX2_SRCS})
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index 1b99096..1c16860 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -6,8 +6,8 @@
*/
#include "./BenchUtils.h"
-#include <cstring>
#include <algorithm>
+#include <cstring>
#include <random>
#include <type_traits>
@@ -43,8 +43,8 @@ randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high);
template void
randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high);
template void randFill<int>(aligned_vector<int>& vec, int low, int high);
-//template void
-//randFill<int64_t>(aligned_vector<int64_t>& vec, int64_t low, int64_t high);
+// template void
+// randFill<int64_t>(aligned_vector<int64_t>& vec, int64_t low, int64_t high);
template <>
void randFill(aligned_vector<int64_t>& vec, int64_t low, int64_t high) {
std::uniform_int_distribution<int64_t> dis(low, high);
diff --git a/bench/FakeFP16Benchmark.cc b/bench/FakeFP16Benchmark.cc
deleted file mode 100644
index 079d87d..0000000
--- a/bench/FakeFP16Benchmark.cc
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <chrono>
-#include <cmath>
-#include <random>
-
-#ifdef USE_MKL
-#include <mkl.h>
-#endif
-
-#ifdef USE_BLAS
-#if __APPLE__
-// not sure whether need to differentiate TARGET_OS_MAC or TARGET_OS_IPHONE,
-// etc.
-#include <Accelerate/Accelerate.h>
-#else
-#include <cblas.h>
-#endif
-#endif
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "bench/AlignedVec.h"
-#include "bench/BenchUtils.h"
-#include "fbgemm/FbgemmFakeFP16.h"
-
-#include <iomanip>
-#include <iostream>
-
-using namespace std;
-using namespace fbgemm;
-
-void performance_test() {
- // cache flush
- bool flush = true;
- std::vector<char> llc;
- if (flush) {
- llc.resize(64L * 1024L * 1024L, 1.0);
- }
-
- float alpha = 1.f, beta = 1.f;
- matrix_op_t atrans = matrix_op_t::NoTranspose;
- matrix_op_t btrans = matrix_op_t::Transpose;
-
-#define dataset 1
-
-#if dataset == 1
- const int NITER = (flush) ? 10 : 100;
- std::vector<std::vector<int>> shapes;
- for (auto m = 1; m < 120; m++) {
- // shapes.push_back({m, 128, 512});
- // shapes.push_back({m, 512, 512});
- shapes.push_back({m, 1024, 1024});
- }
-#else
- flush = false;
- constexpr int NITER = 1;
- std::vector<std::vector<int>> shapes;
- std::random_device r;
- std::default_random_engine generator(r());
- std::uniform_int_distribution<int> dm(1, 100);
- std::uniform_int_distribution<int> dnk(1, 1024);
- for (int i = 0; i < 1000; i++) {
- int m = dm(generator);
- int n = dnk(generator);
- int k = dnk(generator);
- shapes.push_back({m, n, k});
- }
-#endif
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- cout << "WARNING: the timer may be inaccurate when used by multiple threads."
- << endl;
- cout << "M, "
- << "N, "
- << "K, "
- << "Malloc (ms), "
- << "A fp16->fp32 (ms), "
- << "B fp16->fp32 (ms), "
- << "C fp16->fp32 (ms), "
- << "GEMM fp32 (ms), "
- << "C fp32->fp16 (ms), "
- << "Total (ms), "
- << "GOPS, "
- << "GB/s" << endl;
-#else
- cout << setw(7) << "M, " << setw(7) << "N, " << setw(7) << "K, " << setw(32)
- << "Type, " << setw(18) << "GOPS, " << setw(5) << "GB/s" << endl;
-#endif
-
- std::string type;
- double gflops, gbs, ttot;
- for (auto s : shapes) {
- int m = s[0];
- int n = s[1];
- int k = s[2];
-
- // initialize with small numbers
- aligned_vector<float> A_ref(m * k);
- aligned_vector<float> B_ref(k * n);
- randFill(A_ref, 0.0f, 1.0f);
- randFill(B_ref, 0.0f, 1.0f);
-
- aligned_vector<float16> A_float16(m * k);
- aligned_vector<float16> B_float16(k * n);
- aligned_vector<float16> C_float16(m * n);
-
- FloatToFloat16_ref(A_ref.data(), A_float16.data(), m * k);
- FloatToFloat16_ref(B_ref.data(), B_float16.data(), k * n);
-
- aligned_vector<float> C_ref(m * n, NAN);
- aligned_vector<float> C_fb(C_ref);
-
- if (beta != 0.0f) {
- randFill(C_ref, 0.0f, 1.0f);
- FloatToFloat16_ref(C_ref.data(), C_float16.data(), m * n);
- C_fb = C_ref;
- }
-
- double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER;
- double nbytes = (4.0 * (double)m * (double)k + 2.0 * (double)k * (double)n +
- 4.0 * (double)m * (double)n) *
- NITER;
-
- // warm up MKL and fbgemm
- // check correctness at the same time
- for (auto w = 0; w < 3; w++) {
-#if defined(USE_MKL) || defined(USE_BLAS)
- cblas_sgemm(
- CblasRowMajor,
- atrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans,
- btrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans,
- m,
- n,
- k,
- alpha,
- A_ref.data(),
- k,
- B_ref.data(),
- (btrans == matrix_op_t::NoTranspose) ? n : k,
- beta,
- C_ref.data(),
- n);
-#endif
-
- fbgemmFakeFP16(
- atrans,
- btrans,
- m,
- n,
- k,
- A_float16.data(),
- B_float16.data(),
- beta,
- C_float16.data());
- Float16ToFloat_ref(C_float16.data(), C_fb.data(), m * n);
-
-#if defined(USE_MKL) || defined(USE_BLAS)
- // Compare results
- for (auto i = 0; i < C_ref.size(); i++) {
- if (std::fabs(C_ref[i] - C_fb[i]) / C_ref[i] > k * 1.0 / 128) {
- fprintf(
- stderr,
- "Error: too high diff between fp32 ref %f and float16 %f\n",
- C_ref[i],
- C_fb[i]);
- return;
- }
- }
-#endif
- }
-
- chrono::time_point<chrono::system_clock> t_begin, t_end;
-#if defined(USE_MKL) || defined(USE_BLAS)
- // Gold via MKL sgemm
-#if defined(USE_MKL)
- type = "MKL_FP32";
-#else
- type = "BLAS_FP32";
-#endif
- ttot = 0;
- for (auto it = -3; it < NITER; it++) {
- if (flush) {
- for (auto i = 0; i < llc.size(); i++) {
- llc[i]++;
- }
- }
- t_begin = chrono::system_clock::now();
- cblas_sgemm(
- CblasRowMajor,
- atrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans,
- btrans == matrix_op_t::Transpose ? CblasTrans : CblasNoTrans,
- m,
- n,
- k,
- alpha,
- A_ref.data(),
- k,
- B_ref.data(),
- (btrans == matrix_op_t::NoTranspose) ? n : k,
- beta,
- C_ref.data(),
- n);
- t_end = chrono::system_clock::now();
- if (it >= 0) {
- double dt = chrono::duration<double>(t_end - t_begin).count();
- ttot += dt;
- }
- }
- gflops = nflops / ttot / 1e9;
- gbs = nbytes / ttot / 1e9;
-
- cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k << ", "
- << type.c_str() << ", " << setw(5) << fixed << setw(5)
- << setprecision(3) << gflops << ", " << setprecision(3) << gbs << endl;
- ((volatile char*)(llc.data()));
-#endif
-
- type = "FBGEMM_Float16";
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- double total_malloc_time = 0.0;
- double total_A_fp16_to_fp32_time = 0.0;
- double total_B_fp16_to_fp32_time = 0.0;
- double total_C_fp16_to_fp32_time = 0.0;
- double total_computing_time = 0.0;
- double total_C_fp32_to_fp16_time = 0.0;
- double total_run_time = 0.0;
-#endif
- cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k << ", "
- << type.c_str() << ", ";
-
- ttot = 0;
- for (auto it = -3; it < NITER; it++) {
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- malloc_time = 0.0;
- A_fp16_to_fp32_time = 0.0;
- B_fp16_to_fp32_time = 0.0;
- C_fp16_to_fp32_time = 0.0;
- computing_time = 0.0;
- C_fp32_to_fp16_time = 0.0;
- run_time = 0.0;
-#endif
-
- if (flush) {
- for (auto i = 0; i < llc.size(); i++) {
- llc[i]++;
- }
- }
-
- t_begin = chrono::system_clock::now();
- fbgemmFakeFP16(
- atrans,
- btrans,
- m,
- n,
- k,
- A_float16.data(),
- B_float16.data(),
- beta,
- C_float16.data());
- t_end = chrono::system_clock::now();
- Float16ToFloat_ref(C_float16.data(), C_fb.data(), m * n);
- if (it >= 0) {
- double dt = chrono::duration<double>(t_end - t_begin).count();
- ttot += dt;
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- total_malloc_time += malloc_time;
- total_A_fp16_to_fp32_time += A_fp16_to_fp32_time;
- total_B_fp16_to_fp32_time += B_fp16_to_fp32_time;
- total_C_fp16_to_fp32_time += C_fp16_to_fp32_time;
- total_computing_time += computing_time;
- total_C_fp32_to_fp16_time += C_fp32_to_fp16_time;
- total_run_time += run_time;
-#endif
- }
- }
- gflops = nflops / ttot / 1e9;
- gbs = nbytes / ttot / 1e9;
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- cout << fixed << total_malloc_time / (double)NITER / 1e6 << ", "
- << total_A_fp16_to_fp32_time / (double)NITER / 1e6 << ", "
- << total_B_fp16_to_fp32_time / (double)NITER / 1e6 << ", "
- << total_C_fp16_to_fp32_time / (double)NITER / 1e6 << ", "
- << total_computing_time / (double)NITER / 1e6 << ", "
- << total_C_fp32_to_fp16_time / (double)NITER / 1e6 << ", ";
-#endif
-
- cout << setw(5) << fixed << setw(5) << setprecision(3)
- << ttot / NITER * 1000 << ", " << setprecision(3) << gflops << ", "
- << setprecision(3) << gbs << endl;
- cout << endl;
-
- ((volatile char*)(llc.data()));
- }
-}
-
-int main(int /*argc*/, char** /*argv*/) {
-#ifdef _OPENMP
- // Use 1 thread unless OMP_NUM_THREADS is explicit set.
- const char* val = getenv("OMP_NUM_THREADS");
- if (val == nullptr || !*val) {
- omp_set_num_threads(1);
- }
-#endif
- performance_test();
-}
diff --git a/defs.bzl b/defs.bzl
index b56df04..f46b118 100644
--- a/defs.bzl
+++ b/defs.bzl
@@ -1,4 +1,4 @@
-def get_fbgemm_generic_srcs(with_fp16 = False):
+def get_fbgemm_generic_srcs():
return [
"src/EmbeddingSpMDM.cc",
"src/EmbeddingSpMDMNBit.cc",
@@ -32,9 +32,9 @@ def get_fbgemm_generic_srcs(with_fp16 = False):
"src/RowWiseSparseAdagradFused.cc",
"src/SparseAdagrad.cc",
"src/Utils.cc",
- ] + (["src/FbgemmFakeFP16.cc"] if with_fp16 else [])
+ ]
-def get_fbgemm_public_headers(with_fp16 = False):
+def get_fbgemm_public_headers():
return [
"include/fbgemm/Fbgemm.h",
"include/fbgemm/FbgemmBuild.h",
@@ -52,7 +52,7 @@ def get_fbgemm_public_headers(with_fp16 = False):
"include/fbgemm/ConvUtils.h",
"include/fbgemm/Types.h",
"include/fbgemm/FbgemmI8Spmdm.h",
- ] + (["include/fbgemm/FbgemmFakeFP16.h"] if with_fp16 else [])
+ ]
def get_fbgemm_avx2_srcs(msvc = False):
return [
diff --git a/include/fbgemm/FbgemmConvert.h b/include/fbgemm/FbgemmConvert.h
index 2df5004..76eda02 100644
--- a/include/fbgemm/FbgemmConvert.h
+++ b/include/fbgemm/FbgemmConvert.h
@@ -136,4 +136,15 @@ FBGEMM_API void Float16ToFloat_avx2(const float16* src, float* dst, int size);
*/
FBGEMM_API void Float16ToFloat_avx512(const float16* src, float* dst, int size);
+/**
+ * @brief Transform all entries in a matrix from fp32 to float16 and back to
+ * fp32.
+ */
+FBGEMM_API void RoundToFloat16(
+ const float* input,
+ float* output,
+ int len,
+ bool clamp = false,
+ bool clamp_denorms = false);
+
}; // namespace fbgemm
diff --git a/include/fbgemm/FbgemmFakeFP16.h b/include/fbgemm/FbgemmFakeFP16.h
deleted file mode 100644
index 3a9dc07..0000000
--- a/include/fbgemm/FbgemmFakeFP16.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include "fbgemm/FbgemmConvert.h"
-#include "fbgemm/Types.h"
-#include "fbgemm/Utils.h"
-
-// Turning on this option will print out time breakdown of each stage (e.g.,
-// pre-processing fp32 -> fp16 conversion, the main GEMM kernel, post-processing
-// fp16 -> fp32 conversion). Please note that currently this option won't report
-// accurate timing if multiple threads are used. #define
-// FBGEMM_MEASURE_TIME_BREAKDOWN
-
-// #define FBGEMM_MEASURE_TIME_BREAKDOWN
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-#include <chrono>
-#include <iostream>
-extern double malloc_time;
-extern double A_fp16_to_fp32_time;
-extern double B_fp16_to_fp32_time;
-extern double C_fp16_to_fp32_time;
-extern double computing_time;
-extern double C_fp32_to_fp16_time;
-extern double run_time;
-#endif
-
-namespace fbgemm {
-
-// typedef uint16_t float16;
-
-/**
- * @ Transform all entries in a matrix from fp32 to float16 and back to fp32
- *
- */
-FBGEMM_API void RoundToFloat16(
- const float* input,
- float* output,
- int len,
- bool clamp = false,
- bool clamp_denorms = false);
-
-FBGEMM_API void fbgemmFakeFP16(
- const matrix_op_t transa,
- const matrix_op_t transb,
- int m,
- int n,
- int k,
- const float16* A_float16,
- const float16* B_float16,
- float beta,
- float16* C_float16);
-
-}; // namespace fbgemm
diff --git a/src/FbgemmFakeFP16.cc b/src/FbgemmFakeFP16.cc
deleted file mode 100644
index 161653c..0000000
--- a/src/FbgemmFakeFP16.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include "fbgemm/FbgemmFakeFP16.h"
-#include "fbgemm/FbgemmConvert.h"
-
-#include "src/RefImplementations.h"
-
-#ifdef USE_MKL
-#include <mkl.h>
-#endif
-
-#ifdef USE_BLAS
-#include <cblas.h>
-#endif
-
-#include <cpuinfo.h>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-double malloc_time = 0.0;
-double A_fp16_to_fp32_time = 0.0;
-double B_fp16_to_fp32_time = 0.0;
-double C_fp16_to_fp32_time = 0.0;
-double computing_time = 0.0;
-double C_fp32_to_fp16_time = 0.0;
-double run_time = 0.0;
-#endif
-
-using namespace std;
-
-namespace fbgemm {
-
-void RoundToFloat16(
- const float* input,
- float* output,
- int size,
- bool clamp,
- bool clamp_denorms) {
- std::vector<fbgemm::float16> data_fp16(size);
- fbgemm::FloatToFloat16_simd(input, &(data_fp16[0]), size);
- fbgemm::Float16ToFloat_simd(&(data_fp16[0]), output, size);
-
- if (clamp) {
- // TODO: Use intrinsics to optimize clamping performance.
- for (int i = 0; i < size; ++i) {
- output[i] = std::max(std::min(output[i], 65504.0f), -65504.0f);
- }
- }
-
- if (clamp_denorms) {
- union epsilon_t {
- float f;
- uint32_t i;
- };
-
- union epsilon_t epsilon;
- epsilon.i = 0x38800000u; // 1 / 16384
-
- for (int i = 0; i < size; ++i) {
- if (std::abs(output[i]) < epsilon.f) {
- output[i] = 0.0;
- }
- }
- }
-}
-
-void fbgemmFakeFP16(
- const matrix_op_t transa,
- const matrix_op_t transb,
- int m,
- int n,
- int k,
- const float16* A_float16,
- const float16* B_float16,
- float beta,
- float16* C_float16) {
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start,
- t_start, t_end;
- double dt;
- t_start = std::chrono::high_resolution_clock::now();
- t_very_start = std::chrono::high_resolution_clock::now();
-#endif
-
- // float16 -> fp32
- float* A_fp32 =
- static_cast<float*>(fbgemmAlignedAlloc(64, m * k * sizeof(float)));
- float* B_fp32 =
- static_cast<float*>(fbgemmAlignedAlloc(64, k * n * sizeof(float)));
- float* C_fp32 =
- static_cast<float*>(fbgemmAlignedAlloc(64, m * n * sizeof(float)));
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- malloc_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
-#endif
-
- int lda = (transa == matrix_op_t::NoTranspose ? k : m);
- int ldb = (transb == matrix_op_t::NoTranspose ? n : k);
- int ldc = n;
-
- // Float16ToFloat_ref(A_float16, A_fp32, m * k);
- Float16ToFloat_simd(A_float16, A_fp32, m * k);
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- A_fp16_to_fp32_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
-#endif
-
- // Float16ToFloat_ref(B_float16, B_fp32, k * n);
- Float16ToFloat_simd(B_float16, B_fp32, k * n);
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- B_fp16_to_fp32_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
-#endif
-
- if (beta != 0.0f) {
- // Float16ToFloat_ref(C_float16, C_fp32, m * n);
- Float16ToFloat_simd(C_float16, C_fp32, m * n);
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- C_fp16_to_fp32_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
-#endif
- }
-
- // fp32 GEMM
- float alpha = 1.f;
-#if defined(USE_MKL) || defined(USE_BLAS)
- cblas_sgemm(
- CblasRowMajor,
- transa == matrix_op_t::NoTranspose ? CblasNoTrans : CblasTrans,
- transb == matrix_op_t::NoTranspose ? CblasNoTrans : CblasTrans,
- m,
- n,
- k,
- alpha,
- A_fp32,
- lda,
- B_fp32,
- ldb,
- beta,
- C_fp32,
- ldc);
-#else
- cblas_sgemm_ref(
- transa,
- transb,
- m,
- n,
- k,
- alpha,
- A_fp32,
- lda,
- B_fp32,
- ldb,
- beta,
- C_fp32,
- ldc);
-#endif
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- computing_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
-#endif
-
- // fp32 -> float16
- // FloatToFloat16_ref(C_fp32, C_float16, m * n);
- FloatToFloat16_simd(C_fp32, C_float16, m * n);
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- C_fp32_to_fp16_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
-#endif
-
- free(A_fp32);
- free(B_fp32);
- free(C_fp32);
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt =
- std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_very_start)
- .count();
- run_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
-#endif
-}
-} // namespace fbgemm
diff --git a/src/FbgemmFloat16Convert.cc b/src/FbgemmFloat16Convert.cc
index 0373e62..79e40a4 100644
--- a/src/FbgemmFloat16Convert.cc
+++ b/src/FbgemmFloat16Convert.cc
@@ -92,4 +92,38 @@ void Float16ToFloat_simd(const float16* src, float* dst, int size) {
}
}
+void RoundToFloat16(
+ const float* input,
+ float* output,
+ int size,
+ bool clamp,
+ bool clamp_denorms) {
+ std::vector<fbgemm::float16> data_fp16(size);
+ FloatToFloat16_simd(input, &(data_fp16[0]), size);
+ Float16ToFloat_simd(&(data_fp16[0]), output, size);
+
+ if (clamp) {
+ // TODO: Use intrinsics to optimize clamping performance.
+ for (int i = 0; i < size; ++i) {
+ output[i] = std::max(std::min(output[i], 65504.0f), -65504.0f);
+ }
+ }
+
+ if (clamp_denorms) {
+ union epsilon_t {
+ float f;
+ uint32_t i;
+ };
+
+ union epsilon_t epsilon;
+ epsilon.i = 0x38800000u; // 1 / 16384
+
+ for (int i = 0; i < size; ++i) {
+ if (std::abs(output[i]) < epsilon.f) {
+ output[i] = 0.0;
+ }
+ }
+ }
+}
+
} // namespace fbgemm
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b8ee9fe..336b0e8 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -70,13 +70,6 @@ endmacro()
file(GLOB TEST_LIST "*Test.cc")
-# Temporary exclude Float16Test.cc on windows platform until
-# we figure out how to repro and debug the issue
-if(MSVC)
- file(GLOB BLACKLIST_MSVC "*FakeFP16Test.cc")
- list(REMOVE_ITEM TEST_LIST "${BLACKLIST_MSVC}")
-endif()
-
foreach(TEST_FILE ${TEST_LIST})
get_filename_component(TEST_NAME "${TEST_FILE}" NAME_WE)
get_filename_component(TEST_FILE_ONLY "${TEST_FILE}" NAME)
diff --git a/test/FakeFP16Test.cc b/test/FakeFP16Test.cc
deleted file mode 100644
index bccb327..0000000
--- a/test/FakeFP16Test.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <gtest/gtest.h>
-#include <cmath>
-#include <random>
-
-#ifdef USE_MKL
-#include <mkl.h>
-#endif
-
-#ifdef USE_BLAS
-#include <cblas.h>
-#endif
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "bench/BenchUtils.h"
-#include "fbgemm/FbgemmFakeFP16.h"
-#include "src/RefImplementations.h"
-
-using namespace std;
-using namespace fbgemm;
-
-namespace {
-// The template parameter is transpose of A and B
-class FBGemmFloat16Test
- : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {};
-}; // namespace
-
-INSTANTIATE_TEST_CASE_P(
- InstantiationName,
- FBGemmFloat16Test,
- ::testing::Values(
- pair<matrix_op_t, matrix_op_t>(
- matrix_op_t::NoTranspose,
- matrix_op_t::NoTranspose),
- pair<matrix_op_t, matrix_op_t>(
- matrix_op_t::NoTranspose,
- matrix_op_t::Transpose),
- pair<matrix_op_t, matrix_op_t>(
- matrix_op_t::Transpose,
- matrix_op_t::NoTranspose),
- pair<matrix_op_t, matrix_op_t>(
- matrix_op_t::Transpose,
- matrix_op_t::Transpose)));
-
-TEST_P(FBGemmFloat16Test, Test) {
-#ifdef _OPENMP
- // Use 1 thread unless OMP_NUM_THREADS is explicit set.
- const char* val = getenv("OMP_NUM_THREADS");
- if (val == nullptr || !*val) {
- omp_set_num_threads(1);
- }
-#endif
- vector<vector<int>> shapes;
- random_device r;
- default_random_engine generator(r());
- uniform_int_distribution<int> dm(1, 256);
- uniform_int_distribution<int> dnk(1, 1024);
- for (int i = 0; i < 5; i++) {
- int m = dm(generator);
- int n = dnk(generator);
- int k = dnk(generator);
- shapes.push_back({m, n, k});
- }
-
- float alpha = 1.f, beta = 0.f;
- matrix_op_t atrans, btrans;
- tie(atrans, btrans) = GetParam();
-
- for (auto s : shapes) {
- int m = s[0];
- int n = s[1];
- int k = s[2];
-
- cerr << "m = " << m << " n = " << n << " k = " << k;
- if (atrans == matrix_op_t::Transpose) {
- cerr << " A_transposed";
- }
- if (btrans == matrix_op_t::Transpose) {
- cerr << " B_transposed";
- }
- cerr << endl;
-
- // initialize with small numbers
- aligned_vector<float> A_ref(m * k);
- aligned_vector<float> B_ref(k * n);
- randFill(A_ref, 0.0f, 4.0f);
- randFill(B_ref, 0.0f, 4.0f);
- // randFill(A_ref, -1.0f, 1.0f);
- // randFill(B_ref, -1.0f, 1.0f);
-
- aligned_vector<float16> A_float16(m * k);
- aligned_vector<float16> B_float16(k * n);
- aligned_vector<float16> C_float16(m * n);
-
- aligned_vector<float> C_ref(m * n, NAN);
- aligned_vector<float> C_fb(C_ref);
-
- FloatToFloat16_ref(A_ref.data(), A_float16.data(), m * k);
- FloatToFloat16_ref(B_ref.data(), B_float16.data(), k * n);
-
- if (beta != 0.0f) {
- randFill(C_ref, 0.0f, 4.0f);
- FloatToFloat16_ref(C_ref.data(), C_float16.data(), m * n);
- C_fb = C_ref;
- }
-
- // Gold via reference sgemm
-#if defined(USE_MKL) || defined(USE_BLAS)
- cblas_sgemm(
- CblasRowMajor,
- atrans == matrix_op_t::NoTranspose ? CblasNoTrans : CblasTrans,
- btrans == matrix_op_t::NoTranspose ? CblasNoTrans : CblasTrans,
- m,
- n,
- k,
- alpha,
- A_ref.data(),
- atrans == matrix_op_t::NoTranspose ? k : m,
- B_ref.data(),
- btrans == matrix_op_t::NoTranspose ? n : k,
- beta,
- C_ref.data(),
- n);
-#else
- cblas_sgemm_ref(
- atrans,
- btrans,
- m,
- n,
- k,
- alpha,
- A_ref.data(),
- atrans == matrix_op_t::NoTranspose ? k : m,
- B_ref.data(),
- btrans == matrix_op_t::NoTranspose ? n : k,
- beta,
- C_ref.data(),
- n);
-#endif
-
- // fbgemm float16
- fbgemmFakeFP16(
- atrans,
- btrans,
- m,
- n,
- k,
- A_float16.data(),
- B_float16.data(),
- beta,
- C_float16.data());
- Float16ToFloat_ref(C_float16.data(), C_fb.data(), m * n);
-
- // correctness check
- for (int i = 0; i < m; ++i) {
- for (int j = 0; j < n; ++j) {
- float expected = C_ref[i * n + j];
- float actual = C_fb[i * n + j];
- if (actual != 0) {
- EXPECT_LE(fabs(expected - actual) / actual, k * 1.0 / 1024 * 4)
- << "GEMM results differ at (" << i << ", " << j << "). ref "
- << expected << " FBGemm " << actual;
- }
- }
- }
- }
-}