diff options
author | Jongsoo Park <jongsoo@fb.com> | 2020-04-03 20:08:17 +0300 |
---|---|---|
committer | Jongsoo Park <jongsoo@fb.com> | 2020-04-03 22:15:00 +0300 |
commit | 1bda6aec6d3f85071fb77a2786848a5b5ed11ca5 (patch) | |
tree | e7cff01ea9d87850aefb10b6910183f287f2eef3 | |
parent | 1798e5643532e7006250dcbfcfb38627477f3d24 (diff) |
add dummy compute to cache evict func (#337)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/337
https://github.com/pytorch/FBGEMM/issues/333
Reviewed By: jianyuh
Differential Revision: D20799659
fbshipit-source-id: 6b38301b39423806cb4a5087c0925116c336854f
-rw-r--r-- | bench/BenchUtils.h | 14 |
1 files changed, 13 insertions, 1 deletions
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 2b63068..64dafd3 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -15,6 +15,7 @@
 #include <omp.h>
 #endif
 #include "./AlignedVec.h"
+#include "fbgemm/FbgemmBuild.h"

 namespace fbgemm {

@@ -31,16 +32,27 @@ int fbgemm_get_num_threads();
 int fbgemm_get_thread_num();

 template <typename T>
-void cache_evict(const T& vec) {
+NOINLINE
+float cache_evict(const T& vec) {
   auto const size = vec.size();
   auto const elemSize = sizeof(typename T::value_type);
   auto const dataSize = size * elemSize;
   const char* data = reinterpret_cast<const char*>(vec.data());
   constexpr int CACHE_LINE_SIZE = 64;

+  // Not having this dummy computation significantly slows down the computation
+  // that follows.
+  float dummy = 0.0f;
   for (std::size_t i = 0; i < dataSize; i += CACHE_LINE_SIZE) {
+    dummy += data[i] * 1.0f;
+    _mm_mfence();
+#ifndef _MSC_VER
+    asm volatile("" ::: "memory");
+#endif
     _mm_clflush(&data[i]);
   }
+
+  return dummy;
 }

 /**