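Parallelize CopyNthOutBatch(): launch gCopyNthOutBatch with one thread per element instead of a serial loop in a <<<1,1>>> launch, and remove debug cerr output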

author: Hieu Hoang <hieuhoang@gmail.com> 2017-12-01 19:22:32 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2017-12-01 19:22:32 +0300
commit: 8adcc2f7db33453b6a44d95296bcaf7d445b58b3 (patch)
tree: 932b15d4b229ee1cfda8bad63541341acd3cdd33
parent: eaa22f49b79d8ed8dfd299822758430218628ad2 (diff)
2 files changed, 16 insertions, 12 deletions
diff --git a/src/amun/half/mblas/matrix_functions.cu b/src/amun/half/mblas/matrix_functions.cu
index 7cdd2ba2..bc277e69 100644
--- a/src/amun/half/mblas/matrix_functions.cu
+++ b/src/amun/half/mblas/matrix_functions.cu
@@ -1461,9 +1461,11 @@ void gCopyNthOutBatch(const VectorWrapper<NthOutBatch> nBest,
                       VectorWrapper<uint> outKeys,
                       VectorWrapper<float> outValues)
 {
-  for (uint i = 0; i < nBest.size(); ++i) {
-    outKeys[i] = nBest[i].ind;
-    outValues[i] = __half2float(nBest[i].score);
+  int id = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (id < nBest.size()) {
+    outKeys[id] = nBest[id].ind;
+    outValues[id] = __half2float(nBest[id].score);
   }
 }
 
@@ -1471,18 +1473,21 @@ void CopyNthOutBatch(const mblas::Vector<NthOutBatch> &nBest,
               std::vector<uint>& outKeys,
               std::vector<float>& outValues)
 {
-  cerr << "CopyNthOutBatch=" << nBest.size() << endl;
+  uint size = nBest.size();
   //cerr << "top=" << top2.size() << " nBest=" << nBest.size() << endl;
-  outKeys.resize(nBest.size());
-  outValues.resize(nBest.size());
+  outKeys.resize(size);
+  outValues.resize(size);
+
+  Vector<uint> d_keys(size);
+  Vector<float> d_values(size);
 
-  Vector<uint> d_keys(nBest.size());
-  Vector<float> d_values(nBest.size());
+  uint threads = std::min((uint)MAX_THREADS, size);
+  uint blocks =  (size / threads) + ((size % threads == 0) ?  0 : 1);
 
-  gCopyNthOutBatch<<<1,1, 0, CudaStreamHandler::GetStream()>>>(nBest, d_keys, d_values);
+  gCopyNthOutBatch<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(nBest, d_keys, d_values);
 
-  copy(d_keys.data(), nBest.size(), outKeys.data(), cudaMemcpyDeviceToHost);
-  copy(d_values.data(), nBest.size(), outValues.data(), cudaMemcpyDeviceToHost);
+  copy(d_keys.data(), size, outKeys.data(), cudaMemcpyDeviceToHost);
+  copy(d_values.data(), size, outValues.data(), cudaMemcpyDeviceToHost);
 }
 
 }  // namespace mblas
diff --git a/src/amun/half/mblas/matrix_functions.h b/src/amun/half/mblas/matrix_functions.h
index 16b20464..586ee8a3 100644
--- a/src/amun/half/mblas/matrix_functions.h
+++ b/src/amun/half/mblas/matrix_functions.h
@@ -108,7 +108,6 @@ void Copy(const T1 *in, uint size, T2 *out,  cudaMemcpyKind kind)
   uint threads = std::min((uint)MAX_THREADS, size);
   uint blocks =  (size / threads) + ((size % threads == 0) ?  0 : 1);
 
-  std::cerr << "Copy1=" << size << std::endl;
   if (kind == cudaMemcpyDeviceToHost) {
     const VectorWrapper<T1> inWrap(in, size);
author	Hieu Hoang <hieuhoang@gmail.com>	2017-12-01 19:22:32 +0300
committer	Hieu Hoang <hieuhoang@gmail.com>	2017-12-01 19:22:32 +0300
commit	8adcc2f7db33453b6a44d95296bcaf7d445b58b3 (patch)
tree	932b15d4b229ee1cfda8bad63541341acd3cdd33
parent	eaa22f49b79d8ed8dfd299822758430218628ad2 (diff)