diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2017-12-01 19:22:32 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2017-12-01 19:22:32 +0300 |
commit | 8adcc2f7db33453b6a44d95296bcaf7d445b58b3 (patch) | |
tree | 932b15d4b229ee1cfda8bad63541341acd3cdd33 | |
parent | eaa22f49b79d8ed8dfd299822758430218628ad2 (diff) |
parallelize CopyNthOutBatch()
-rw-r--r-- | src/amun/half/mblas/matrix_functions.cu | 27 | ||||
-rw-r--r-- | src/amun/half/mblas/matrix_functions.h | 1 |
2 files changed, 16 insertions, 12 deletions
```diff
diff --git a/src/amun/half/mblas/matrix_functions.cu b/src/amun/half/mblas/matrix_functions.cu
index 7cdd2ba2..bc277e69 100644
--- a/src/amun/half/mblas/matrix_functions.cu
+++ b/src/amun/half/mblas/matrix_functions.cu
@@ -1461,9 +1461,11 @@ void gCopyNthOutBatch(const VectorWrapper<NthOutBatch> nBest,
     VectorWrapper<uint> outKeys,
     VectorWrapper<float> outValues)
 {
-  for (uint i = 0; i < nBest.size(); ++i) {
-    outKeys[i] = nBest[i].ind;
-    outValues[i] = __half2float(nBest[i].score);
+  int id = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (id < nBest.size()) {
+    outKeys[id] = nBest[id].ind;
+    outValues[id] = __half2float(nBest[id].score);
   }
 }
 
@@ -1471,18 +1473,21 @@ void CopyNthOutBatch(const mblas::Vector<NthOutBatch> &nBest,
     std::vector<uint>& outKeys,
     std::vector<float>& outValues)
 {
-  cerr << "CopyNthOutBatch=" << nBest.size() << endl;
+  uint size = nBest.size();
   //cerr << "top=" << top2.size() << " nBest=" << nBest.size() << endl;
-  outKeys.resize(nBest.size());
-  outValues.resize(nBest.size());
+  outKeys.resize(size);
+  outValues.resize(size);
+
+  Vector<uint> d_keys(size);
+  Vector<float> d_values(size);
 
-  Vector<uint> d_keys(nBest.size());
-  Vector<float> d_values(nBest.size());
+  uint threads = std::min((uint)MAX_THREADS, size);
+  uint blocks = (size / threads) + ((size % threads == 0) ? 0 : 1);
 
-  gCopyNthOutBatch<<<1,1, 0, CudaStreamHandler::GetStream()>>>(nBest, d_keys, d_values);
+  gCopyNthOutBatch<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(nBest, d_keys, d_values);
 
-  copy(d_keys.data(), nBest.size(), outKeys.data(), cudaMemcpyDeviceToHost);
-  copy(d_values.data(), nBest.size(), outValues.data(), cudaMemcpyDeviceToHost);
+  copy(d_keys.data(), size, outKeys.data(), cudaMemcpyDeviceToHost);
+  copy(d_values.data(), size, outValues.data(), cudaMemcpyDeviceToHost);
 }
 
 } // namespace mblas
diff --git a/src/amun/half/mblas/matrix_functions.h b/src/amun/half/mblas/matrix_functions.h
index 16b20464..586ee8a3 100644
--- a/src/amun/half/mblas/matrix_functions.h
+++ b/src/amun/half/mblas/matrix_functions.h
@@ -108,7 +108,6 @@ void Copy(const T1 *in, uint size, T2 *out, cudaMemcpyKind kind)
 
   uint threads = std::min((uint)MAX_THREADS, size);
   uint blocks = (size / threads) + ((size % threads == 0) ? 0 : 1);
-  std::cerr << "Copy1=" << size << std::endl;
 
   if (kind == cudaMemcpyDeviceToHost) {
     const VectorWrapper<T1> inWrap(in, size);
```