Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <hieuhoang@gmail.com> 2017-12-01 19:22:32 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2017-12-01 19:22:32 +0300
commit: 8adcc2f7db33453b6a44d95296bcaf7d445b58b3 (patch)
tree: 932b15d4b229ee1cfda8bad63541341acd3cdd33
parent: eaa22f49b79d8ed8dfd299822758430218628ad2 (diff)
parallelize CopyNthOutBatch()
-rw-r--r-- src/amun/half/mblas/matrix_functions.cu 27
-rw-r--r-- src/amun/half/mblas/matrix_functions.h 1
2 files changed, 16 insertions, 12 deletions
diff --git a/src/amun/half/mblas/matrix_functions.cu b/src/amun/half/mblas/matrix_functions.cu
index 7cdd2ba2..bc277e69 100644
--- a/src/amun/half/mblas/matrix_functions.cu
+++ b/src/amun/half/mblas/matrix_functions.cu
@@ -1461,9 +1461,11 @@ void gCopyNthOutBatch(const VectorWrapper<NthOutBatch> nBest,
VectorWrapper<uint> outKeys,
VectorWrapper<float> outValues)
{
- for (uint i = 0; i < nBest.size(); ++i) {
- outKeys[i] = nBest[i].ind;
- outValues[i] = __half2float(nBest[i].score);
+ int id = threadIdx.x + blockIdx.x * blockDim.x;
+
+ if (id < nBest.size()) {
+ outKeys[id] = nBest[id].ind;
+ outValues[id] = __half2float(nBest[id].score);
}
}
@@ -1471,18 +1473,21 @@ void CopyNthOutBatch(const mblas::Vector<NthOutBatch> &nBest,
std::vector<uint>& outKeys,
std::vector<float>& outValues)
{
- cerr << "CopyNthOutBatch=" << nBest.size() << endl;
+ uint size = nBest.size();
//cerr << "top=" << top2.size() << " nBest=" << nBest.size() << endl;
- outKeys.resize(nBest.size());
- outValues.resize(nBest.size());
+ outKeys.resize(size);
+ outValues.resize(size);
+
+ Vector<uint> d_keys(size);
+ Vector<float> d_values(size);
- Vector<uint> d_keys(nBest.size());
- Vector<float> d_values(nBest.size());
+ uint threads = std::min((uint)MAX_THREADS, size);
+ uint blocks = (size / threads) + ((size % threads == 0) ? 0 : 1);
- gCopyNthOutBatch<<<1,1, 0, CudaStreamHandler::GetStream()>>>(nBest, d_keys, d_values);
+ gCopyNthOutBatch<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(nBest, d_keys, d_values);
- copy(d_keys.data(), nBest.size(), outKeys.data(), cudaMemcpyDeviceToHost);
- copy(d_values.data(), nBest.size(), outValues.data(), cudaMemcpyDeviceToHost);
+ copy(d_keys.data(), size, outKeys.data(), cudaMemcpyDeviceToHost);
+ copy(d_values.data(), size, outValues.data(), cudaMemcpyDeviceToHost);
}
} // namespace mblas
diff --git a/src/amun/half/mblas/matrix_functions.h b/src/amun/half/mblas/matrix_functions.h
index 16b20464..586ee8a3 100644
--- a/src/amun/half/mblas/matrix_functions.h
+++ b/src/amun/half/mblas/matrix_functions.h
@@ -108,7 +108,6 @@ void Copy(const T1 *in, uint size, T2 *out, cudaMemcpyKind kind)
uint threads = std::min((uint)MAX_THREADS, size);
uint blocks = (size / threads) + ((size % threads == 0) ? 0 : 1);
- std::cerr << "Copy1=" << size << std::endl;
if (kind == cudaMemcpyDeviceToHost) {
const VectorWrapper<T1> inWrap(in, size);