diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2018-01-16 19:25:36 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2018-01-16 19:25:36 +0300 |
commit | 8003afcdac90fbd508b19b078e33ed793d97c020 (patch) | |
tree | 0008d49b62937bc4442811ed5ca16b9359222b9f | |
parent | ed9ca006fd2dd4427b9b4bb884e9bc50846f6c8b (diff) | |
parent | cf5a42d1e70954a13eb11145bc2b8801ba6dc21d (diff) |
Merge branch 'master' of https://github.com/hieuhoang/marian
-rw-r--r-- | src/amun/gpu/mblas/matrix_functions.cu | 25 |
1 files changed, 13 insertions, 12 deletions
diff --git a/src/amun/gpu/mblas/matrix_functions.cu b/src/amun/gpu/mblas/matrix_functions.cu index 457039f3..75ab0dcd 100644 --- a/src/amun/gpu/mblas/matrix_functions.cu +++ b/src/amun/gpu/mblas/matrix_functions.cu @@ -1330,13 +1330,14 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest, cerr << "beamSizeSum=" << beamSizeSum << endl; cerr << "batchSize=" << batchSize << endl; cerr << "candidateInd=" << candidateInd << endl; - cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 0) << endl; - cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 0) << endl; - cerr << "batch2Hypo=" << Debug(batch2Hypo, 0) << endl; - cerr << "nBest=" << Debug(nBest, 0) << endl; - cerr << "nBestCandidates=" << Debug(nBestCandidates, 0) << endl; + cerr << "hypo2BeamSize=" << hypo2BeamSize.Debug(0) << endl; + cerr << "hypo2Candidate=" << hypo2Candidate.Debug(0) << endl; + cerr << "batch2Hypo=" << batch2Hypo.Debug(0) << endl; + cerr << "nBest=" << nBest.Debug(0) << endl; + cerr << "nBestCandidates=" << nBestCandidates.Debug(0) << endl; cerr << endl; */ + MatrixWrapper<float> inWrap(in); MatrixWrapper<float> b4Wrap(b4); VectorWrapper<unsigned> hypo2BeamSizeWrap(hypo2BeamSize); @@ -1349,12 +1350,6 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest, VectorWrapper<unsigned> beamSizesWrap(d_beamSizes); //PAUSE_TIMER("LogSoftmax excl kernels"); - - int blocks = std::min(MAX_BLOCKS, (int)in.dim(0)); - int threads = std::min(MAX_THREADS, (int)in.dim(1)); - int shared = sizeof(NthOutBatch) * threads * maxBeamSize - + sizeof(float) * threads; - //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); //BEGIN_TIMER("gBeamSizeInit"); @@ -1376,6 +1371,11 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest, */ //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); + unsigned blocks = std::min((unsigned) MAX_BLOCKS, in.dim(0)); + unsigned threads = std::min((unsigned)MAX_THREADS, in.dim(1)); + unsigned shared = sizeof(NthOutBatch) * threads * maxBeamSize + + sizeof(float) * threads; + //BEGIN_TIMER("gLogSoftMax"); gLogSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>> (nBestCandidatesWrap, @@ -1388,7 +1388,8 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest, //PAUSE_TIMER("gLogSoftMax"); //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); - + blocks = std::min((unsigned) MAX_BLOCKS, batchSize); + //BEGIN_TIMER("gNBestPerBatch"); gNBestPerBatch<<<blocks, 1, 0, CudaStreamHandler::GetStream()>>> (nBestWrap, |