From cf5a42d1e70954a13eb11145bc2b8801ba6dc21d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 16 Jan 2018 12:15:16 +0000 Subject: use number of active batches for block size for gNBestPerBatch() --- src/amun/gpu/mblas/matrix_functions.cu | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/amun/gpu/mblas/matrix_functions.cu b/src/amun/gpu/mblas/matrix_functions.cu index 457039f3..75ab0dcd 100644 --- a/src/amun/gpu/mblas/matrix_functions.cu +++ b/src/amun/gpu/mblas/matrix_functions.cu @@ -1330,13 +1330,14 @@ void LogSoftmaxAndNBest(mblas::Vector &nBest, cerr << "beamSizeSum=" << beamSizeSum << endl; cerr << "batchSize=" << batchSize << endl; cerr << "candidateInd=" << candidateInd << endl; - cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 0) << endl; - cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 0) << endl; - cerr << "batch2Hypo=" << Debug(batch2Hypo, 0) << endl; - cerr << "nBest=" << Debug(nBest, 0) << endl; - cerr << "nBestCandidates=" << Debug(nBestCandidates, 0) << endl; + cerr << "hypo2BeamSize=" << hypo2BeamSize.Debug(0) << endl; + cerr << "hypo2Candidate=" << hypo2Candidate.Debug(0) << endl; + cerr << "batch2Hypo=" << batch2Hypo.Debug(0) << endl; + cerr << "nBest=" << nBest.Debug(0) << endl; + cerr << "nBestCandidates=" << nBestCandidates.Debug(0) << endl; cerr << endl; */ + MatrixWrapper inWrap(in); MatrixWrapper b4Wrap(b4); VectorWrapper hypo2BeamSizeWrap(hypo2BeamSize); @@ -1349,12 +1350,6 @@ void LogSoftmaxAndNBest(mblas::Vector &nBest, VectorWrapper beamSizesWrap(d_beamSizes); //PAUSE_TIMER("LogSoftmax excl kernels"); - - int blocks = std::min(MAX_BLOCKS, (int)in.dim(0)); - int threads = std::min(MAX_THREADS, (int)in.dim(1)); - int shared = sizeof(NthOutBatch) * threads * maxBeamSize - + sizeof(float) * threads; - //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); //BEGIN_TIMER("gBeamSizeInit"); @@ -1376,6 +1371,11 @@ void LogSoftmaxAndNBest(mblas::Vector &nBest, */ //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); + unsigned blocks = std::min((unsigned) MAX_BLOCKS, in.dim(0)); + unsigned threads = std::min((unsigned)MAX_THREADS, in.dim(1)); + unsigned shared = sizeof(NthOutBatch) * threads * maxBeamSize + + sizeof(float) * threads; + //BEGIN_TIMER("gLogSoftMax"); gLogSoftMax<<>> (nBestCandidatesWrap, @@ -1388,7 +1388,8 @@ void LogSoftmaxAndNBest(mblas::Vector &nBest, //PAUSE_TIMER("gLogSoftMax"); //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); - + blocks = std::min((unsigned) MAX_BLOCKS, batchSize); + //BEGIN_TIMER("gNBestPerBatch"); gNBestPerBatch<<>> (nBestWrap, -- cgit v1.2.3