From cf5a42d1e70954a13eb11145bc2b8801ba6dc21d Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Tue, 16 Jan 2018 12:15:16 +0000
Subject: use number of active batches for block size for gNBestPerBatch()

---
 src/amun/gpu/mblas/matrix_functions.cu | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/src/amun/gpu/mblas/matrix_functions.cu b/src/amun/gpu/mblas/matrix_functions.cu
index 457039f3..75ab0dcd 100644
--- a/src/amun/gpu/mblas/matrix_functions.cu
+++ b/src/amun/gpu/mblas/matrix_functions.cu
@@ -1330,13 +1330,14 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest,
   cerr << "beamSizeSum=" << beamSizeSum << endl;
   cerr << "batchSize=" << batchSize << endl;
   cerr << "candidateInd=" << candidateInd << endl;
-  cerr << "hypo2BeamSize=" << Debug(hypo2BeamSize, 0) << endl;
-  cerr << "hypo2Candidate=" << Debug(hypo2Candidate, 0) << endl;
-  cerr << "batch2Hypo=" << Debug(batch2Hypo, 0) << endl;
-  cerr << "nBest=" << Debug(nBest, 0) << endl;
-  cerr << "nBestCandidates=" << Debug(nBestCandidates, 0) << endl;
+  cerr << "hypo2BeamSize=" << hypo2BeamSize.Debug(0) << endl;
+  cerr << "hypo2Candidate=" << hypo2Candidate.Debug(0) << endl;
+  cerr << "batch2Hypo=" << batch2Hypo.Debug(0) << endl;
+  cerr << "nBest=" << nBest.Debug(0) << endl;
+  cerr << "nBestCandidates=" << nBestCandidates.Debug(0) << endl;
   cerr << endl;
   */
+
   MatrixWrapper<float> inWrap(in);
   MatrixWrapper<float> b4Wrap(b4);
   VectorWrapper<unsigned> hypo2BeamSizeWrap(hypo2BeamSize);
@@ -1349,12 +1350,6 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest,
   VectorWrapper<unsigned> beamSizesWrap(d_beamSizes);
 
   //PAUSE_TIMER("LogSoftmax excl kernels");
-
-  int blocks = std::min(MAX_BLOCKS, (int)in.dim(0));
-  int threads = std::min(MAX_THREADS, (int)in.dim(1));
-  int shared = sizeof(NthOutBatch) * threads * maxBeamSize
-             + sizeof(float) * threads;
-
   //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
       
   //BEGIN_TIMER("gBeamSizeInit");
@@ -1376,6 +1371,11 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest,
   */
   //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
     
+  unsigned blocks = std::min((unsigned) MAX_BLOCKS, in.dim(0));
+  unsigned threads = std::min((unsigned)MAX_THREADS, in.dim(1));
+  unsigned shared = sizeof(NthOutBatch) * threads * maxBeamSize
+             + sizeof(float) * threads;
+
   //BEGIN_TIMER("gLogSoftMax");
   gLogSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
     (nBestCandidatesWrap,
@@ -1388,7 +1388,8 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest,
   //PAUSE_TIMER("gLogSoftMax");
   
   //HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));
-  
+  blocks = std::min((unsigned) MAX_BLOCKS, batchSize);
+
   //BEGIN_TIMER("gNBestPerBatch");
   gNBestPerBatch<<<blocks, 1, 0, CudaStreamHandler::GetStream()>>>
     (nBestWrap,
-- 
cgit v1.2.3