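optimization: build with -O3 instead of -O0 (host and NVCC flags); adjust Search in search.h (vector aliases, Broadcast call, identifier fixes)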

author: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2016-04-26 17:56:05 +0300
committer: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2016-04-26 17:56:05 +0300
commit: 21e7c774c3df94dadb6623481802cc28e8f4f739 (patch)
tree: da39aa9107c6016481e21004fd3c51f95a927592
parent: 8dd0b6d3a877291e1269a4d85408695d1be4b41c (diff)
2 files changed, 17 insertions, 10 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1effd778..6009bcc4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.5.1)
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 project(amunn CXX)
-SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O0 -funroll-loops -Wno-unused-result -Wno-deprecated")
-LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O0; -arch=sm_35; -lineinfo; --use_fast_math;)
+SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
+LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math;)
 add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
 SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
diff --git a/src/decoder/search.h b/src/decoder/search.h
index be4b2ad8..e05a284c 100644
--- a/src/decoder/search.h
+++ b/src/decoder/search.h
@@ -12,6 +12,12 @@ class Search {
     std::vector<ScorerPtr> scorers_;
   
     using Matrix = typename Backend::Payload;
+    
+    template <typename T>
+    using DeviceVector = typename Backend::DeviceVector<T>;
+    
+    template <typename T>
+    using HostVector = typename Backend::HostVector<T>;
   
   public:
     Search(size_t threadId)
@@ -90,30 +96,31 @@ class Search {
       
       Matrix& probs = probsEnsemble[0];
       
-      Matrix costs(probs.Rows(), 1);
+      Matrix costs;
+      (*costs).Resize((*probs).Rows(), 1);
       HostVector<float> vCosts;
       for(auto& h : prevHyps)
         vCosts.push_back(h->GetCost());
       Backend::copy(vCosts.begin(), vCosts.end(), costs.begin());
       
-      Backend::BroadcastVecColumn(weights[0] * Backend::_1 + Backend::_2,
-                                  probs, costs);
+      Backend::Broadcast(weights[0] * Backend::_1 + Backend::_2,
+                         probs, costs);
       for(size_t i = 0; i < probsEnsemble.size(); ++i)
         Backend::Element(Backend::_1 + weights[i] * Backend::_2,
                          probs, probsEnsemble[i]);
       
-      Backend::HostVector<unsigned> bestKeys(beamSize);
-      Backend::HostVector<float> bestCosts(beamSize);
+      HostVector<unsigned> bestKeys(beamSize);
+      HostVector<float> bestCosts(beamSize);
       
       Backend::PartialSortByKey(probs, bestKeys, bestCosts);
       
-      std::vector<Backend::HostVector<float>> breakDowns;
+      std::vector<HostVector<float>> breakDowns;
       bool doBreakdown = God::Get<bool>("n-best");
       if(doBreakdown) {
         breakDowns.push_back(bestCosts);
         for(size_t i = 1; i < probsEnsemble.size(); ++i) {
           HostVector<float> modelCosts(beamSize);
-          auto it = Backend::make_permutation_iterator(probsEnsemble[i].begin(), keys.begin());
+          auto it = Backend::make_permutation_iterator(probsEnsemble[i].begin(), bestKeys.begin());
           Backend::copy(it, it + beamSize, modelCosts.begin());
           breakDowns.push_back(modelCosts);
         }
@@ -136,7 +143,7 @@ class Search {
               float cost = 0;
               if(j < probsEnsemble.size()) {
                 if(prevHyps[hypIndex]->GetCostBreakdown().size() < probsEnsemble.size())
-                  const_cast<HypothesisPtr&>(prevHyps[hypIndex])->GetCostBreakdown().resize(ProbsEnsemble.size(), 0.0);
+                  const_cast<HypothesisPtr&>(prevHyps[hypIndex])->GetCostBreakdown().resize(probsEnsemble.size(), 0.0);
                 cost = breakDowns[j][i] + const_cast<HypothesisPtr&>(prevHyps[hypIndex])->GetCostBreakdown()[j];
               }
               sum += weights[j] * cost;
author	Marcin Junczys-Dowmunt <junczys@amu.edu.pl>	2016-04-26 17:56:05 +0300
committer	Marcin Junczys-Dowmunt <junczys@amu.edu.pl>	2016-04-26 17:56:05 +0300
commit	21e7c774c3df94dadb6623481802cc28e8f4f739 (patch)
tree	da39aa9107c6016481e21004fd3c51f95a927592
parent	8dd0b6d3a877291e1269a4d85408695d1be4b41c (diff)