diff options
author     Zeming Lin <ebetica0@gmail.com>  2016-05-10 11:30:09 +0300
committer  Zeming Lin <ebetica0@gmail.com>  2016-05-14 00:28:26 +0300
commit     a20aa1ee3aab75172a5697560e01f61af94c3293 (patch)
tree       2458a47ca580fe0c57009fdfe76e78096b8ac787
parent     17aca108abcf50830f624f8dfd8d8928c3241450 (diff)
Fixing sparse linear race condition
-rw-r--r--  SparseLinear.lua                 |   7
-rw-r--r--  lib/THNN/generic/SparseLinear.c  | 104
-rw-r--r--  test.lua                         |  27
3 files changed, 92 insertions(+), 46 deletions(-)
diff --git a/SparseLinear.lua b/SparseLinear.lua index 6595be9..7c3edad 100644 --- a/SparseLinear.lua +++ b/SparseLinear.lua @@ -130,8 +130,13 @@ function SparseLinear:accGradParameters(input, gradOutput, scale) gradOutput:resize(1, gradOutput:size(1)) end + local rows = self.formatted_input:select(2, 1) + local cols = self.formatted_input:select(2, 2) + local sortinds = cols * gradOutput:size(1) + rows + local _, inds = sortinds:sort(1, false) + local newinput = self.formatted_input:index(1, inds) input[1].THNN.SparseLinear_accGradParameters( - self.formatted_input:cdata(), + newinput:cdata(), gradOutput:cdata(), self.gradWeight:cdata(), self.gradBias:cdata(), diff --git a/lib/THNN/generic/SparseLinear.c b/lib/THNN/generic/SparseLinear.c index 0f426ba..b7bf8ab 100644 --- a/lib/THNN/generic/SparseLinear.c +++ b/lib/THNN/generic/SparseLinear.c @@ -48,7 +48,7 @@ void THNN_(SparseLinear_updateOutput)( THTensor *weight, THTensor *bias) { - long h, i; + long h, i, j, hp0, hp1; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); long batchSize = THTensor_(size)(output, 0); @@ -59,25 +59,43 @@ void THNN_(SparseLinear_updateOutput)( long nnz = THTensor_(size)(input, 0); + THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1); + THLongTensor_zero(csr); + +//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i=0; i<nnz; i++) { + hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1; + hp1 = (i+1 == nnz) ? 
+ batchSize : + (long)(THNN_(get2d)(input, i+1, 0)) - 1; + if (hp0 != hp1) for (h = hp0; h < hp1; h++) { + THLongTensor_set1d(csr, h+1, i+1); + } + } + + // output = weight * input + bias THTensor_(zero)(output); -#pragma omp parallel for private(i) schedule(static) if (nnz * outDim > 10000) - for (i = 0; i < nnz; i++) { - real val = THNN_(get2d)(input, i, 2); - if (val == 0) { - continue; - } +#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000) + for (h = 0; h < batchSize; h++) { + long i_start = THLongTensor_get1d(csr, h); + long i_end = THLongTensor_get1d(csr, h+1); + for (i = i_start; i < i_end; i++) { + real val = THNN_(get2d)(input, i, 2); + if (val == 0) { + continue; + } - long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; - long h = (long)(THNN_(get2d)(input, i, 0)) - 1; - if (offset >= 0 && offset < inDim) { - THBlas_(axpy)(outDim, - val, - COL_PTR2(weight, offset), weight->stride[0], - ROW_PTR2(output, h), output->stride[1]); - } else { - THError("index out of bound. updateOutput: %d not between 1 and %d", - offset + 1, inDim); + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. 
updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } } } @@ -151,7 +169,7 @@ void THNN_(SparseLinear_accGradParameters)( real weightDecay, real scale) { - long h, i; + long h, i, col, hp0, hp1; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); @@ -165,26 +183,42 @@ void THNN_(SparseLinear_accGradParameters)( "gradOutput must be contiguous"); long nnz = THTensor_(size)(input, 0); - // THTensor_(resize2d)(gradOutput, batchSize, outDim); - // gradWeight += gradOutput * input -#pragma omp parallel for private(h, i) schedule(static) if (\ - nnz * outDim > 10000) + THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); + THLongTensor_zero(csc); + +#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) for (i = 0; i < nnz; i++) { - real val = scale * THNN_(get2d)(input, i, 2); + hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1; + hp1 = (i+1 == nnz) ? + inDim : + (long)(THNN_(get2d)(input, i+1, 1)) - 1; + if (hp0 != hp1) for (h = hp0; h < hp1; h++) { + THLongTensor_set1d(csc, h+1, i+1); + } + } - long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; - long h = (long)(THNN_(get2d)(input, i, 0)) - 1; - if (offset >= 0 && offset < inDim) { - THBlas_(axpy)(outDim, - val, - ROW_PTR2(gradOutput, h), gradOutput->stride[1], - COL_PTR2(gradWeight, offset), gradWeight->stride[0]); - } else { - THError( - "index out of bound. 
accGradParameters: %d not between 1 and %d", - offset + 1, - inDim); + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) + for (col = 0; col < inDim; col++) { + long i_start = THLongTensor_get1d(csc, col); + long i_end = THLongTensor_get1d(csc, col+1); + for (i = i_start; i < i_end; i++) { + real val = scale * THNN_(get2d)(input, i, 2); + + h = (long)(THNN_(get2d)(input, i, 0)) - 1; + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } } } @@ -805,12 +805,7 @@ function nntest.Linear() end -- for ind, inj in pairs(inj_vals) do end -function nntest.SparseLinear() - local inb = math.random(5,10) - local ini = math.random(50,100) - local inj = math.random(5,10) - local numNonzero = math.random(3,5) - +local function test_sparse_linear(inb, ini, inj, numNonzero) local module = nn.SparseLinear(ini,inj, true) local linear = nn.Linear(ini, inj) linear.weight = module.weight:clone() @@ -822,11 +817,11 @@ function nntest.SparseLinear() local input = {} local nonsparse = torch.zeros(inb, ini) for i=1,inb do - local nnz = math.random(1, 3) + local nnz = math.random(1, 3) + numNonzero local inds = torch.randperm(ini)[{{1,nnz}}] input[i] = torch.Tensor(nnz, 2) input[i]:select(2,1):copy(inds) - input[i]:select(2,2):copy(torch.ones(nnz)) + input[i]:select(2,2):copy(torch.rand(nnz)) nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2)) end local gradOutput = torch.rand(inb, inj) @@ -872,8 +867,8 @@ function nntest.SparseLinear() mytester:assertle(gierr, precision, 'error on gradInput with ntimes = '..ntimes) for _,var in ipairs(cmps) do - local err = (module[var] - linear[var]):abs():max() - 
mytester:assertle(err, precision, 'error on '..var..' with ntimes='..ntimes) + local err = (module[var] - linear[var]):abs():max() + mytester:assertle(err, precision, 'error on '..var..' with ntimes = '..ntimes) end module:zeroGradParameters() @@ -913,6 +908,18 @@ function nntest.SparseLinear() mytester:assertle(err, precision, 'error on batch result forward') end +function nntest.SparseLinear() + local inb = math.random(5,10) + local ini = math.random(50,100) + local inj = math.random(5,10) + local numNonzero = math.random(3,5) + + test_sparse_linear(inb, ini, inj, numNonzero) + -- Tests OMP parallelism + test_sparse_linear(1, 50000, 10, 20000) + test_sparse_linear(1000, 1000, 10, 100) +end + function nntest.Bilinear() -- set up data: |