Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/torch/nn.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: soumith <soumith@fb.com> 2015-01-03 07:42:12 +0300
committer: soumith <soumith@fb.com> 2015-01-03 07:42:12 +0300
commit a38407a57def785acc819066db70f1649da47f03 (patch)
tree 9aaa885fb28188a7c17fca6bcfe9e527f3930904
parent 2340b9c068b518cdc20b0c6c1a9b68971f0e97e8 (diff)
speedup and optimizations for SparseLinear
-rw-r--r-- SparseLinear.lua | 37
-rw-r--r-- generic/SparseLinear.c | 249
2 files changed, 226 insertions, 60 deletions
diff --git a/SparseLinear.lua b/SparseLinear.lua
index 735d0ed..ca15be6 100644
--- a/SparseLinear.lua
+++ b/SparseLinear.lua
@@ -4,11 +4,16 @@ function SparseLinear:__init(inputSize, outputSize)
parent.__init(self)
self.weightDecay = 0
- self.weight = torch.Tensor(outputSize, inputSize)
- self.bias = torch.Tensor(outputSize)
- self.gradWeight = torch.Tensor(outputSize, inputSize)
- self.gradBias = torch.Tensor(outputSize)
- self.lastInput = torch.Tensor()
+ self.weight = torch.Tensor(outputSize, inputSize):zero()
+ self.bias = torch.Tensor(outputSize):zero()
+ self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
+ self.gradBias = torch.Tensor(outputSize):zero()
+ self.lastInput = nil
+
+ if torch.getnumthreads() > 1 and outputSize >= 128 then
+ self.shardBuffer = torch.Tensor(outputSize, torch.getnumthreads())
+ end
+
-- state
self.gradInput:resize(inputSize)
self.output:resize(outputSize)
@@ -20,7 +25,7 @@ function SparseLinear:reset(stdv)
if stdv then
stdv = stdv * math.sqrt(3)
else
- stdv = 1./math.sqrt(self.weight:size(1))
+ stdv = 1./math.sqrt(self.weight:size(2))
end
if nn.oldSeed then
for i=1,self.weight:size(1) do
@@ -40,22 +45,18 @@ function SparseLinear:updateOutput(input)
end
function SparseLinear:accGradParameters(input, gradOutput, scale)
+ if not self.lastInput then
+ self.lastInput = input:clone()
+ else
+ self.lastInput:resizeAs(input):copy(input)
+ end
+
return input.nn.SparseLinear_accGradParameters(self, input, gradOutput, scale)
end
function SparseLinear:updateGradInput(input, gradOutput)
if self.gradInput then
- self.gradInput:resize(input:size())
- self.gradInput:copy(input)
- local numNonzero = self.gradInput:size(1)
- for e=1,numNonzero do
- local g = 0
- local i = self.gradInput[{e,1}]
- for j=1,self.output:size(1) do
- g = g + self.weight[{j,i}] * gradOutput[j]
- end
- self.gradInput[{e,2}] = g
- end
+ input.nn.SparseLinear_updateGradInput(self, input, gradOutput)
return self.gradInput
end
-end \ No newline at end of file
+end
diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c
index f39791b..b3ccbf1 100644
--- a/generic/SparseLinear.c
+++ b/generic/SparseLinear.c
@@ -2,6 +2,18 @@
#define TH_GENERIC_FILE "generic/SparseLinear.c"
#else
+static int nn_(checkInput)(THTensor* t) {
+ return t->nDimension == 2 && t->size[1] == 2;
+}
+
+static int nn_(checkSize2D)(THTensor* t, long size0, long size1) {
+ return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static int nn_(checkSize1D)(THTensor* t, long size0) {
+ return t->nDimension == 1 && t->size[0] == size0;
+}
+
static int nn_(SparseLinear_updateOutput)(lua_State *L)
{
long i;
@@ -9,27 +21,72 @@ static int nn_(SparseLinear_updateOutput)(lua_State *L)
THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor);
- long dim = weight->size[1]; /* number of weights.. */
+
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2");
+ luaL_argcheck(L, nn_(checkSize1D)(output, outDim), 1, "output size wrong");
+ luaL_argcheck(L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong");
+
+ lua_getfield(L, 1, "shardBuffer");
+ if (!lua_isnil(L, -1)) {
+ THTensor *buffer =
+ luaT_getfieldcheckudata(L, 1, "shardBuffer", torch_Tensor);
+ long num_shards = buffer->size[1];
+ luaL_argcheck(L,
+ buffer->nDimension == 2 && buffer->size[0] == outDim &&
+ num_shards > 0,
+ 1,
+ "shardBuffer size wrong");
+
+ THTensor_(zero)(buffer);
+ #pragma omp parallel for private(i) schedule(static) num_threads(num_shards)
+ for (i = 0; i < input->size[0]; i++) {
+ int shardId = omp_get_thread_num();
+ long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
+
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ THTensor_(get2d)(input, i, 1),
+ THTensor_(data)(weight) + offset * weight->stride[1],
+ weight->stride[0],
+ THTensor_(data)(buffer) + shardId * buffer->stride[1],
+ buffer->stride[0]);
+ } else {
+ luaL_error(L, "index out of bound. updateOutput: \
+%ld not between 1 and %ld", offset + 1, inDim);
+ }
+ }
+
+ THTensor_(sum)(output, buffer, 1);
+ THTensor_(cadd)(output, bias, 1.0, output);
+
+ lua_getfield(L, 1, "output");
+ return 1;
+ }
THTensor_(copy)(output, bias);
for(i = 0; i < input->size[0]; i++)
{
long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
- if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+ if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
{
real val = THTensor_(get2d)(input, i, 1);
- THBlas_(axpy)(output->size[0],
- val,
+ THBlas_(axpy)(output->size[0],
+ val,
THTensor_(data)(weight)+offset*weight->stride[1],
- weight->stride[0],
- THTensor_(data)(output),
+ weight->stride[0],
+ THTensor_(data)(output),
output->stride[0]);
}
else {
- printf("\nupdateOutput: %ld not between 1 and %ld\n", offset+1, dim);
- luaL_error(L, "index out of bound");
+ luaL_error(L, "index out of bound. updateOutput: \
+%ld not between 1 and %ld", offset + 1, inDim);
}
}
+
+ lua_getfield(L, 1, "output");
return 1;
}
@@ -42,39 +99,47 @@ static int nn_(SparseLinear_accGradParameters)(lua_State *L)
THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor);
- THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_Tensor);
real weightDecay = luaT_getfieldchecknumber(L, 1, "weightDecay");
- long dim = gradWeight->size[1]; /* number of weights.. */
- for(i = 0; i < input->size[0]; i++)
+ long nnz = input->size[0];
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2");
+ luaL_argcheck(
+ L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong");
+ luaL_argcheck(
+ L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong");
+ luaL_argcheck(
+ L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
+ #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000)
+ for(i = 0; i < nnz; i++)
{
long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
- if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+ if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
{
real val = scale*THTensor_(get2d)(input, i, 1);
-
- THBlas_(axpy)(gradOutput->size[0],
- val,
- THTensor_(data)(gradOutput),
- gradOutput->stride[0],
- THTensor_(data)(gradWeight)+offset*gradWeight->stride[1],
+
+ THBlas_(axpy)(outDim,
+ val,
+ THTensor_(data)(gradOutput),
+ gradOutput->stride[0],
+ THTensor_(data)(gradWeight)+offset*gradWeight->stride[1],
gradWeight->stride[0]);
}
else {
- printf("\naccGradParameters: %ld not between 1 and %ld\n", offset+1, dim);
- luaL_error(L, "index out of bound");
+ luaL_error(L, "index out of bound. accGradParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
}
}
-
- THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
-
+
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
+
if(weightDecay != 0)
THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
-
- THTensor_(resizeAs)(lastInput, input);
- THTensor_(copy)(lastInput, input);
-
+
return 0;
}
@@ -85,37 +150,137 @@ int nn_(SparseLinear_updateParameters)(lua_State *L)
THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
- THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor);
- THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_Tensor);
-
- long dim = weight->size[1]; /* number of weights.. */
+ THTensor * gradWeight = luaT_getfieldcheckudata(
+ L, 1, "gradWeight", torch_Tensor);
+ THTensor * lastInput = luaT_getfieldcheckudata(
+ L, 1, "lastInput", torch_Tensor);
+
+ long nnz = lastInput->size[0];
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ luaL_argcheck(
+ L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong");
+ luaL_argcheck(
+ L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong");
+ luaL_argcheck(
+ L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
THTensor_(cadd)(bias, bias, -learningRate, gradBias);
-
- for(i = 0; i < lastInput->size[0]; i++)
+
+ #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000)
+ for(i = 0; i < nnz; i++)
{
long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1;
-
- if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+
+ if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
{
- THBlas_(axpy)(bias->size[0],
- -learningRate,
- THTensor_(data)(gradWeight)+offset*gradWeight->stride[1],
- gradWeight->stride[0],
- THTensor_(data)(weight)+offset*weight->stride[1],
+ real* pGradWeight =
+ THTensor_(data)(gradWeight)+offset*gradWeight->stride[1];
+ THBlas_(axpy)(outDim,
+ -learningRate,
+ pGradWeight,
+ gradWeight->stride[0],
+ THTensor_(data)(weight)+offset*weight->stride[1],
weight->stride[0]);
}
else {
- printf("\nupdateParameters: %ld not between 1 and %ld\n", offset+1, dim);
- luaL_error(L, "index out of bound");
+ luaL_error(L, "index out of bound. updateParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
+ }
+ }
+ return 0;
+}
+
+int nn_(SparseLinear_zeroGradParameters)(lua_State *L)
+{
+ long i;
+ THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
+ THTensor * gradWeight = luaT_getfieldcheckudata(
+ L, 1, "gradWeight", torch_Tensor);
+ THTensor * lastInput = luaT_getfieldcheckudata(
+ L, 1, "lastInput", torch_Tensor);
+
+ long nnz = lastInput->size[0];
+ long outDim = gradWeight->size[0];
+ long inDim = gradWeight->size[1];
+
+ luaL_argcheck(
+ L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
+ THTensor_(zero)(gradBias);
+ #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000)
+ for(i = 0; i < nnz; i++)
+ {
+ long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1;
+
+ if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
+ {
+ real* pGradWeight =
+ THTensor_(data)(gradWeight)+offset*gradWeight->stride[1];
+ if(gradWeight->stride[0] == 1) {
+ THVector_(fill)(pGradWeight, 0, outDim);
+ } else {
+ long j;
+ for(j = 0; j < outDim; ++j) {
+ pGradWeight[j * gradWeight->stride[0]] = 0;
+ }
+ }
+ }
+ else {
+ luaL_error(L, "index out of bound. zeroGradParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
}
}
return 0;
}
+static int nn_(SparseLinear_updateGradInput)(lua_State *L) {
+ THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
+ THTensor *gradInput =
+ luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor);
+ THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
+ THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor);
+
+ long i;
+ long nnz = input->size[0];
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ luaL_argcheck(
+ L, nn_(checkInput)(input), 2, "input must be an nnz x 2 tensor");
+ luaL_argcheck(
+ L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong");
+
+ THTensor_(resize2d)(gradInput, input->size[0], input->size[1]);
+
+ #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000)
+ for (i = 0; i < nnz; ++i) {
+ long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
+ THTensor_(set2d)(gradInput, i, 0, offset + 1);
+
+ if (offset >= 0 && offset < inDim) {
+ real val =
+ THBlas_(dot)(outDim,
+ THTensor_(data)(gradOutput),
+ gradOutput->stride[0],
+ THTensor_(data)(weight) + offset * weight->stride[1],
+ weight->stride[0]);
+ THTensor_(set2d)(gradInput, i, 1, val);
+ } else {
+ luaL_error(L, "index out of bound. updateGradInput: \
+%ld not between 1 and %ld", offset + 1, inDim);
+ }
+ }
+ return 0;
+}
+
static const struct luaL_Reg nn_(SparseLinear__) [] = {
{"SparseLinear_updateOutput", nn_(SparseLinear_updateOutput)},
{"SparseLinear_accGradParameters", nn_(SparseLinear_accGradParameters)},
{"SparseLinear_updateParameters", nn_(SparseLinear_updateParameters)},
+ {"SparseLinear_zeroGradParameters", nn_(SparseLinear_zeroGradParameters)},
+ {"SparseLinear_updateGradInput", nn_(SparseLinear_updateGradInput)},
{NULL, NULL}
};