author:    soumith <soumith@fb.com>  2015-01-03 07:42:12 +0300
committer: soumith <soumith@fb.com>  2015-01-03 07:42:12 +0300
commit:    a38407a57def785acc819066db70f1649da47f03
tree:      9aaa885fb28188a7c17fca6bcfe9e527f3930904
parent:    2340b9c068b518cdc20b0c6c1a9b68971f0e97e8
speedup and optimizations for SparseLinear
 SparseLinear.lua       |  37
 generic/SparseLinear.c | 249
 2 files changed, 226 insertions, 60 deletions
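
Note for readers: SparseLinear(inputSize, outputSize) consumes a sparse vector encoded as an nnz x 2 tensor of {index, value} pairs; the nn_(checkInput) guard added in this commit enforces exactly that shape. A minimal usage sketch (layer sizes and input values here are illustrative, not taken from the commit):

   require 'nn'

   -- A linear layer over a 10000-dimensional sparse input, 256 outputs.
   local module = nn.SparseLinear(10000, 256)

   -- Three non-zero entries of the input vector, one {index, value} row each.
   local input = torch.Tensor{{1, 0.1}, {500, 0.3}, {10000, 0.5}}

   local output = module:forward(input)   -- dense 256-dim output
   local gradOutput = torch.randn(256)
   -- After this commit, gradInput is computed in C and comes back as an
   -- nnz x 2 tensor over the same indices.
   local gradInput = module:backward(input, gradOutput)
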
diff --git a/SparseLinear.lua b/SparseLinear.lua
index 735d0ed..ca15be6 100644
--- a/SparseLinear.lua
+++ b/SparseLinear.lua
@@ -4,11 +4,16 @@ function SparseLinear:__init(inputSize, outputSize)
    parent.__init(self)
 
    self.weightDecay = 0
-   self.weight = torch.Tensor(outputSize, inputSize)
-   self.bias = torch.Tensor(outputSize)
-   self.gradWeight = torch.Tensor(outputSize, inputSize)
-   self.gradBias = torch.Tensor(outputSize)
-   self.lastInput = torch.Tensor()
+   self.weight = torch.Tensor(outputSize, inputSize):zero()
+   self.bias = torch.Tensor(outputSize):zero()
+   self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
+   self.gradBias = torch.Tensor(outputSize):zero()
+   self.lastInput = nil
+
+   if torch.getnumthreads() > 1 and outputSize >= 128 then
+      self.shardBuffer = torch.Tensor(outputSize, torch.getnumthreads())
+   end
+
    -- state
    self.gradInput:resize(inputSize)
    self.output:resize(outputSize)
@@ -20,7 +25,7 @@ function SparseLinear:reset(stdv)
    if stdv then
       stdv = stdv * math.sqrt(3)
    else
-      stdv = 1./math.sqrt(self.weight:size(1))
+      stdv = 1./math.sqrt(self.weight:size(2))
    end
    if nn.oldSeed then
       for i=1,self.weight:size(1) do
@@ -40,22 +45,18 @@ function SparseLinear:updateOutput(input)
 end
 
 function SparseLinear:accGradParameters(input, gradOutput, scale)
+   if not self.lastInput then
+      self.lastInput = input:clone()
+   else
+      self.lastInput:resizeAs(input):copy(input)
+   end
+
    return input.nn.SparseLinear_accGradParameters(self, input, gradOutput, scale)
 end
 
 function SparseLinear:updateGradInput(input, gradOutput)
    if self.gradInput then
-      self.gradInput:resize(input:size())
-      self.gradInput:copy(input)
-      local numNonzero = self.gradInput:size(1)
-      for e=1,numNonzero do
-         local g = 0
-         local i = self.gradInput[{e,1}]
-         for j=1,self.output:size(1) do
-            g = g + self.weight[{j,i}] * gradOutput[j]
-         end
-         self.gradInput[{e,2}] = g
-      end
+      input.nn.SparseLinear_updateGradInput(self, input, gradOutput)
       return self.gradInput
    end
-end
\ No newline at end of file
+end
diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c
index f39791b..b3ccbf1 100644
--- a/generic/SparseLinear.c
+++ b/generic/SparseLinear.c
@@ -2,6 +2,18 @@
 #define TH_GENERIC_FILE "generic/SparseLinear.c"
 #else
 
+static int nn_(checkInput)(THTensor* t) {
+  return t->nDimension == 2 && t->size[1] == 2;
+}
+
+static int nn_(checkSize2D)(THTensor* t, long size0, long size1) {
+  return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static int nn_(checkSize1D)(THTensor* t, long size0) {
+  return t->nDimension == 1 && t->size[0] == size0;
+}
+
 static int nn_(SparseLinear_updateOutput)(lua_State *L)
 {
   long i;
@@ -9,27 +21,72 @@ static int nn_(SparseLinear_updateOutput)(lua_State *L)
   THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
   THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor);
-  long dim = weight->size[1]; /* number of weights.. */
+
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2");
+  luaL_argcheck(L, nn_(checkSize1D)(output, outDim), 1, "output size wrong");
+  luaL_argcheck(L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong");
+
+  lua_getfield(L, 1, "shardBuffer");
+  if (!lua_isnil(L, -1)) {
+    THTensor *buffer =
+        luaT_getfieldcheckudata(L, 1, "shardBuffer", torch_Tensor);
+    long num_shards = buffer->size[1];
+    luaL_argcheck(L,
+                  buffer->nDimension == 2 && buffer->size[0] == outDim &&
+                      num_shards > 0,
+                  1,
+                  "shardBuffer size wrong");
+
+    THTensor_(zero)(buffer);
+    #pragma omp parallel for private(i) schedule(static) num_threads(num_shards)
+    for (i = 0; i < input->size[0]; i++) {
+      int shardId = omp_get_thread_num();
+      long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
+
+      if (offset >= 0 && offset < inDim) {
+        THBlas_(axpy)(outDim,
+                      THTensor_(get2d)(input, i, 1),
+                      THTensor_(data)(weight) + offset * weight->stride[1],
+                      weight->stride[0],
+                      THTensor_(data)(buffer) + shardId * buffer->stride[1],
+                      buffer->stride[0]);
+      } else {
+        luaL_error(L, "index out of bound. updateOutput: \
+%ld not between 1 and %ld", offset + 1, inDim);
+      }
+    }
+
+    THTensor_(sum)(output, buffer, 1);
+    THTensor_(cadd)(output, bias, 1.0, output);
+
+    lua_getfield(L, 1, "output");
+    return 1;
+  }
 
   THTensor_(copy)(output, bias);
   for(i = 0; i < input->size[0]; i++)
   {
     long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
-    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
     {
       real val = THTensor_(get2d)(input, i, 1);
-      THBlas_(axpy)(output->size[0], 
-                    val, 
+      THBlas_(axpy)(output->size[0],
+                    val,
                     THTensor_(data)(weight)+offset*weight->stride[1],
-                    weight->stride[0], 
-                    THTensor_(data)(output), 
+                    weight->stride[0],
+                    THTensor_(data)(output),
                     output->stride[0]);
     }
     else {
-      printf("\nupdateOutput: %ld not between 1 and %ld\n", offset+1, dim);
-      luaL_error(L, "index out of bound");
+      luaL_error(L, "index out of bound. updateOutput: \
+%ld not between 1 and %ld", offset + 1, inDim);
     }
   }
+
+  lua_getfield(L, 1, "output");
   return 1;
 }
 
@@ -42,39 +99,47 @@ static int nn_(SparseLinear_accGradParameters)(lua_State *L)
   THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
   THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor);
-  THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_Tensor);
   real weightDecay = luaT_getfieldchecknumber(L, 1, "weightDecay");
-  long dim = gradWeight->size[1]; /* number of weights.. */
-  for(i = 0; i < input->size[0]; i++)
+
+  long nnz = input->size[0];
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
+  #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000)
+  for(i = 0; i < nnz; i++)
   {
     long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
-    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
     {
       real val = scale*THTensor_(get2d)(input, i, 1);
-      
-      THBlas_(axpy)(gradOutput->size[0], 
-                    val, 
-                    THTensor_(data)(gradOutput), 
-                    gradOutput->stride[0], 
-                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], 
+
+      THBlas_(axpy)(outDim,
+                    val,
+                    THTensor_(data)(gradOutput),
+                    gradOutput->stride[0],
+                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[1],
                     gradWeight->stride[0]);
     }
     else {
-      printf("\naccGradParameters: %ld not between 1 and %ld\n", offset+1, dim);
-      luaL_error(L, "index out of bound");
+      luaL_error(L, "index out of bound. accGradParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
     }
   }
-  
-  THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); 
-  
+
+  THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
+
   if(weightDecay != 0)
     THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
-  
-  THTensor_(resizeAs)(lastInput, input);
-  THTensor_(copy)(lastInput, input);
-  
+
   return 0;
 }
 
@@ -85,37 +150,137 @@ int nn_(SparseLinear_updateParameters)(lua_State *L)
   THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
   THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
-  THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor);
-  THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_Tensor);
-  
-  long dim = weight->size[1]; /* number of weights.. */
+  THTensor * gradWeight = luaT_getfieldcheckudata(
+      L, 1, "gradWeight", torch_Tensor);
+  THTensor * lastInput = luaT_getfieldcheckudata(
+      L, 1, "lastInput", torch_Tensor);
+
+  long nnz = lastInput->size[0];
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(
+      L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
   THTensor_(cadd)(bias, bias, -learningRate, gradBias);
-  
-  for(i = 0; i < lastInput->size[0]; i++)
+
+  #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000)
+  for(i = 0; i < nnz; i++)
   {
     long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1;
-    
-    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
     {
-      THBlas_(axpy)(bias->size[0], 
-                    -learningRate, 
-                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], 
-                    gradWeight->stride[0], 
-                    THTensor_(data)(weight)+offset*weight->stride[1], 
+      real* pGradWeight =
+          THTensor_(data)(gradWeight)+offset*gradWeight->stride[1];
+      THBlas_(axpy)(outDim,
+                    -learningRate,
+                    pGradWeight,
+                    gradWeight->stride[0],
+                    THTensor_(data)(weight)+offset*weight->stride[1],
                     weight->stride[0]);
     }
     else {
-      printf("\nupdateParameters: %ld not between 1 and %ld\n", offset+1, dim);
-      luaL_error(L, "index out of bound");
+      luaL_error(L, "index out of bound. updateParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
+    }
+  }
+  return 0;
+}
+
+int nn_(SparseLinear_zeroGradParameters)(lua_State *L)
+{
+  long i;
+  THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
+  THTensor * gradWeight = luaT_getfieldcheckudata(
+      L, 1, "gradWeight", torch_Tensor);
+  THTensor * lastInput = luaT_getfieldcheckudata(
+      L, 1, "lastInput", torch_Tensor);
+
+  long nnz = lastInput->size[0];
+  long outDim = gradWeight->size[0];
+  long inDim = gradWeight->size[1];
+
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
+  THTensor_(zero)(gradBias);
+  #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000)
+  for(i = 0; i < nnz; i++)
+  {
+    long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1;
+
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
+    {
+      real* pGradWeight =
+          THTensor_(data)(gradWeight)+offset*gradWeight->stride[1];
+      if(gradWeight->stride[0] == 1) {
+        THVector_(fill)(pGradWeight, 0, outDim);
+      } else {
+        long j;
+        for(j = 0; j < outDim; ++j) {
+          pGradWeight[j * gradWeight->stride[0]] = 0;
+        }
+      }
+    }
+    else {
+      luaL_error(L, "index out of bound. zeroGradParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
     }
   }
   return 0;
 }
 
+static int nn_(SparseLinear_updateGradInput)(lua_State *L) {
+  THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
+  THTensor *gradInput =
+      luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor);
+  THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
+  THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor);
+
+  long i;
+  long nnz = input->size[0];
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(
+      L, nn_(checkInput)(input), 2, "input must be an nnz x 2 tensor");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong");
+
+  THTensor_(resize2d)(gradInput, input->size[0], input->size[1]);
+
+  #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000)
+  for (i = 0; i < nnz; ++i) {
+    long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
+    THTensor_(set2d)(gradInput, i, 0, offset + 1);
+
+    if (offset >= 0 && offset < inDim) {
+      real val =
+          THBlas_(dot)(outDim,
+                       THTensor_(data)(gradOutput),
+                       gradOutput->stride[0],
+                       THTensor_(data)(weight) + offset * weight->stride[1],
+                       weight->stride[0]);
+      THTensor_(set2d)(gradInput, i, 1, val);
+    } else {
+      luaL_error(L, "index out of bound. updateGradInput: \
+%ld not between 1 and %ld", offset + 1, inDim);
+    }
+  }
+  return 0;
+}
+
 static const struct luaL_Reg nn_(SparseLinear__) [] = {
   {"SparseLinear_updateOutput", nn_(SparseLinear_updateOutput)},
   {"SparseLinear_accGradParameters", nn_(SparseLinear_accGradParameters)},
   {"SparseLinear_updateParameters", nn_(SparseLinear_updateParameters)},
+  {"SparseLinear_zeroGradParameters", nn_(SparseLinear_zeroGradParameters)},
+  {"SparseLinear_updateGradInput", nn_(SparseLinear_updateGradInput)},
   {NULL, NULL}
 };
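
The heart of the speedup in updateOutput is a shard-and-reduce accumulation: when shardBuffer (outputSize x nthreads) exists, each OpenMP thread axpy-accumulates its share of the non-zeros into its own column of the buffer, and one final sum over the columns plus the bias yields the output, so threads never write to the same vector. The remaining loops are parallelized directly, gated by if(outDim * nnz > ...) clauses so small problems stay single-threaded. A rough Lua analogue of the shard-and-reduce pattern (the names and the round-robin shard assignment are illustrative; in the C code the assignment falls out of OpenMP's static schedule):

   -- Shard-and-reduce, mirroring the C updateOutput fast path.
   local outDim, inDim, nShards = 256, 10000, 4
   local weight = torch.randn(outDim, inDim)
   local bias = torch.zeros(outDim)
   local input = torch.Tensor{{1, 0.1}, {500, 0.3}, {10000, 0.5}}

   local buffer = torch.zeros(outDim, nShards)   -- one column per shard
   for i = 1, input:size(1) do
      local shard = ((i - 1) % nShards) + 1      -- stand-in for omp_get_thread_num()
      local idx, val = input[{i, 1}], input[{i, 2}]
      -- axpy: buffer[:, shard] = buffer[:, shard] + val * weight[:, idx]
      buffer:select(2, shard):add(val, weight:select(2, idx))
   end
   -- A single reduction at the end replaces per-element synchronization.
   local output = buffer:sum(2):squeeze(2):add(bias)

Note that the directly parallelized gradient loops appear to rely on each index occurring at most once per input: two iterations sharing an index would update the same gradWeight column concurrently.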