diff options
author | Ronan Collobert <ronan@collobert.com> | 2012-09-26 21:55:50 +0400 |
---|---|---|
committer | Ronan Collobert <ronan@collobert.com> | 2012-09-26 21:55:50 +0400 |
commit | ada8bcb9c6251b54f2241ca0aa4b45742fb6768a (patch) | |
tree | 9df979b8f7c01eab6d27cf33bf8bda12d4db87bd /generic | |
parent | 9614cd41480f7d2c1382f33924ad168c32b03828 (diff) |
surfing with the gflops
Diffstat (limited to 'generic')
-rw-r--r-- | generic/SpatialConvolutionMM.c | 326 |
1 files changed, 269 insertions, 57 deletions
diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c index f4d5d43..7ef02e7 100644 --- a/generic/SpatialConvolutionMM.c +++ b/generic/SpatialConvolutionMM.c @@ -2,6 +2,125 @@ #define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c" #else +extern void mkl_set_num_threads(int); + +static void nn_(unfolded_copy)(THTensor *finput, THTensor *input, + int kW, int kH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight) +{ + int nip; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(nip) + for(nip = 0; nip < nInputPlane; nip++) + { + int kw, kh, y, x; + for(kh = 0; kh < kH; kh++) + { + for(kw = 0; kw < kW; kw++) + { + real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); + real *src = input_data + nip*(inputHeight*inputWidth) + kh*inputWidth + kw; + for(y = 0; y < outputHeight; y++) + memcpy(dst+y*outputWidth, src+y*inputWidth, sizeof(real)*outputWidth); +// THBlas_(copy)(outputWidth, src+y*inputWidth, 1, dst+y*outputWidth, 1); +// for(x = 0; x < outputWidth; x++) +// (dst+y*outputWidth)[x] = (src+y*inputWidth)[x]; + } + } + } +} + +/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy2 */ +static void nn_(unfolded_acc)(THTensor *finput, THTensor *input, + int kW, int kH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight) +{ + int nip; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(nip) + for(nip = 0; nip < nInputPlane; nip++) + { + int kw, kh, y, x; + for(kh = 0; kh < kH; kh++) + { + for(kw = 0; kw < kW; kw++) + { + real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); + real *dst = input_data + nip*(inputHeight*inputWidth) + kh*inputWidth + kw; + for(y = 0; y < outputHeight; y++) + THVector_(add)(dst+y*inputWidth, src+y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */ + } + } + } +} + +static void nn_(unfolded_copy2)(THTensor *finput, THTensor *input, + int kW, int kH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight) +{ + long k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane*kH*kW; k++) + { + int nip = k / (kH*kW); + int rest = k % (kH*kW); + int kh = rest / kW; + int kw = rest % kW; + int y; + real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); + real *src = input_data + nip*(inputHeight*inputWidth) + kh*inputWidth + kw; + for(y = 0; y < outputHeight; y++) + memcpy(dst+y*outputWidth, src+y*inputWidth, sizeof(real)*outputWidth); +// THBlas_(copy)(outputWidth, src+y*inputWidth, 1, dst+y*outputWidth, 1); +// for(x = 0; x < outputWidth; x++) +// (dst+y*outputWidth)[x] = (src+y*inputWidth)[x]; + } +} + +static void nn_(SpatialConvolutionMM_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, + int kW, int kH, int dW, int dH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + long i; + +#if 1 + nn_(unfolded_copy2)(finput, input, kW, kH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); +#else + THTensor *unfoldedInput = THTensor_(new)(); + THTensor_(unfold)(unfoldedInput, input, 1, kH, dH); + THTensor_(unfold)(unfoldedInput, unfoldedInput, 2, kW, dW); + THTensor_(transpose)(unfoldedInput, unfoldedInput, 1, 3); + THTensor_(transpose)(unfoldedInput, unfoldedInput, 2, 4); + THTensor_(copy)(finput, unfoldedInput); + THTensor_(free)(unfoldedInput); +#endif + + THTensor *output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + nOutputPlane, -1, + outputHeight*outputWidth, -1); + + for(i = 0; i < nOutputPlane; i++) + THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) { THTensor *input = luaT_checkudata(L, 2, torch_Tensor); @@ -9,7 +128,6 @@ static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) int kH = luaT_getfieldcheckint(L, 1, "kH"); int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); - long i; THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); @@ -18,45 +136,93 @@ static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + int dimf = 0; int dimw = 2; int dimh = 1; if (input->nDimension == 4) { - THError("batch not yet supported"); + dimf++; + dimw++; + dimh++; } - long nInputPlane = input->size[0]; + long nInputPlane = input->size[dimf]; long inputWidth = input->size[dimw]; long inputHeight = input->size[dimh]; long nOutputPlane = weight->size[0]; long outputWidth = (inputWidth - kW) / dW + 1; long outputHeight = (inputHeight - kH) / dH + 1; - - THTensor *unfoldedInput = THTensor_(new)(); - THTensor_(unfold)(unfoldedInput, input, 1, kH, dH); - THTensor_(unfold)(unfoldedInput, unfoldedInput, 2, kW, dW); - THTensor_(transpose)(unfoldedInput, unfoldedInput, 1, 3); - THTensor_(transpose)(unfoldedInput, unfoldedInput, 2, 4); - THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); - THTensor_(copy)(finput, unfoldedInput); - - THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); - THTensor *output2d = THTensor_(newWithStorage2d)(output->storage, 0, - nOutputPlane, -1, - outputHeight*outputWidth, -1); + if(input->nDimension == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + nn_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput, + kW, kH, dW, dH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + + THStorage_(clearFlag)(input->storage, TH_STORAGE_REFCOUNTED); + THStorage_(clearFlag)(output->storage, TH_STORAGE_REFCOUNTED); + THStorage_(clearFlag)(finput->storage, TH_STORAGE_REFCOUNTED); +// mkl_set_num_threads(1); +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, + kW, kH, dW, dH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + THStorage_(setFlag)(input->storage, TH_STORAGE_REFCOUNTED); + THStorage_(setFlag)(output->storage, TH_STORAGE_REFCOUNTED); + THStorage_(setFlag)(finput->storage, TH_STORAGE_REFCOUNTED); + } +// mkl_set_num_threads(4); - for(i = 0; i < nOutputPlane; i++) - THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); + return 1; +} - THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); - THTensor_(free)(output2d); - THTensor_(free)(unfoldedInput); +static void nn_(SpatialConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, + int kW, int kH, int dW, int dH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); - return 1; + THTensor_(zero)(gradInput); +#if 1 + nn_(unfolded_acc)(fgradInput, gradInput, kW, kH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]); +#else + THTensor *unfoldedGradInput = THTensor_(new)(); + THTensor_(unfold)(unfoldedGradInput, gradInput, 1, kH, dH); + THTensor_(unfold)(unfoldedGradInput, unfoldedGradInput, 2, kW, dW); + THTensor_(transpose)(unfoldedGradInput, unfoldedGradInput, 1, 3); + THTensor_(transpose)(unfoldedGradInput, unfoldedGradInput, 2, 4); + THTensor_(cadd)(unfoldedGradInput, unfoldedGradInput, 1, fgradInput); + THTensor_(free)(unfoldedGradInput); +#endif } - static int nn_(SpatialConvolutionMM_updateGradInput)(lua_State *L) { THTensor *input = luaT_checkudata(L, 2, torch_Tensor); @@ -74,33 +240,77 @@ static int nn_(SpatialConvolutionMM_updateGradInput)(lua_State *L) THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); - THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, 0, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2], -1); - + THTensor_(resizeAs)(gradInput, input); THTensor_(resizeAs)(fgradInput, finput); - THTensor_(zero)(fgradInput); THTensor_(transpose)(weight, weight, 0, 1); - THTensor_(addmm)(fgradInput, 1, fgradInput, 1, weight, gradOutput2d); - THTensor_(transpose)(weight, weight, 0, 1); - - THTensor_(resizeAs)(gradInput, input); - THTensor_(zero)(gradInput); - THTensor *unfoldedGradInput = THTensor_(new)(); - THTensor_(unfold)(unfoldedGradInput, gradInput, 1, kH, dH); - THTensor_(unfold)(unfoldedGradInput, unfoldedGradInput, 2, kW, dW); - THTensor_(transpose)(unfoldedGradInput, unfoldedGradInput, 1, 3); - THTensor_(transpose)(unfoldedGradInput, unfoldedGradInput, 2, 4); + if(input->nDimension == 3) + { + nn_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, + kW, kH, dW, dH); + } + else + { + long T = input->size[0]; + long t; + + THStorage_(clearFlag)(gradInput->storage, TH_STORAGE_REFCOUNTED); + THStorage_(clearFlag)(gradOutput->storage, TH_STORAGE_REFCOUNTED); + THStorage_(clearFlag)(fgradInput->storage, TH_STORAGE_REFCOUNTED); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + nn_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, + kW, kH, dW, dH); - THTensor_(cadd)(unfoldedGradInput, unfoldedGradInput, 1, fgradInput); - - THTensor_(free)(unfoldedGradInput); - THTensor_(free)(gradOutput2d); + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + + THStorage_(setFlag)(gradInput->storage, TH_STORAGE_REFCOUNTED); + THStorage_(setFlag)(gradOutput->storage, TH_STORAGE_REFCOUNTED); + THStorage_(setFlag)(fgradInput->storage, TH_STORAGE_REFCOUNTED); + } + + THTensor_(transpose)(weight, weight, 0, 1); return 1; } +static void nn_(SpatialConvolutionMM_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, + real scale) +{ + long i; + + THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + THTensor_(transpose)(finput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput); + THTensor_(transpose)(finput, finput, 0, 1); + + THTensor *gradOutputPlane = THTensor_(new)(); + for(i = 0; i < gradBias->size[0]; i++) + { + long k; + real sum = 0; + real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum; +// THTensor_(select)(gradOutputPlane, gradOutput2d, 0, i); +// (gradBias->storage->data + gradBias->storageOffset)[i] += scale*THTensor_(sumall)(gradOutputPlane); + } + + THTensor_(free)(gradOutputPlane); + THTensor_(free)(gradOutput2d); +} static int nn_(SpatialConvolutionMM_accGradParameters)(lua_State *L) { @@ -108,7 +318,6 @@ static int nn_(SpatialConvolutionMM_accGradParameters)(lua_State *L) THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); real scale = luaL_optnumber(L, 4, 1); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - long i; THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); @@ -116,23 +325,26 @@ static int nn_(SpatialConvolutionMM_accGradParameters)(lua_State *L) THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); - THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, 0, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2], -1); - - THTensor_(transpose)(finput, finput, 0, 1); - THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput); - THTensor_(transpose)(finput, finput, 0, 1); - - THTensor *gradOutputPlane = THTensor_(new)(); - for(i = 0; i < gradBias->size[0]; i++) + if(input->nDimension == 3) { - THTensor_(select)(gradOutputPlane, gradOutput2d, 0, i); - gradBias->storage->data[i] += scale*THTensor_(sumall)(gradOutputPlane); + nn_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); } + else + { + long T = input->size[0]; + long t; - THTensor_(free)(gradOutputPlane); - THTensor_(free)(gradOutput2d); + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } return 0; } |