From fc3d560598bb4366ac6a8c3ad83e690870ae0cc7 Mon Sep 17 00:00:00 2001 From: Clement Farabet Date: Tue, 13 Mar 2012 15:03:39 -0400 Subject: Trying to merge openmp into main libs. --- generic/HardTanh.c | 82 +++++++++++++++--- generic/SpatialConvolution.c | 180 ++++++++++++++++++++-------------------- generic/SpatialConvolutionMap.c | 140 +++++++++++++++++-------------- generic/SpatialMaxPooling.c | 11 +-- generic/SpatialSubSampling.c | 127 +++++++++++++--------------- generic/Sqrt.c | 57 +++++++++++-- generic/Square.c | 57 +++++++++++-- generic/Tanh.c | 57 +++++++++++-- 8 files changed, 455 insertions(+), 256 deletions(-) (limited to 'generic') diff --git a/generic/HardTanh.c b/generic/HardTanh.c index 3764095..bfd1a42 100644 --- a/generic/HardTanh.c +++ b/generic/HardTanh.c @@ -8,14 +8,42 @@ static int nn_(HardTanh_updateOutput)(lua_State *L) THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id)); THTensor_(resizeAs)(output, input); + + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, \ + if(*input_data < -1) \ + *output_data = -1; \ + else if(*input_data <= 1) \ + *output_data = *input_data; \ + else \ + *output_data = 1;); + } + else + { + real* output_data = THTensor_(data)(output); + real* input_data = THTensor_(data)(input); + long k; - TH_TENSOR_APPLY2(real, output, real, input, \ - if(*input_data < -1) \ - *output_data = -1; \ - else if(*input_data <= 1) \ - *output_data = *input_data; \ - else \ - *output_data = 1;) + +#pragma omp parallel for private(k) + for (k = 0; k < input->size[0]; k++) + { + real* ptr_output = output_data + k*input->stride[0]; + real* ptr_input = input_data + k*input->stride[0]; + long i; + for (i = 0; i < input->stride[0]; i++) + { + if(ptr_input[i] < -1) + ptr_output[i] = -1; + else if (ptr_input[i] <= 1) + ptr_output[i] = ptr_input[i]; + else + ptr_output[i] = 1; + } + } + } + return 1; } @@ -26,11 +54,41 @@ 
static int nn_(HardTanh_updateGradInput)(lua_State *L) THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_(Tensor_id)); THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - if(*input_data < -1 || *input_data > 1) \ - *gradInput_data = 0; \ - else \ - *gradInput_data = *gradOutput_data;); + + if (input->nDimension == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + if(*input_data < -1 || *input_data > 1) \ + *gradInput_data = 0; \ + else \ + *gradInput_data = *gradOutput_data;); + } + else + { + real* gradOutput_data = THTensor_(data)(gradOutput); + real* gradInput_data = THTensor_(data)(gradInput); + real* input_data = THTensor_(data)(input); + long k; + +#pragma omp parallel for private(k) + for (k = 0; k < input->size[0]; k++) + { + real* ptr_gradOutput = gradOutput_data + k*input->stride[0]; + real* ptr_gradInput = gradInput_data + k*input->stride[0]; + real* ptr_input = input_data + k*input->stride[0]; + long i; + for (i = 0; i < input->stride[0]; i++) + { + if(ptr_input[i] < -1 || ptr_input[i] > 1) + ptr_gradInput[i] = 0; + else + ptr_gradInput[i] = ptr_gradOutput[i]; + } + } + } return 1; } diff --git a/generic/SpatialConvolution.c b/generic/SpatialConvolution.c index e9f2a7b..49ccc8d 100644 --- a/generic/SpatialConvolution.c +++ b/generic/SpatialConvolution.c @@ -2,24 +2,9 @@ #define TH_GENERIC_FILE "generic/SpatialConvolution.c" #else -static void nn_(convolution_updateOutput_)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, int dH, int dW) -{ - /* add bias */ - long i; - THTensor *outn = THTensor_(new)(); - for (i=0; i<bias->size[0]; i++) { - THTensor_(select)(outn,output,0,i); - THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); - } - THTensor_(free)(outn); - - /* do convolutions */ - THTensor_(conv2Dmv)(output, 1.0, 
1.0, input, weight, dH, dW, "V","X"); -} - static int nn_(SpatialConvolution_updateOutput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); + THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); @@ -47,49 +32,59 @@ static int nn_(SpatialConvolution_updateOutput)(lua_State *L) if (input->nDimension == 3) { THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); -/* printf("\n*************\nstochastic\n"); */ -/* printf("no=%d\n",output->nDimension); */ -/* printf("no=%ld,%ld,%ld\n",nOutputPlane,outputHeight,outputWidth); */ -/* printf("ni=%d\n",input->nDimension); */ - nn_(convolution_updateOutput_)(input,output,weight,bias,dH,dW); -/* printf("stochastic\n");*/ - } - else - { - THTensor_(resize4d)(output, input->size[0], nOutputPlane, outputHeight, outputWidth); - THTensor *outn = THTensor_(new)(); - THTensor *inpn = THTensor_(new)(); + /* add bias */ long i; - for (i=0; i<input->size[0]; i++) + /*THTensor *outn = THTensor_(new)();*/ + real* bias_data = THTensor_(data)(bias); + real* output_data = THTensor_(data)(output); +#pragma omp parallel for private(i) + for (i=0; i<bias->size[0]; i++) { - THTensor_(select)(outn,output,0,i); - THTensor_(select)(inpn,input,0,i); - nn_(convolution_updateOutput_)(inpn,outn,weight,bias,dH,dW); + /*THTensor_(select)(outn,output,0,i);*/ + /*TH_TENSOR_APPLY(real,outn, *outn_data = bias_data[i];);*/ + real *ptr_output = output_data + i*outputWidth*outputHeight; + long j; + for(j = 0; j < outputWidth*outputHeight; j++) + ptr_output[j] = bias_data[i]; } - THTensor_(free)(outn); - THTensor_(free)(inpn); + /*THTensor_(free)(outn);*/ + + /* do convolutions */ + THTensor_(conv2Dmv)(output, 1.0, 1.0, input, weight, dH, dW, "V","X"); } + else + { + THTensor_(resize4d)(output, input->size[0], nOutputPlane, outputHeight, outputWidth); -/* /\* add bias *\/ */ -/* long i; */ -/* THTensor *outn = THTensor_(new)(); */ -/* for 
(i=0; i<bias->size[0]; i++) { */ -/* THTensor_(select)(outn,output,0,i); */ -/* THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); */ -/* } */ -/* THTensor_(free)(outn); */ + real* bias_data = THTensor_(data)(bias); + real* output_data = THTensor_(data)(output); -/* /\* do convolutions *\/ */ -/* THTensor_(conv2Dmv)(output, 1.0, 1.0, input, weight, dH, dW, "vx"); */ + long p; +#pragma omp parallel for private(p) + for (p=0; p<input->size[0]; p++) + { + /* BIAS */ + long i; + for (i=0; i<bias->size[0]; i++) + { + real *ptr_output = output_data + p*nOutputPlane*outputWidth*outputHeight + i*outputWidth*outputHeight; + long j; + for(j = 0; j < outputWidth*outputHeight; j++) + ptr_output[j] = bias_data[i]; + } + } + /* do convolutions */ + THTensor_(conv2Dmm)(output, 1.0, 1.0, input, weight, dH, dW, "V","X"); + } return 1; } static int nn_(SpatialConvolution_updateGradInput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); + THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); @@ -102,48 +97,18 @@ static int nn_(SpatialConvolution_updateGradInput)(lua_State *L) /* gradient to input */ THTensor *tweight = THTensor_(newTranspose)(weight,0,1); - if(input->nDimension == 3) + if (input->nDimension == 3) { - THTensor_(conv2Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dH, dW, "F", "C"); + THTensor_(conv2Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dH, dW, "F","C"); } else { - - THTensor_(resizeAs)(gradInput,input); - THTensor *outn = THTensor_(new)(); - THTensor *inpn = THTensor_(new)(); - long i; - for (i=0; i<input->size[0]; i++) - { - THTensor_(select)(outn,gradOutput,0,i); - THTensor_(select)(inpn,gradInput,0,i); - THTensor_(conv2Dmv)(inpn, 0.0, 1.0, outn, tweight, dH, dW, "F", "C"); 
- } - THTensor_(free)(outn); - THTensor_(free)(inpn); + THTensor_(conv2Dmm)(gradInput, 0.0, 1.0, gradOutput, tweight, dH, dW, "F","C"); } THTensor_(free)(tweight); - return 1; } -static void nn_(convolution_accGradParameters_)(THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, real scale, int dH, int dW) -{ - long k; - - /* gradient to bias */ - real *gradBias_data = THTensor_(data)(gradBias); - THTensor* gradOutSlice = THTensor_(new)(); - for(k = 0; k < gradOutput->size[0]; k++) - { - THTensor_(select)(gradOutSlice, gradOutput, 0, k); - gradBias_data[k] += scale*THTensor_(sumall)(gradOutSlice); - } - THTensor_(free)(gradOutSlice); - - /* gradient to kernels */ - THTensor_(conv2DRevger)(gradWeight, 1.0, scale, input, gradOutput, dH, dW); -} static int nn_(SpatialConvolution_accGradParameters)(lua_State *L) { @@ -156,28 +121,59 @@ static int nn_(SpatialConvolution_accGradParameters)(lua_State *L) THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_(Tensor_id)); THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_(Tensor_id)); - + THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); - if(input->nDimension == 3) + int dimw = 2; + int dimh = 1; + + if (input->nDimension == 4) + { + dimw++; + dimh++; + } + + /* gradient to bias */ + real *gradBias_data = THTensor_(data)(gradBias); + real *gradOutput_data = THTensor_(data)(gradOutput); + long noutSlice = gradOutput->size[dimh]*gradOutput->size[dimw]; + /*THTensor* gradOutSlice = THTensor_(new)();*/ + + if (input->nDimension == 3) { - nn_(convolution_accGradParameters_)(input,gradOutput,gradWeight,gradBias,scale,dH,dW); + long k; +#pragma omp parallel for private(k) + for(k = 0; k < nOutputPlane; k++) + { + /*THTensor_(select)(gradOutSlice, gradOutput, 0, k);*/ + real *ptr_gradOutput = gradOutput_data + k*noutSlice; + long l; + for(l = 0; l < noutSlice; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + + /* gradient to kernels */ + THTensor_(conv2DRevger)(gradWeight, 1.0, scale, input, gradOutput, dH, dW); } else { - THTensor *outn = THTensor_(new)(); - THTensor *inpn = THTensor_(new)(); - long i; - for (i=0; i<input->size[0]; i++) + long k; +#pragma omp parallel for private(k) + for(k = 0; k < nOutputPlane; k++) { - THTensor_(select)(outn,gradOutput,0,i); - THTensor_(select)(inpn,input,0,i); - nn_(convolution_accGradParameters_)(inpn,outn,gradWeight,gradBias,scale,dH,dW); + long p; + for(p = 0; p < input->size[0]; p++) + { + /* BIAS */ + real *ptr_gradOutput = gradOutput_data + p*nOutputPlane*noutSlice + k*noutSlice; + long l; + for(l = 0; l < noutSlice; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } } - THTensor_(free)(outn); - THTensor_(free)(inpn); + /* gradient to kernels */ + THTensor_(conv2DRevgerm)(gradWeight, 1.0, scale, input, gradOutput, dH, dW); } - return 0; } diff --git a/generic/SpatialConvolutionMap.c b/generic/SpatialConvolutionMap.c index 2fa11c5..81117f4 100644 --- a/generic/SpatialConvolutionMap.c +++ b/generic/SpatialConvolutionMap.c @@ -4,7 +4,7 @@ static int 
nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); + THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); int kW = luaT_getfieldcheckint(L, 1, "kW"); int kH = luaT_getfieldcheckint(L, 1, "kH"); int dW = luaT_getfieldcheckint(L, 1, "dW"); @@ -22,7 +22,7 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) luaL_argcheck(L, input->size[2] >= kW && input->size[1] >= kH, 2, "input image smaller than kernel size"); THTensor_(resize3d)(output, nOutputPlane, - (input->size[1] - kH) / dH + 1, + (input->size[1] - kH) / dH + 1, (input->size[2] - kW) / dW + 1); // contiguous @@ -33,6 +33,8 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) real *input_data = THTensor_(data)(input); real *output_data = THTensor_(data)(output); real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); // and dims long input_h = input->size[1]; @@ -42,29 +44,32 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) long weight_h = weight->size[1]; long weight_w = weight->size[2]; - // add bias - THTensor *outputPlane = THTensor_(new)(); - int k; - for (k = 0; k < nOutputPlane; k++) { - THTensor_(select)(outputPlane,output,0,k); - THTensor_(fill)(outputPlane, THTensor_(get1d)(bias, k)); - } - THTensor_(free)(outputPlane); - - // convolve all maps - int i,o; - int nweight = connTable->size[0]; - for (k = 0; k < nweight; k++) { - // get offsets for input/output - o = (int)THTensor_(get2d)(connTable,k,1)-1; - i = (int)THTensor_(get2d)(connTable,k,0)-1; - - // convolve each map - THTensor_(validXCorr2Dptr)(output_data + o*output_w*output_h, - 1.0, - input_data + i*input_w*input_h, input_h, input_w, - weight_data + k*weight_w*weight_h, weight_h, weight_w, - dH, dW); + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) { + // add bias + real *ptr_output = output_data + 
p*output_w*output_h; + long j; + for(j = 0; j < output_h*output_w; j++) + ptr_output[j] = bias_data[p]; + + // convolve all maps + int nweight = connTable->size[0]; + long k; + for (k = 0; k < nweight; k++) { + // get offsets for input/output + int o = (int)connTable_data[k*2+1]-1; + int i = (int)connTable_data[k*2+0]-1; + + if (o == p) + { + THTensor_(validXCorr2Dptr)(output_data + o*output_w*output_h, + 1.0, + input_data + i*input_w*input_h, input_h, input_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW); + } + } } // clean up @@ -76,10 +81,11 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); + THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_(Tensor_id)); THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_(Tensor_id)); @@ -97,6 +103,7 @@ static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L) real *gradInput_data = THTensor_(data)(gradInput); real *gradOutput_data = THTensor_(data)(gradOutput); real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); // and dims long input_h = input->size[1]; @@ -106,33 +113,40 @@ static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L) long weight_h = weight->size[1]; long weight_w = weight->size[2]; - // updateGradInput all - int k; - int nkernel = connTable->size[0]; - for(k = 0; k < nkernel; k++) - { - int o = (int)THTensor_(get2d)(connTable,k,1)-1; - int i = (int)THTensor_(get2d)(connTable,k,0)-1; - - // gradient to 
input - THTensor_(fullConv2Dptr)(gradInput_data + i*input_w*input_h, - 1.0, - gradOutput_data + o*output_w*output_h, output_h, output_w, - weight_data + k*weight_w*weight_h, weight_h, weight_w, - dH, dW); - } + long p; +#pragma omp parallel for private(p) + for(p = 0; p < nInputPlane; p++) + { + long k; + // backward all + int nkernel = connTable->size[0]; + for(k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1]-1; + int i = (int)connTable_data[k*2+0]-1; + if (i == p) + { + // gradient to input + THTensor_(fullConv2Dptr)(gradInput_data + i*input_w*input_h, + 1.0, + gradOutput_data + o*output_w*output_h, output_h, output_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW); + } + } + } // clean up THTensor_(free)(gradInput); THTensor_(free)(gradOutput); - + return 1; } static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); + THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); @@ -151,6 +165,7 @@ static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L) real *input_data = THTensor_(data)(input); real *gradOutput_data = THTensor_(data)(gradOutput); real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); // and dims long input_h = input->size[1]; @@ -161,29 +176,30 @@ static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L) long weight_w = weight->size[2]; // gradients wrt bias - int k; - THTensor *gradOutputPlane = THTensor_(new)(); - real *gradBias_data = THTensor_(data)(gradBias); + long k; +#pragma omp parallel for private(k) for(k = 0; k < nOutputPlane; k++) { - 
THTensor_(select)(gradOutputPlane, gradOutput, 0, k); - gradBias_data[k] += scale * THTensor_(sumall)(gradOutputPlane); + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h; + long l; + for(l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; } - THTensor_(free)(gradOutputPlane); // gradients wrt weight int nkernel = connTable->size[0]; +#pragma omp parallel for private(k) for(k = 0; k < nkernel; k++) - { - int o = (int)THTensor_(get2d)(connTable,k,1)-1; - int i = (int)THTensor_(get2d)(connTable,k,0)-1; - - // gradient to kernel - THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h, - scale, - input_data + i*input_w*input_h, input_h, input_w, - gradOutput_data + o*output_w*output_h, output_h, output_w, - dH, dW); - } + { + int o = (int)THTensor_(get2d)(connTable,k,1)-1; + int i = (int)THTensor_(get2d)(connTable,k,0)-1; + + // gradient to kernel + THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h, + scale, + input_data + i*input_w*input_h, input_h, input_w, + gradOutput_data + o*output_w*output_h, output_h, output_w, + dH, dW); + } // clean up THTensor_(free)(input); diff --git a/generic/SpatialMaxPooling.c b/generic/SpatialMaxPooling.c index 2620530..ce2cdbc 100644 --- a/generic/SpatialMaxPooling.c +++ b/generic/SpatialMaxPooling.c @@ -38,6 +38,7 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) // compute max pooling for each input slice long k; +#pragma omp parallel private(k) for (k = 0; k < nslices; k++) { // pointers to slices real *input_p = input_data + k*iwidth*iheight; @@ -56,9 +57,9 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) real *indxp = indx_p + i*owidth + j; // compute local max: - long maxindex = -1; - real maxval = -THInf; - long tcntr = 0; + long maxindex = -1; + real maxval = -THInf; + long tcntr = 0; int x,y; for(y = 0; y < kH; y++) { for(x = 0; x < kW; x++) { @@ -130,8 +131,8 @@ static int nn_(SpatialMaxPooling_updateGradInput)(lua_State 
*L) for(i = 0; i < oheight; i++) { for(j = 0; j < owidth; j++) { // retrieve position of max - long maxi = *(indy_p + i*owidth + j) - 1 + i*dH; - long maxj = *(indx_p + i*owidth + j) - 1 + j*dW; + long maxi = *(indy_p + i*owidth + j) - 1 + i*dH; + long maxj = *(indx_p + i*owidth + j) - 1 + j*dW; // update gradient *(gradInput_p + maxi*iwidth + maxj) += *(gradOutput_p + i*owidth + j); diff --git a/generic/SpatialSubSampling.c b/generic/SpatialSubSampling.c index e5cbc3b..a1dde21 100644 --- a/generic/SpatialSubSampling.c +++ b/generic/SpatialSubSampling.c @@ -4,7 +4,7 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); + THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); int kW = luaT_getfieldcheckint(L, 1, "kW"); int kH = luaT_getfieldcheckint(L, 1, "kH"); int dW = luaT_getfieldcheckint(L, 1, "dW"); @@ -20,12 +20,13 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) real *output_data; real *input_data; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); int dimw = 2; int dimh = 1; + long nbatch = 1; if (input->nDimension == 4) { + nbatch = input->size[0]; dimw++; dimh++; } @@ -35,51 +36,42 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) long outputWidth = (inputWidth - kW) / dW + 1; long outputHeight = (inputHeight - kH) / dH + 1; - luaL_argcheck(L, input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); luaL_argcheck(L, inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); - input = THTensor_(newContiguous)(input); - input_data = THTensor_(data)(input); - - long nbatch = 1; - if (input->nDimension == 3) - { + if (input->nDimension == 3) THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); - } else - { - nbatch = input->size[0]; - THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth); - } - + 
THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); output_data = THTensor_(data)(output); - - long i, k, p; - - for(p = 0; p < nbatch; p++) + + long k; +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) { - for(k = 0; k < nInputPlane; k++) + long p; + for(p = 0; p < nbatch; p++) { - real *ptr_output; long xx, yy; - + /* For all output pixels... */ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; /* Get the good mask for (k,i) (k out, i in) */ real the_weight = weight_data[k]; - /* Initialize to the bias */ real z = bias_data[k]; + long i; for(i = 0; i < outputWidth*outputHeight; i++) - output_data[i] = z; + ptr_output[i] = z; - /* For all output pixels... */ - ptr_output = output_data; for(yy = 0; yy < outputHeight; yy++) { for(xx = 0; xx < outputWidth; xx++) { - /* Compute the mean of the input image... */ - real *ptr_input = input_data+yy*dH*inputWidth+xx*dW; + // Compute the mean of the input image... 
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; real sum = 0; long kx, ky; @@ -87,20 +79,14 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) { for(kx = 0; kx < kW; kx++) sum += ptr_input[kx]; - ptr_input += inputWidth; /* next input line */ + ptr_input += inputWidth; // next input line } - - /* Update output */ + // Update output *ptr_output++ += the_weight*sum; } } - - /* Next input/output plane */ - output_data += outputWidth*outputHeight; - input_data += inputWidth*inputHeight; } } - THTensor_(free)(input); return 1; @@ -108,8 +94,8 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); + THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id)); int kW = luaT_getfieldcheckint(L, 1, "kW"); int kH = luaT_getfieldcheckint(L, 1, "kH"); int dW = luaT_getfieldcheckint(L, 1, "dW"); @@ -118,14 +104,14 @@ static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L) THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_(Tensor_id)); THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_(Tensor_id)); - + int dimw = 2; int dimh = 1; long nbatch = 1; if (input->nDimension == 4) { + nbatch = input->size[0]; dimw++; dimh++; - nbatch = input->size[0]; } long inputWidth = input->size[dimw]; @@ -135,41 +121,46 @@ static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L) real *weight_data = THTensor_(data)(weight); real *gradOutput_data = THTensor_(data)(gradOutput); - real *gradInput_data; + real *input_data, *gradInput_data; + + input_data = THTensor_(data)(input); THTensor_(resizeAs)(gradInput, input); - THTensor_(zero)(gradInput); gradInput_data = THTensor_(data)(gradInput); 
gradOutput_data = THTensor_(data)(gradOutput); - long k, p; - - for(p = 0; p < nbatch; p++) + long k; +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) { - for(k = 0; k < nInputPlane; k++) + long p; + for(p = 0; p < nbatch; p++) { real the_weight = weight_data[k]; - real *ptr_gradOutput = gradOutput_data; + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; long xx, yy; - + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + long i; + for(i=0; inDimension == 4) { dimw++; dimh++; @@ -212,18 +203,21 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L) input = THTensor_(newContiguous)(input); input_data = THTensor_(data)(input); - long i, k, p; - for(p = 0; p < nbatch; p++) + long k; +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) { - for(k = 0; k < nInputPlane; k++) + long p; + for(p = 0; p < nbatch; p++) { - real *ptr_gradOutput = gradOutput_data; + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; real sum; long xx, yy; sum = 0; + long i; for(i = 0; i < outputWidth*outputHeight; i++) - sum += gradOutput_data[i]; + sum += ptr_gradOutput[i]; gradBias_data[k] += scale*sum; sum = 0; @@ -231,7 +225,7 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L) { for(xx = 0; xx < outputWidth; xx++) { - real *ptr_input = input_data+yy*dH*inputWidth+xx*dW; + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; real z = *ptr_gradOutput++; long kx, ky; @@ -240,16 +234,13 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L) for(kx = 0; kx < kW; kx++) sum += z * ptr_input[kx]; ptr_input += inputWidth; - } + } } } gradWeight_data[k] += scale*sum; - gradOutput_data += outputWidth*outputHeight; - input_data += inputWidth*inputHeight; } } - 
THTensor_(free)(input); return 0; diff --git a/generic/Sqrt.c b/generic/Sqrt.c index 3e0c3d9..952c260 100644 --- a/generic/Sqrt.c +++ b/generic/Sqrt.c @@ -9,10 +9,30 @@ static int nn_(Sqrt_updateOutput)(lua_State *L) THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id)); THTensor_(resizeAs)(output, input); + + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = sqrt(*input_data + bias);); + } + else + { + real* output_data = THTensor_(data)(output); + real* input_data = THTensor_(data)(input); + long k; - TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = sqrt(*input_data + bias);); - +#pragma omp parallel for private(k) + for (k = 0; k < input->size[0]; k++) + { + real* ptr_output = output_data + k*input->stride[0]; + real* ptr_input = input_data + k*input->stride[0]; + long i; + for (i = 0; i < input->stride[0]; i++) + { + ptr_output[i] = sqrt(ptr_input[i] + bias); + } + } + } return 1; } @@ -25,9 +45,34 @@ static int nn_(Sqrt_updateGradInput)(lua_State *L) THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ - *gradInput_data = 0.5 * (*gradOutput_data / *output_data);); - + if (output->nDimension == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ + *gradInput_data = 0.5 * (*gradOutput_data / *output_data);); + } + else + { + real* gradOutput_data = THTensor_(data)(gradOutput); + real* gradInput_data = THTensor_(data)(gradInput); + real* output_data = THTensor_(data)(output); + long k; + +#pragma omp parallel for private(k) + for (k = 0; k < output->size[0]; k++) + { + real* ptr_gradOutput = gradOutput_data + k*output->stride[0]; + real* ptr_gradInput = gradInput_data + k*output->stride[0]; + real* ptr_output = 
output_data + k*output->stride[0]; + long i; + for (i = 0; i < output->stride[0]; i++) + { + ptr_gradInput[i] = 0.5 * (ptr_gradOutput[i] / ptr_output[i]); + } + } + } return 1; } diff --git a/generic/Square.c b/generic/Square.c index 409055d..97baee3 100644 --- a/generic/Square.c +++ b/generic/Square.c @@ -6,12 +6,32 @@ static int nn_(Square_updateOutput)(lua_State *L) { THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id)); THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id)); - - THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = *input_data * *input_data;); + THTensor_(resizeAs)(output, input); + + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data) * (*input_data);); + } + else + { + real* output_data = THTensor_(data)(output); + real* input_data = THTensor_(data)(input); + long k; +#pragma omp parallel for private(k) + for (k = 0; k < input->size[0]; k++) + { + real* ptr_output = output_data + k*input->stride[0]; + real* ptr_input = input_data + k*input->stride[0]; + long i; + for (i = 0; i < input->stride[0]; i++) + { + ptr_output[i] = ptr_input[i]*ptr_input[i]; + } + } + } return 1; } @@ -23,9 +43,34 @@ static int nn_(Square_updateGradInput)(lua_State *L) THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data);); + if (input->nDimension == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + *gradInput_data = (*gradOutput_data) * (*input_data);); + } + else + { + real* gradOutput_data = THTensor_(data)(gradOutput); + real* gradInput_data = THTensor_(data)(gradInput); + real* input_data = 
THTensor_(data)(input); + long k; +#pragma omp parallel for private(k) + for (k = 0; k < input->size[0]; k++) + { + real* ptr_gradOutput = gradOutput_data + k*input->stride[0]; + real* ptr_gradInput = gradInput_data + k*input->stride[0]; + real* ptr_input = input_data + k*input->stride[0]; + long i; + for (i = 0; i < input->stride[0]; i++) + { + ptr_gradInput[i] = 2.0 * ptr_gradOutput[i] * ptr_input[i]; + } + } + } return 1; } diff --git a/generic/Tanh.c b/generic/Tanh.c index 5c24d15..01e9bc0 100644 --- a/generic/Tanh.c +++ b/generic/Tanh.c @@ -9,9 +9,28 @@ static int nn_(Tanh_updateOutput)(lua_State *L) THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = tanh(*input_data);) + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = tanh(*input_data);); + } + else + { + real* output_data = THTensor_(data)(output); + real* input_data = THTensor_(data)(input); + long k; +#pragma omp parallel for private(k) + for (k = 0; k < input->size[0]; k++) + { + real* ptr_output = output_data + k*input->stride[0]; + real* ptr_input = input_data + k*input->stride[0]; + long i; + for (i = 0; i < input->stride[0]; i++) + ptr_output[i] = tanh(ptr_input[i]); + } + } + return 1; } @@ -22,9 +41,37 @@ static int nn_(Tanh_updateGradInput)(lua_State *L) THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_(Tensor_id)); THTensor_(resizeAs)(gradInput, output); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ - real z = *output_data; \ - *gradInput_data = *gradOutput_data * (1. - z*z);); + + if (output->nDimension == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ + real z = *output_data; \ + *gradInput_data = *gradOutput_data * (1. 
- z*z);); + } + else + { + real* gradOutput_data = THTensor_(data)(gradOutput); + real* gradInput_data = THTensor_(data)(gradInput); + real* output_data = THTensor_(data)(output); + long k; + +#pragma omp parallel for private(k) + for (k = 0; k < output->size[0]; k++) + { + real* ptr_gradOutput = gradOutput_data + k*output->stride[0]; + real* ptr_gradInput = gradInput_data + k*output->stride[0]; + real* ptr_output = output_data + k*output->stride[0]; + long i; + for (i = 0; i < output->stride[0]; i++) + { + real z = ptr_output[i]; + ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z); + } + } + } return 1; } -- cgit v1.2.3