Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/torch/nn.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Clement Farabet <clement.farabet@gmail.com> 2012-03-13 23:03:39 +0400
committer: Clement Farabet <clement.farabet@gmail.com> 2012-03-13 23:03:39 +0400
commit: fc3d560598bb4366ac6a8c3ad83e690870ae0cc7 (patch)
tree: f191bd4a0a15b06f05b8a927e3456346694b80c9 /generic
parent: e993b49c24e34b81851373e6a541d2f6a6febde0 (diff)
Trying to merge openmp into main libs.
Diffstat (limited to 'generic')
-rw-r--r--  generic/HardTanh.c               82
-rw-r--r--  generic/SpatialConvolution.c    180
-rw-r--r--  generic/SpatialConvolutionMap.c 140
-rw-r--r--  generic/SpatialMaxPooling.c      11
-rw-r--r--  generic/SpatialSubSampling.c    127
-rw-r--r--  generic/Sqrt.c                   57
-rw-r--r--  generic/Square.c                 57
-rw-r--r--  generic/Tanh.c                   57
8 files changed, 455 insertions, 256 deletions
diff --git a/generic/HardTanh.c b/generic/HardTanh.c
index 3764095..bfd1a42 100644
--- a/generic/HardTanh.c
+++ b/generic/HardTanh.c
@@ -8,14 +8,42 @@ static int nn_(HardTanh_updateOutput)(lua_State *L)
THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id));
THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ TH_TENSOR_APPLY2(real, output, real, input, \
+ if(*input_data < -1) \
+ *output_data = -1; \
+ else if(*input_data <= 1) \
+ *output_data = *input_data; \
+ else \
+ *output_data = 1;);
+ }
+ else
+ {
+ real* output_data = THTensor_(data)(output);
+ real* input_data = THTensor_(data)(input);
+ long k;
- TH_TENSOR_APPLY2(real, output, real, input, \
- if(*input_data < -1) \
- *output_data = -1; \
- else if(*input_data <= 1) \
- *output_data = *input_data; \
- else \
- *output_data = 1;)
+
+#pragma omp parallel for private(k)
+ for (k = 0; k < input->size[0]; k++)
+ {
+ real* ptr_output = output_data + k*input->stride[0];
+ real* ptr_input = input_data + k*input->stride[0];
+ long i;
+ for (i = 0; i < input->stride[0]; i++)
+ {
+ if(ptr_input[i] < -1)
+ ptr_output[i] = -1;
+ else if (ptr_input[i] <= 1)
+ ptr_output[i] = ptr_input[i];
+ else
+ ptr_output[i] = 1;
+ }
+ }
+ }
+
return 1;
}
@@ -26,11 +54,41 @@ static int nn_(HardTanh_updateGradInput)(lua_State *L)
THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_(Tensor_id));
THTensor_(resizeAs)(gradInput, input);
- TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \
- if(*input_data < -1 || *input_data > 1) \
- *gradInput_data = 0; \
- else \
- *gradInput_data = *gradOutput_data;);
+
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \
+ if(*input_data < -1 || *input_data > 1) \
+ *gradInput_data = 0; \
+ else \
+ *gradInput_data = *gradOutput_data;);
+ }
+ else
+ {
+ real* gradOutput_data = THTensor_(data)(gradOutput);
+ real* gradInput_data = THTensor_(data)(gradInput);
+ real* input_data = THTensor_(data)(input);
+ long k;
+
+#pragma omp parallel for private(k)
+ for (k = 0; k < input->size[0]; k++)
+ {
+ real* ptr_gradOutput = gradOutput_data + k*input->stride[0];
+ real* ptr_gradInput = gradInput_data + k*input->stride[0];
+ real* ptr_input = input_data + k*input->stride[0];
+ long i;
+ for (i = 0; i < input->stride[0]; i++)
+ {
+ if(ptr_input[i] < -1 || ptr_input[i] > 1)
+ ptr_gradInput[i] = 0;
+ else
+ ptr_gradInput[i] = ptr_gradOutput[i];
+ }
+ }
+ }
return 1;
}
diff --git a/generic/SpatialConvolution.c b/generic/SpatialConvolution.c
index e9f2a7b..49ccc8d 100644
--- a/generic/SpatialConvolution.c
+++ b/generic/SpatialConvolution.c
@@ -2,24 +2,9 @@
#define TH_GENERIC_FILE "generic/SpatialConvolution.c"
#else
-static void nn_(convolution_updateOutput_)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, int dH, int dW)
-{
- /* add bias */
- long i;
- THTensor *outn = THTensor_(new)();
- for (i=0; i<bias->size[0]; i++) {
- THTensor_(select)(outn,output,0,i);
- THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
- }
- THTensor_(free)(outn);
-
- /* do convolutions */
- THTensor_(conv2Dmv)(output, 1.0, 1.0, input, weight, dH, dW, "V","X");
-}
-
static int nn_(SpatialConvolution_updateOutput)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
int dW = luaT_getfieldcheckint(L, 1, "dW");
int dH = luaT_getfieldcheckint(L, 1, "dH");
@@ -47,49 +32,59 @@ static int nn_(SpatialConvolution_updateOutput)(lua_State *L)
if (input->nDimension == 3)
{
THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
-/* printf("\n*************\nstochastic\n"); */
-/* printf("no=%d\n",output->nDimension); */
-/* printf("no=%ld,%ld,%ld\n",nOutputPlane,outputHeight,outputWidth); */
-/* printf("ni=%d\n",input->nDimension); */
- nn_(convolution_updateOutput_)(input,output,weight,bias,dH,dW);
-/* printf("stochastic\n");*/
- }
- else
- {
- THTensor_(resize4d)(output, input->size[0], nOutputPlane, outputHeight, outputWidth);
- THTensor *outn = THTensor_(new)();
- THTensor *inpn = THTensor_(new)();
+ /* add bias */
long i;
- for (i=0; i<input->size[0]; i++)
+ /*THTensor *outn = THTensor_(new)();*/
+ real* bias_data = THTensor_(data)(bias);
+ real* output_data = THTensor_(data)(output);
+#pragma omp parallel for private(i)
+ for (i=0; i<bias->size[0]; i++)
{
- THTensor_(select)(outn,output,0,i);
- THTensor_(select)(inpn,input,0,i);
- nn_(convolution_updateOutput_)(inpn,outn,weight,bias,dH,dW);
+ /*THTensor_(select)(outn,output,0,i);*/
+ /*TH_TENSOR_APPLY(real,outn, *outn_data = bias_data[i];);*/
+ real *ptr_output = output_data + i*outputWidth*outputHeight;
+ long j;
+ for(j = 0; j < outputWidth*outputHeight; j++)
+ ptr_output[j] = bias_data[i];
}
- THTensor_(free)(outn);
- THTensor_(free)(inpn);
+ /*THTensor_(free)(outn);*/
+
+ /* do convolutions */
+ THTensor_(conv2Dmv)(output, 1.0, 1.0, input, weight, dH, dW, "V","X");
}
+ else
+ {
+ THTensor_(resize4d)(output, input->size[0], nOutputPlane, outputHeight, outputWidth);
-/* /\* add bias *\/ */
-/* long i; */
-/* THTensor *outn = THTensor_(new)(); */
-/* for (i=0; i<bias->size[0]; i++) { */
-/* THTensor_(select)(outn,output,0,i); */
-/* THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); */
-/* } */
-/* THTensor_(free)(outn); */
+ real* bias_data = THTensor_(data)(bias);
+ real* output_data = THTensor_(data)(output);
-/* /\* do convolutions *\/ */
-/* THTensor_(conv2Dmv)(output, 1.0, 1.0, input, weight, dH, dW, "vx"); */
+ long p;
+#pragma omp parallel for private(p)
+ for (p=0; p<input->size[0]; p++)
+ {
+ /* BIAS */
+ long i;
+ for (i=0; i<bias->size[0]; i++)
+ {
+ real *ptr_output = output_data + p*nOutputPlane*outputWidth*outputHeight + i*outputWidth*outputHeight;
+ long j;
+ for(j = 0; j < outputWidth*outputHeight; j++)
+ ptr_output[j] = bias_data[i];
+ }
+ }
+ /* do convolutions */
+ THTensor_(conv2Dmm)(output, 1.0, 1.0, input, weight, dH, dW, "V","X");
+ }
return 1;
}
static int nn_(SpatialConvolution_updateGradInput)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
- THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
int dW = luaT_getfieldcheckint(L, 1, "dW");
int dH = luaT_getfieldcheckint(L, 1, "dH");
int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
@@ -102,48 +97,18 @@ static int nn_(SpatialConvolution_updateGradInput)(lua_State *L)
/* gradient to input */
THTensor *tweight = THTensor_(newTranspose)(weight,0,1);
- if(input->nDimension == 3)
+ if (input->nDimension == 3)
{
- THTensor_(conv2Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dH, dW, "F", "C");
+ THTensor_(conv2Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dH, dW, "F","C");
}
else
{
-
- THTensor_(resizeAs)(gradInput,input);
- THTensor *outn = THTensor_(new)();
- THTensor *inpn = THTensor_(new)();
- long i;
- for (i=0; i<input->size[0]; i++)
- {
- THTensor_(select)(outn,gradOutput,0,i);
- THTensor_(select)(inpn,gradInput,0,i);
- THTensor_(conv2Dmv)(inpn, 0.0, 1.0, outn, tweight, dH, dW, "F", "C");
- }
- THTensor_(free)(outn);
- THTensor_(free)(inpn);
+ THTensor_(conv2Dmm)(gradInput, 0.0, 1.0, gradOutput, tweight, dH, dW, "F","C");
}
THTensor_(free)(tweight);
-
return 1;
}
-static void nn_(convolution_accGradParameters_)(THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, real scale, int dH, int dW)
-{
- long k;
-
- /* gradient to bias */
- real *gradBias_data = THTensor_(data)(gradBias);
- THTensor* gradOutSlice = THTensor_(new)();
- for(k = 0; k < gradOutput->size[0]; k++)
- {
- THTensor_(select)(gradOutSlice, gradOutput, 0, k);
- gradBias_data[k] += scale*THTensor_(sumall)(gradOutSlice);
- }
- THTensor_(free)(gradOutSlice);
-
- /* gradient to kernels */
- THTensor_(conv2DRevger)(gradWeight, 1.0, scale, input, gradOutput, dH, dW);
-}
static int nn_(SpatialConvolution_accGradParameters)(lua_State *L)
{
@@ -156,28 +121,59 @@ static int nn_(SpatialConvolution_accGradParameters)(lua_State *L)
THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_(Tensor_id));
THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_(Tensor_id));
-
+
THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" );
- if(input->nDimension == 3)
+ int dimw = 2;
+ int dimh = 1;
+
+ if (input->nDimension == 4)
+ {
+ dimw++;
+ dimh++;
+ }
+
+ /* gradient to bias */
+ real *gradBias_data = THTensor_(data)(gradBias);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ long noutSlice = gradOutput->size[dimh]*gradOutput->size[dimw];
+ /*THTensor* gradOutSlice = THTensor_(new)();*/
+
+ if (input->nDimension == 3)
{
- nn_(convolution_accGradParameters_)(input,gradOutput,gradWeight,gradBias,scale,dH,dW);
+ long k;
+#pragma omp parallel for private(k)
+ for(k = 0; k < nOutputPlane; k++)
+ {
+ /*THTensor_(select)(gradOutSlice, gradOutput, 0, k);*/
+ real *ptr_gradOutput = gradOutput_data + k*noutSlice;
+ long l;
+ for(l = 0; l < noutSlice; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
+
+ /* gradient to kernels */
+ THTensor_(conv2DRevger)(gradWeight, 1.0, scale, input, gradOutput, dH, dW);
}
else
{
- THTensor *outn = THTensor_(new)();
- THTensor *inpn = THTensor_(new)();
- long i;
- for (i=0; i<input->size[0]; i++)
+ long k;
+#pragma omp parallel for private(k)
+ for(k = 0; k < nOutputPlane; k++)
{
- THTensor_(select)(outn,gradOutput,0,i);
- THTensor_(select)(inpn,input,0,i);
- nn_(convolution_accGradParameters_)(inpn,outn,gradWeight,gradBias,scale,dH,dW);
+ long p;
+ for(p = 0; p < input->size[0]; p++)
+ {
+ /* BIAS */
+ real *ptr_gradOutput = gradOutput_data + p*nOutputPlane*noutSlice + k*noutSlice;
+ long l;
+ for(l = 0; l < noutSlice; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
}
- THTensor_(free)(outn);
- THTensor_(free)(inpn);
+ /* gradient to kernels */
+ THTensor_(conv2DRevgerm)(gradWeight, 1.0, scale, input, gradOutput, dH, dW);
}
-
return 0;
}
diff --git a/generic/SpatialConvolutionMap.c b/generic/SpatialConvolutionMap.c
index 2fa11c5..81117f4 100644
--- a/generic/SpatialConvolutionMap.c
+++ b/generic/SpatialConvolutionMap.c
@@ -4,7 +4,7 @@
static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
int kW = luaT_getfieldcheckint(L, 1, "kW");
int kH = luaT_getfieldcheckint(L, 1, "kH");
int dW = luaT_getfieldcheckint(L, 1, "dW");
@@ -22,7 +22,7 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L)
luaL_argcheck(L, input->size[2] >= kW && input->size[1] >= kH, 2, "input image smaller than kernel size");
THTensor_(resize3d)(output, nOutputPlane,
- (input->size[1] - kH) / dH + 1,
+ (input->size[1] - kH) / dH + 1,
(input->size[2] - kW) / dW + 1);
// contiguous
@@ -33,6 +33,8 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L)
real *input_data = THTensor_(data)(input);
real *output_data = THTensor_(data)(output);
real *weight_data = THTensor_(data)(weight);
+ real *bias_data = THTensor_(data)(bias);
+ real *connTable_data = THTensor_(data)(connTable);
// and dims
long input_h = input->size[1];
@@ -42,29 +44,32 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L)
long weight_h = weight->size[1];
long weight_w = weight->size[2];
- // add bias
- THTensor *outputPlane = THTensor_(new)();
- int k;
- for (k = 0; k < nOutputPlane; k++) {
- THTensor_(select)(outputPlane,output,0,k);
- THTensor_(fill)(outputPlane, THTensor_(get1d)(bias, k));
- }
- THTensor_(free)(outputPlane);
-
- // convolve all maps
- int i,o;
- int nweight = connTable->size[0];
- for (k = 0; k < nweight; k++) {
- // get offsets for input/output
- o = (int)THTensor_(get2d)(connTable,k,1)-1;
- i = (int)THTensor_(get2d)(connTable,k,0)-1;
-
- // convolve each map
- THTensor_(validXCorr2Dptr)(output_data + o*output_w*output_h,
- 1.0,
- input_data + i*input_w*input_h, input_h, input_w,
- weight_data + k*weight_w*weight_h, weight_h, weight_w,
- dH, dW);
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nOutputPlane; p++) {
+ // add bias
+ real *ptr_output = output_data + p*output_w*output_h;
+ long j;
+ for(j = 0; j < output_h*output_w; j++)
+ ptr_output[j] = bias_data[p];
+
+ // convolve all maps
+ int nweight = connTable->size[0];
+ long k;
+ for (k = 0; k < nweight; k++) {
+ // get offsets for input/output
+ int o = (int)connTable_data[k*2+1]-1;
+ int i = (int)connTable_data[k*2+0]-1;
+
+ if (o == p)
+ {
+ THTensor_(validXCorr2Dptr)(output_data + o*output_w*output_h,
+ 1.0,
+ input_data + i*input_w*input_h, input_h, input_w,
+ weight_data + k*weight_w*weight_h, weight_h, weight_w,
+ dH, dW);
+ }
+ }
}
// clean up
@@ -76,10 +81,11 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L)
static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
- THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
int dW = luaT_getfieldcheckint(L, 1, "dW");
int dH = luaT_getfieldcheckint(L, 1, "dH");
+ int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_(Tensor_id));
THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_(Tensor_id));
@@ -97,6 +103,7 @@ static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L)
real *gradInput_data = THTensor_(data)(gradInput);
real *gradOutput_data = THTensor_(data)(gradOutput);
real *weight_data = THTensor_(data)(weight);
+ real *connTable_data = THTensor_(data)(connTable);
// and dims
long input_h = input->size[1];
@@ -106,33 +113,40 @@ static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L)
long weight_h = weight->size[1];
long weight_w = weight->size[2];
- // updateGradInput all
- int k;
- int nkernel = connTable->size[0];
- for(k = 0; k < nkernel; k++)
- {
- int o = (int)THTensor_(get2d)(connTable,k,1)-1;
- int i = (int)THTensor_(get2d)(connTable,k,0)-1;
-
- // gradient to input
- THTensor_(fullConv2Dptr)(gradInput_data + i*input_w*input_h,
- 1.0,
- gradOutput_data + o*output_w*output_h, output_h, output_w,
- weight_data + k*weight_w*weight_h, weight_h, weight_w,
- dH, dW);
- }
+ long p;
+#pragma omp parallel for private(p)
+ for(p = 0; p < nInputPlane; p++)
+ {
+ long k;
+ // backward all
+ int nkernel = connTable->size[0];
+ for(k = 0; k < nkernel; k++)
+ {
+ int o = (int)connTable_data[k*2+1]-1;
+ int i = (int)connTable_data[k*2+0]-1;
+ if (i == p)
+ {
+ // gradient to input
+ THTensor_(fullConv2Dptr)(gradInput_data + i*input_w*input_h,
+ 1.0,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ weight_data + k*weight_w*weight_h, weight_h, weight_w,
+ dH, dW);
+ }
+ }
+ }
// clean up
THTensor_(free)(gradInput);
THTensor_(free)(gradOutput);
-
+
return 1;
}
static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
- THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
int dW = luaT_getfieldcheckint(L, 1, "dW");
int dH = luaT_getfieldcheckint(L, 1, "dH");
int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
@@ -151,6 +165,7 @@ static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L)
real *input_data = THTensor_(data)(input);
real *gradOutput_data = THTensor_(data)(gradOutput);
real *gradWeight_data = THTensor_(data)(gradWeight);
+ real *gradBias_data = THTensor_(data)(gradBias);
// and dims
long input_h = input->size[1];
@@ -161,29 +176,30 @@ static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L)
long weight_w = weight->size[2];
// gradients wrt bias
- int k;
- THTensor *gradOutputPlane = THTensor_(new)();
- real *gradBias_data = THTensor_(data)(gradBias);
+ long k;
+#pragma omp parallel for private(k)
for(k = 0; k < nOutputPlane; k++) {
- THTensor_(select)(gradOutputPlane, gradOutput, 0, k);
- gradBias_data[k] += scale * THTensor_(sumall)(gradOutputPlane);
+ real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
+ long l;
+ for(l = 0; l < output_h*output_w; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
}
- THTensor_(free)(gradOutputPlane);
// gradients wrt weight
int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
for(k = 0; k < nkernel; k++)
- {
- int o = (int)THTensor_(get2d)(connTable,k,1)-1;
- int i = (int)THTensor_(get2d)(connTable,k,0)-1;
-
- // gradient to kernel
- THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h,
- scale,
- input_data + i*input_w*input_h, input_h, input_w,
- gradOutput_data + o*output_w*output_h, output_h, output_w,
- dH, dW);
- }
+ {
+ int o = (int)THTensor_(get2d)(connTable,k,1)-1;
+ int i = (int)THTensor_(get2d)(connTable,k,0)-1;
+
+ // gradient to kernel
+ THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h,
+ scale,
+ input_data + i*input_w*input_h, input_h, input_w,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ dH, dW);
+ }
// clean up
THTensor_(free)(input);
diff --git a/generic/SpatialMaxPooling.c b/generic/SpatialMaxPooling.c
index 2620530..ce2cdbc 100644
--- a/generic/SpatialMaxPooling.c
+++ b/generic/SpatialMaxPooling.c
@@ -38,6 +38,7 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L)
// compute max pooling for each input slice
long k;
+#pragma omp parallel private(k)
for (k = 0; k < nslices; k++) {
// pointers to slices
real *input_p = input_data + k*iwidth*iheight;
@@ -56,9 +57,9 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L)
real *indxp = indx_p + i*owidth + j;
// compute local max:
- long maxindex = -1;
- real maxval = -THInf;
- long tcntr = 0;
+ long maxindex = -1;
+ real maxval = -THInf;
+ long tcntr = 0;
int x,y;
for(y = 0; y < kH; y++) {
for(x = 0; x < kW; x++) {
@@ -130,8 +131,8 @@ static int nn_(SpatialMaxPooling_updateGradInput)(lua_State *L)
for(i = 0; i < oheight; i++) {
for(j = 0; j < owidth; j++) {
// retrieve position of max
- long maxi = *(indy_p + i*owidth + j) - 1 + i*dH;
- long maxj = *(indx_p + i*owidth + j) - 1 + j*dW;
+ long maxi = *(indy_p + i*owidth + j) - 1 + i*dH;
+ long maxj = *(indx_p + i*owidth + j) - 1 + j*dW;
// update gradient
*(gradInput_p + maxi*iwidth + maxj) += *(gradOutput_p + i*owidth + j);
diff --git a/generic/SpatialSubSampling.c b/generic/SpatialSubSampling.c
index e5cbc3b..a1dde21 100644
--- a/generic/SpatialSubSampling.c
+++ b/generic/SpatialSubSampling.c
@@ -4,7 +4,7 @@
static int nn_(SpatialSubSampling_updateOutput)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
int kW = luaT_getfieldcheckint(L, 1, "kW");
int kH = luaT_getfieldcheckint(L, 1, "kH");
int dW = luaT_getfieldcheckint(L, 1, "dW");
@@ -20,12 +20,13 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L)
real *output_data;
real *input_data;
-
luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
int dimw = 2;
int dimh = 1;
+ long nbatch = 1;
if (input->nDimension == 4) {
+ nbatch = input->size[0];
dimw++;
dimh++;
}
@@ -35,51 +36,42 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L)
long outputWidth = (inputWidth - kW) / dW + 1;
long outputHeight = (inputHeight - kH) / dH + 1;
-
luaL_argcheck(L, input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
luaL_argcheck(L, inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
- input = THTensor_(newContiguous)(input);
- input_data = THTensor_(data)(input);
-
- long nbatch = 1;
- if (input->nDimension == 3)
- {
+ if (input->nDimension == 3)
THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
- }
else
- {
- nbatch = input->size[0];
- THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth);
- }
-
+ THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
-
- long i, k, p;
-
- for(p = 0; p < nbatch; p++)
+
+ long k;
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
{
- for(k = 0; k < nInputPlane; k++)
+ long p;
+ for(p = 0; p < nbatch; p++)
{
- real *ptr_output;
long xx, yy;
-
+ /* For all output pixels... */
+ real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
/* Get the good mask for (k,i) (k out, i in) */
real the_weight = weight_data[k];
-
/* Initialize to the bias */
real z = bias_data[k];
+ long i;
for(i = 0; i < outputWidth*outputHeight; i++)
- output_data[i] = z;
+ ptr_output[i] = z;
- /* For all output pixels... */
- ptr_output = output_data;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
- /* Compute the mean of the input image... */
- real *ptr_input = input_data+yy*dH*inputWidth+xx*dW;
+ // Compute the mean of the input image...
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
real sum = 0;
long kx, ky;
@@ -87,20 +79,14 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L)
{
for(kx = 0; kx < kW; kx++)
sum += ptr_input[kx];
- ptr_input += inputWidth; /* next input line */
+ ptr_input += inputWidth; // next input line
}
-
- /* Update output */
+ // Update output
*ptr_output++ += the_weight*sum;
}
}
-
- /* Next input/output plane */
- output_data += outputWidth*outputHeight;
- input_data += inputWidth*inputHeight;
}
}
-
THTensor_(free)(input);
return 1;
@@ -108,8 +94,8 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L)
static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
- THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
int kW = luaT_getfieldcheckint(L, 1, "kW");
int kH = luaT_getfieldcheckint(L, 1, "kH");
int dW = luaT_getfieldcheckint(L, 1, "dW");
@@ -118,14 +104,14 @@ static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L)
THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_(Tensor_id));
THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_(Tensor_id));
-
+
int dimw = 2;
int dimh = 1;
long nbatch = 1;
if (input->nDimension == 4) {
+ nbatch = input->size[0];
dimw++;
dimh++;
- nbatch = input->size[0];
}
long inputWidth = input->size[dimw];
@@ -135,41 +121,46 @@ static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L)
real *weight_data = THTensor_(data)(weight);
real *gradOutput_data = THTensor_(data)(gradOutput);
- real *gradInput_data;
+ real *input_data, *gradInput_data;
+
+ input_data = THTensor_(data)(input);
THTensor_(resizeAs)(gradInput, input);
- THTensor_(zero)(gradInput);
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
- long k, p;
-
- for(p = 0; p < nbatch; p++)
+ long k;
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
{
- for(k = 0; k < nInputPlane; k++)
+ long p;
+ for(p = 0; p < nbatch; p++)
{
real the_weight = weight_data[k];
- real *ptr_gradOutput = gradOutput_data;
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
long xx, yy;
-
+
+ real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+ long i;
+ for(i=0; i<inputWidth*inputHeight; i++)
+ ptr_gi[i] = 0.0;
+
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
- real *ptr_gradInput = gradInput_data+yy*dH*inputWidth+xx*dW;
+ real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
real z = *ptr_gradOutput++ * the_weight;
long kx, ky;
-
+
for(ky = 0; ky < kH; ky++)
{
for(kx = 0; kx < kW; kx++)
ptr_gradInput[kx] += z;
ptr_gradInput += inputWidth;
- }
+ }
}
}
- gradOutput_data += outputWidth*outputHeight;
- gradInput_data += inputWidth*inputHeight;
}
}
@@ -178,8 +169,8 @@ static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L)
static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L)
{
- THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
- THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
+ THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
+ THTensor *gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
real scale = luaL_optnumber(L, 4, 1);
int kW = luaT_getfieldcheckint(L, 1, "kW");
int kH = luaT_getfieldcheckint(L, 1, "kH");
@@ -189,10 +180,10 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L)
THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_(Tensor_id));
THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_(Tensor_id));
-
- int dimw = 2;
- int dimh = 1;
+
long nbatch = 1;
+ long dimw = 2;
+ long dimh = 1;
if (input->nDimension == 4) {
dimw++;
dimh++;
@@ -212,18 +203,21 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L)
input = THTensor_(newContiguous)(input);
input_data = THTensor_(data)(input);
- long i, k, p;
- for(p = 0; p < nbatch; p++)
+ long k;
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
{
- for(k = 0; k < nInputPlane; k++)
+ long p;
+ for(p = 0; p < nbatch; p++)
{
- real *ptr_gradOutput = gradOutput_data;
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
real sum;
long xx, yy;
sum = 0;
+ long i;
for(i = 0; i < outputWidth*outputHeight; i++)
- sum += gradOutput_data[i];
+ sum += ptr_gradOutput[i];
gradBias_data[k] += scale*sum;
sum = 0;
@@ -231,7 +225,7 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L)
{
for(xx = 0; xx < outputWidth; xx++)
{
- real *ptr_input = input_data+yy*dH*inputWidth+xx*dW;
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
real z = *ptr_gradOutput++;
long kx, ky;
@@ -240,16 +234,13 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L)
for(kx = 0; kx < kW; kx++)
sum += z * ptr_input[kx];
ptr_input += inputWidth;
- }
+ }
}
}
gradWeight_data[k] += scale*sum;
- gradOutput_data += outputWidth*outputHeight;
- input_data += inputWidth*inputHeight;
}
}
-
THTensor_(free)(input);
return 0;
diff --git a/generic/Sqrt.c b/generic/Sqrt.c
index 3e0c3d9..952c260 100644
--- a/generic/Sqrt.c
+++ b/generic/Sqrt.c
@@ -9,10 +9,30 @@ static int nn_(Sqrt_updateOutput)(lua_State *L)
THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id));
THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ TH_TENSOR_APPLY2(real, output, real, input, \
+ *output_data = sqrt(*input_data + bias););
+ }
+ else
+ {
+ real* output_data = THTensor_(data)(output);
+ real* input_data = THTensor_(data)(input);
+ long k;
- TH_TENSOR_APPLY2(real, output, real, input, \
- *output_data = sqrt(*input_data + bias););
-
+#pragma omp parallel for private(k)
+ for (k = 0; k < input->size[0]; k++)
+ {
+ real* ptr_output = output_data + k*input->stride[0];
+ real* ptr_input = input_data + k*input->stride[0];
+ long i;
+ for (i = 0; i < input->stride[0]; i++)
+ {
+ ptr_output[i] = sqrt(ptr_input[i] + bias);
+ }
+ }
+ }
return 1;
}
@@ -25,9 +45,34 @@ static int nn_(Sqrt_updateGradInput)(lua_State *L)
THTensor_(resizeAs)(gradInput, input);
- TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \
- *gradInput_data = 0.5 * (*gradOutput_data / *output_data););
-
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \
+ *gradInput_data = 0.5 * (*gradOutput_data / *output_data););
+ }
+ else
+ {
+ real* gradOutput_data = THTensor_(data)(gradOutput);
+ real* gradInput_data = THTensor_(data)(gradInput);
+ real* output_data = THTensor_(data)(output);
+ long k;
+
+#pragma omp parallel for private(k)
+ for (k = 0; k < output->size[0]; k++)
+ {
+ real* ptr_gradOutput = gradOutput_data + k*output->stride[0];
+ real* ptr_gradInput = gradInput_data + k*output->stride[0];
+ real* ptr_output = output_data + k*output->stride[0];
+ long i;
+ for (i = 0; i < output->stride[0]; i++)
+ {
+ ptr_gradInput[i] = 0.5 * (ptr_gradOutput[i] / ptr_output[i]);
+ }
+ }
+ }
return 1;
}
diff --git a/generic/Square.c b/generic/Square.c
index 409055d..97baee3 100644
--- a/generic/Square.c
+++ b/generic/Square.c
@@ -6,12 +6,32 @@ static int nn_(Square_updateOutput)(lua_State *L)
{
THTensor *input = luaT_checkudata(L, 2, torch_(Tensor_id));
THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id));
-
- THTensor_(resizeAs)(output, input);
- TH_TENSOR_APPLY2(real, output, real, input, \
- *output_data = *input_data * *input_data;);
+ THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ TH_TENSOR_APPLY2(real, output, real, input, \
+ *output_data = (*input_data) * (*input_data););
+ }
+ else
+ {
+ real* output_data = THTensor_(data)(output);
+ real* input_data = THTensor_(data)(input);
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < input->size[0]; k++)
+ {
+ real* ptr_output = output_data + k*input->stride[0];
+ real* ptr_input = input_data + k*input->stride[0];
+ long i;
+ for (i = 0; i < input->stride[0]; i++)
+ {
+ ptr_output[i] = ptr_input[i]*ptr_input[i];
+ }
+ }
+ }
return 1;
}
@@ -23,9 +43,34 @@ static int nn_(Square_updateGradInput)(lua_State *L)
THTensor_(resizeAs)(gradInput, input);
- TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \
- *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data););
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \
+ *gradInput_data = (*gradOutput_data) * (*input_data););
+ }
+ else
+ {
+ real* gradOutput_data = THTensor_(data)(gradOutput);
+ real* gradInput_data = THTensor_(data)(gradInput);
+ real* input_data = THTensor_(data)(input);
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < input->size[0]; k++)
+ {
+ real* ptr_gradOutput = gradOutput_data + k*input->stride[0];
+ real* ptr_gradInput = gradInput_data + k*input->stride[0];
+ real* ptr_input = input_data + k*input->stride[0];
+ long i;
+ for (i = 0; i < input->stride[0]; i++)
+ {
+ ptr_gradInput[i] = 2.0 * ptr_gradOutput[i] * ptr_input[i];
+ }
+ }
+ }
return 1;
}
diff --git a/generic/Tanh.c b/generic/Tanh.c
index 5c24d15..01e9bc0 100644
--- a/generic/Tanh.c
+++ b/generic/Tanh.c
@@ -9,9 +9,28 @@ static int nn_(Tanh_updateOutput)(lua_State *L)
THTensor_(resizeAs)(output, input);
- TH_TENSOR_APPLY2(real, output, real, input, \
- *output_data = tanh(*input_data);)
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ TH_TENSOR_APPLY2(real, output, real, input, \
+ *output_data = tanh(*input_data););
+ }
+ else
+ {
+ real* output_data = THTensor_(data)(output);
+ real* input_data = THTensor_(data)(input);
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < input->size[0]; k++)
+ {
+ real* ptr_output = output_data + k*input->stride[0];
+ real* ptr_input = input_data + k*input->stride[0];
+ long i;
+ for (i = 0; i < input->stride[0]; i++)
+ ptr_output[i] = tanh(ptr_input[i]);
+ }
+ }
+
return 1;
}
@@ -22,9 +41,37 @@ static int nn_(Tanh_updateGradInput)(lua_State *L)
THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_(Tensor_id));
THTensor_(resizeAs)(gradInput, output);
- TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \
- real z = *output_data; \
- *gradInput_data = *gradOutput_data * (1. - z*z););
+
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \
+ real z = *output_data; \
+ *gradInput_data = *gradOutput_data * (1. - z*z););
+ }
+ else
+ {
+ real* gradOutput_data = THTensor_(data)(gradOutput);
+ real* gradInput_data = THTensor_(data)(gradInput);
+ real* output_data = THTensor_(data)(output);
+ long k;
+
+#pragma omp parallel for private(k)
+ for (k = 0; k < output->size[0]; k++)
+ {
+ real* ptr_gradOutput = gradOutput_data + k*output->stride[0];
+ real* ptr_gradInput = gradInput_data + k*output->stride[0];
+ real* ptr_output = output_data + k*output->stride[0];
+ long i;
+ for (i = 0; i < output->stride[0]; i++)
+ {
+ real z = ptr_output[i];
+ ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z);
+ }
+ }
+ }
return 1;
}