1 files changed, 2 insertions, 2 deletions
diff --git a/lib/THCUNN/BatchNormalization.cu b/lib/THCUNN/BatchNormalization.cu
index 125e3ff..e6717c7 100644
--- a/lib/THCUNN/BatchNormalization.cu
+++ b/lib/THCUNN/BatchNormalization.cu
@@ -5,7 +5,7 @@
 
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
-
+#include "THCDeviceUtils.cuh"
 const int WARP_SIZE = 32;
 
 // The maximum number of threads in a block
@@ -80,7 +80,7 @@ template <typename T>
 static __device__ __forceinline__ T warpSum(T val) {
 #if __CUDA_ARCH__ >= 300
   for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
-    val += __shfl_xor(val, 1 << i, WARP_SIZE);
+    val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
   }
 #else
   __shared__ T values[MAX_BLOCK_SIZE];