1 files changed, 46 insertions, 0 deletions
diff --git a/lib/THCUNN/FusedRNNKernel.cu b/lib/THCUNN/FusedRNNKernel.cu
new file mode 100644
index 0000000..6a65d3e
--- /dev/null
+++ b/lib/THCUNN/FusedRNNKernel.cu
@@ -0,0 +1,46 @@
+#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCNumerics.cuh"
+#include <THC/THCApply.cuh>
+
+// Element-wise logistic sigmoid functor: 1 / (1 + exp(-x)).
+template <typename T>
+struct TensorSigmoidOp {
+  // Two-pointer form: reads *in and stores the sigmoid into *out.
+  __device__ __forceinline__ void operator()(T* out, T* in) const {
+    const T unit = (T) 1.0;
+    const T denom = unit + THCNumerics<T>::exp(- *in);
+    *out = unit / denom;
+  }
+  // One-pointer form: replaces *v with its sigmoid.
+  __device__ __forceinline__ void operator()(T* v) const { (*this)(v, v); }
+};
+
+#ifdef CUDA_HALF_TENSOR
+// half specialization of the sigmoid functor, 1 / (1 + exp(-x)).
+template <>
+struct TensorSigmoidOp<half> {
+  // Two-pointer form: reads *in and stores the sigmoid into *out.
+  __device__ __forceinline__ void operator()(half* out, half* in) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    // Evaluate directly with the half intrinsics.
+    const half unit = ScalarConvert<int, half>::to(1);
+    const half denom = __hadd(unit, hexp(__hneg(*in)));
+    *out = hdiv(unit, denom);
+#else
+    // CUDA_HALF_INSTRUCTIONS not defined: compute in float, convert back.
+    const float x = __half2float(*in);
+    *out = __float2half(1.0f / (1.0f + expf(- x)));
+#endif
+  }
+
+  // One-pointer form: replaces *v with its sigmoid.
+  __device__ __forceinline__ void operator()(half* v) const {
+    (*this)(v, v);
+  }
+};
+#endif
+
+#include "generic/FusedRNNKernel.cu"
+#include "THCGenerateFloatTypes.h"