Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan Luehr <nluehr@nvidia.com>2016-01-22 02:23:28 +0300
committerPrzemek Tredak <ptredak@nvidia.com>2016-01-29 00:38:18 +0300
commitfe1a9567155c8965425f884c552496ebd960ff42 (patch)
tree9bf29310fa93e9bba453fde89cf8e82649e0a4d4
parentc05312f1517bab34586bb74a37bb60dedf097d30 (diff)
Enabled support for char type to be unsigned.
GCC on POWER arch defines char type as unsigned. Change-Id: Ic143cb058fe42414b1f6f1f45b02132c837726ae Reviewed-on: http://git-master/r/999614 Reviewed-by: Przemek Tredak <ptredak@nvidia.com> Tested-by: Przemek Tredak <ptredak@nvidia.com>
-rw-r--r--src/reduce_kernel.h41
1 files changed, 30 insertions, 11 deletions
diff --git a/src/reduce_kernel.h b/src/reduce_kernel.h
index a35dc30..741798d 100644
--- a/src/reduce_kernel.h
+++ b/src/reduce_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,6 +31,7 @@
#define REDUCE_KERNEL_H_
#include "common_kernel.h"
+#include <limits>
template<typename T>
struct FuncSum {
@@ -135,14 +136,23 @@ struct FuncMax<char> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
- asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ if (std::numeric_limits<char>::is_signed)
+ asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ else
+ asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500)
int32_t rv;
- asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ if (std::numeric_limits<char>::is_signed)
+ asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
+ "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ else
+ asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
+ "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;
@@ -166,14 +176,23 @@ struct FuncMin<char> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
- asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ if (std::numeric_limits<char>::is_signed)
+ asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ else
+ asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#elif (__CUDA_ARCH__ >= 500)
int32_t rv;
- asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ if (std::numeric_limits<char>::is_signed)
+ asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
+ "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ else
+ asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
+ "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
return rv;
#else
converter cx, cy, cr;