diff options
author | Nathan Luehr <nluehr@nvidia.com> | 2016-01-22 02:23:28 +0300 |
---|---|---|
committer | Przemek Tredak <ptredak@nvidia.com> | 2016-01-29 00:38:18 +0300 |
commit | fe1a9567155c8965425f884c552496ebd960ff42 (patch) | |
tree | 9bf29310fa93e9bba453fde89cf8e82649e0a4d4 | |
parent | c05312f1517bab34586bb74a37bb60dedf097d30 (diff) |
Enabled support for char type to be unsigned.
GCC on POWER arch defines char type as unsigned.
Change-Id: Ic143cb058fe42414b1f6f1f45b02132c837726ae
Reviewed-on: http://git-master/r/999614
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
-rw-r--r-- | src/reduce_kernel.h | 41 |
1 files changed, 30 insertions, 11 deletions
diff --git a/src/reduce_kernel.h b/src/reduce_kernel.h index a35dc30..741798d 100644 --- a/src/reduce_kernel.h +++ b/src/reduce_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,6 +31,7 @@ #define REDUCE_KERNEL_H_ #include "common_kernel.h" +#include <limits> template<typename T> struct FuncSum { @@ -135,14 +136,23 @@ struct FuncMax<char> { __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; - asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); + if (std::numeric_limits<char>::is_signed) + asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); + else + asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; #elif (__CUDA_ARCH__ >= 500) int32_t rv; - asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); + if (std::numeric_limits<char>::is_signed) + asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" + "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" + "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" + "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); + else + asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" + "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" + "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" + "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); return rv; #else converter cx, cy, cr; @@ -166,14 +176,23 @@ struct FuncMin<char> { __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; - asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); + if (std::numeric_limits<char>::is_signed) + asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); + else + asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; #elif (__CUDA_ARCH__ >= 500) int32_t rv; - asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); + if (std::numeric_limits<char>::is_signed) + asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" + "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" + "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" + "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); + else + asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" + "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" + "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" + "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); return rv; #else converter cx, cy, cr; |