diff --git a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp index 7bb1da751f..7a2799fa37 100644 --- a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp @@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v) { - return (uchar) ::max((int)v, 0); + uint res = 0; + int vi = v; + asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v) + { + uint res = 0; + asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v) { - return (uchar) ::min((uint)v, (uint)UCHAR_MAX); + uint res = 0; + asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v) { - return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); + uint res = 0; + asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v) { - return (uchar) ::min(v, (uint)UCHAR_MAX); + uint res = 0; + asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v) - { - return saturate_cast<uchar>((uint)v); - } - template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v) { - int iv = __float2int_rn(v); - return saturate_cast<uchar>(iv); + uint res = 0; + asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast<uchar>(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast<uchar>((float)v); #endif @@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v) { - return (schar) ::min((int)v, SCHAR_MAX); + uint res = 0; + uint vi = v; + asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v) + template<> __device__ __forceinline__ schar saturate_cast<schar>(short v) { - return (schar) ::min((uint)v, (uint)SCHAR_MAX); + uint res = 0; + asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast<schar>(int v) + template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v) { - return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? 
SCHAR_MAX : SCHAR_MIN); + uint res = 0; + asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast<schar>(short v) + template<> __device__ __forceinline__ schar saturate_cast<schar>(int v) { - return saturate_cast<schar>((int)v); + uint res = 0; + asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v) { - return (schar) ::min(v, (uint)SCHAR_MAX); + uint res = 0; + asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast<schar>(float v) { - int iv = __float2int_rn(v); - return saturate_cast<schar>(iv); + uint res = 0; + asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast<schar>(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast<schar>(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast<schar>((float)v); #endif @@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + int vi = v; + asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v) { - return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); + ushort res = 0; + asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v) { - return (ushort) ::min(v, (uint)USHRT_MAX); + ushort res = 0; + asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v) { - int iv = __float2int_rn(v); - return saturate_cast<ushort>(iv); + ushort res = 0; + asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast<ushort>(iv); + #if __CUDA_ARCH__ >= 130 + ushort res = 0; + asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast<ushort>((float)v); #endif @@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ short saturate_cast<short>(ushort v) { - return (short) ::min((int)v, SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast<short>(int v) { - return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? 
SHRT_MAX : SHRT_MIN); + short res = 0; + asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast<short>(uint v) { - return (short) ::min(v, (uint)SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast<short>(float v) { - int iv = __float2int_rn(v); - return saturate_cast<short>(iv); + short res = 0; + asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast<short>(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast<short>(iv); + #if __CUDA_ARCH__ >= 130 + short res = 0; + asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast<short>((float)v); #endif } + template<> __device__ __forceinline__ int saturate_cast<int>(uint v) + { + int res = 0; + asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); @@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device #endif } + template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v) + { + uint res = 0; + int vi = v; + asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast<uint>(short v) + { + uint res = 0; + asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast<uint>(int v) + { + uint res = 0; + asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ uint saturate_cast<uint>(float v) { return __float2uint_rn(v);