fixed cvtColorGPU on linux

pull/13383/head
Andrey Morozov 14 years ago
parent 2a0909acfd
commit fea66d9384
  1. 91
      modules/gpu/src/cuda/color.cu

@ -65,7 +65,7 @@ namespace imgproc
static __device__ unsigned char half() { return (unsigned char)(max()/2 + 1); }
};
template<> struct ColorChannel<ushort>
template<> struct ColorChannel<unsigned short>
{
typedef float worktype_f;
typedef ushort3 vec3_t;
@ -226,7 +226,7 @@ namespace cv { namespace gpu { namespace impl
RGB2RGB_caller(src, srccn, dst, dstcn, bidx, stream);
}
void RGB2RGB_gpu(const DevMem2D_<ushort>& src, int srccn, const DevMem2D_<ushort>& dst, int dstcn, int bidx, cudaStream_t stream)
void RGB2RGB_gpu(const DevMem2D_<unsigned short>& src, int srccn, const DevMem2D_<unsigned short>& dst, int dstcn, int bidx, cudaStream_t stream)
{
RGB2RGB_caller(src, srccn, dst, dstcn, bidx, stream);
}
@ -254,7 +254,7 @@ namespace cv { namespace gpu { namespace impl
// if( greenBits == 6 )
// for( int i = 0; i < n; i++, dst += dcn )
// {
// unsigned t = ((const ushort*)src)[i];
// unsigned t = ((const unsigned short*)src)[i];
// dst[bidx] = (uchar)(t << 3);
// dst[1] = (uchar)((t >> 3) & ~3);
// dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
@ -264,7 +264,7 @@ namespace cv { namespace gpu { namespace impl
// else
// for( int i = 0; i < n; i++, dst += dcn )
// {
// unsigned t = ((const ushort*)src)[i];
// unsigned t = ((const unsigned short*)src)[i];
// dst[bidx] = (uchar)(t << 3);
// dst[1] = (uchar)((t >> 2) & ~7);
// dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
@ -290,17 +290,17 @@ namespace cv { namespace gpu { namespace impl
// if( greenBits == 6 )
// for( int i = 0; i < n; i++, src += scn )
// {
// ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
// ((unsigned short*)dst)[i] = (unsigned short)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
// }
// else if( scn == 3 )
// for( int i = 0; i < n; i++, src += 3 )
// {
// ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
// ((unsigned short*)dst)[i] = (unsigned short)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
// }
// else
// for( int i = 0; i < n; i++, src += 4 )
// {
// ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
// ((unsigned short*)dst)[i] = (unsigned short)((src[bidx] >> 3)|((src[1]&~7) << 2)|
// ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
// }
// }
@ -364,13 +364,13 @@ namespace imgproc
// for( int i = 0; i < n; i++ )
// {
// int t = src[i];
// ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
// ((unsigned short*)dst)[i] = (unsigned short)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
// }
// else
// for( int i = 0; i < n; i++ )
// {
// int t = src[i] >> 3;
// ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
// ((unsigned short*)dst)[i] = (unsigned short)(t|(t << 5)|(t << 10));
// }
// }
// int greenBits;
@ -410,7 +410,7 @@ namespace cv { namespace gpu { namespace impl
Gray2RGB_caller(src, dst, dstcn, stream);
}
void Gray2RGB_gpu(const DevMem2D_<ushort>& src, const DevMem2D_<ushort>& dst, int dstcn, cudaStream_t stream)
void Gray2RGB_gpu(const DevMem2D_<unsigned short>& src, const DevMem2D_<unsigned short>& dst, int dstcn, cudaStream_t stream)
{
Gray2RGB_caller(src, dst, dstcn, stream);
}
@ -449,7 +449,7 @@ namespace imgproc
// if( greenBits == 6 )
// for( int i = 0; i < n; i++ )
// {
// int t = ((ushort*)src)[i];
// int t = ((unsigned short*)src)[i];
// dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
// ((t >> 3) & 0xfc)*G2Y +
// ((t >> 8) & 0xf8)*R2Y, yuv_shift);
@ -457,7 +457,7 @@ namespace imgproc
// else
// for( int i = 0; i < n; i++ )
// {
// int t = ((ushort*)src)[i];
// int t = ((unsigned short*)src)[i];
// dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
// ((t >> 2) & 0xf8)*G2Y +
// ((t >> 7) & 0xf8)*R2Y, yuv_shift);
@ -501,7 +501,7 @@ namespace imgproc
}
}
__global__ void RGB2Gray_3(const ushort* src_, size_t src_step, ushort* dst_, size_t dst_step, int rows, int cols, int bidx)
__global__ void RGB2Gray_3(const unsigned short* src_, size_t src_step, unsigned short* dst_, size_t dst_step, int rows, int cols, int bidx)
{
const int cr = 4899;
const int cg = 9617;
@ -513,16 +513,16 @@ namespace imgproc
if (y < rows && x < cols)
{
const ushort* src = src_ + y * src_step + x * 3;
const unsigned short* src = src_ + y * src_step + x * 3;
ushort t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
unsigned short t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
ushort2 dst;
dst.x = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
dst.x = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
src += 3;
t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
dst.y = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
dst.y = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
*(ushort2*)(dst_ + y * dst_step + x) = dst;
}
@ -581,7 +581,7 @@ namespace imgproc
}
}
__global__ void RGB2Gray_4(const ushort* src_, size_t src_step, ushort* dst_, size_t dst_step, int rows, int cols, int bidx)
__global__ void RGB2Gray_4(const unsigned short* src_, size_t src_step, unsigned short* dst_, size_t dst_step, int rows, int cols, int bidx)
{
const int cr = 4899;
const int cg = 9617;
@ -595,14 +595,14 @@ namespace imgproc
{
ushort4 src = *(ushort4*)(src_ + y * src_step + (x << 2));
ushort t0 = ((ushort*)(&src))[bidx], t1 = src.y, t2 = ((ushort*)(&src))[bidx ^ 2];
unsigned short t0 = ((unsigned short*)(&src))[bidx], t1 = src.y, t2 = ((unsigned short*)(&src))[bidx ^ 2];
ushort2 dst;
dst.x = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
dst.x = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
src = *(ushort4*)(src_ + y * src_step + (x << 2) + 4);
t0 = ((ushort*)(&src))[bidx], t1 = src.y, t2 = ((ushort*)(&src))[bidx ^ 2];
dst.y = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
t0 = ((unsigned short*)(&src))[bidx], t1 = src.y, t2 = ((unsigned short*)(&src))[bidx ^ 2];
dst.y = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
*(ushort2*)(dst_ + y * dst_step + x) = dst;
}
@ -654,7 +654,7 @@ namespace cv { namespace gpu { namespace impl
cudaSafeCall( cudaThreadSynchronize() );
}
void RGB2Gray_gpu(const DevMem2D_<ushort>& src, int srccn, const DevMem2D_<ushort>& dst, int bidx, cudaStream_t stream)
void RGB2Gray_gpu(const DevMem2D_<unsigned short>& src, int srccn, const DevMem2D_<unsigned short>& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@ -665,10 +665,10 @@ namespace cv { namespace gpu { namespace impl
switch (srccn)
{
case 3:
imgproc::RGB2Gray_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(ushort), dst.ptr, dst.step / sizeof(ushort), src.rows, src.cols, bidx);
imgproc::RGB2Gray_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(unsigned short), dst.ptr, dst.step / sizeof(unsigned short), src.rows, src.cols, bidx);
break;
case 4:
imgproc::RGB2Gray_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(ushort), dst.ptr, dst.step / sizeof(ushort), src.rows, src.cols, bidx);
imgproc::RGB2Gray_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(unsigned short), dst.ptr, dst.step / sizeof(unsigned short), src.rows, src.cols, bidx);
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
@ -1472,13 +1472,13 @@ namespace cv { namespace gpu { namespace impl
//static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
//static const float GammaTabScale = (float)GAMMA_TAB_SIZE;
//
//static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];
//static unsigned short sRGBGammaTab_b[256], linearGammaTab_b[256];
//#undef lab_shift
//#define lab_shift xyz_shift
//#define gamma_shift 3
//#define lab_shift2 (lab_shift + gamma_shift)
//#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
//static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
//static unsigned short LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
//
//static void initLabTabs()
//{
@ -1507,14 +1507,14 @@ namespace cv { namespace gpu { namespace impl
// for(i = 0; i < 256; i++)
// {
// float x = i*(1.f/255.f);
// sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4)));
// linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
// sRGBGammaTab_b[i] = saturate_cast<unsigned short>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4)));
// linearGammaTab_b[i] = (unsigned short)(i*(1 << gamma_shift));
// }
//
// for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
// {
// float x = i*(1.f/(255.f*(1 << gamma_shift)));
// LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
// LabCbrtTab_b[i] = saturate_cast<unsigned short>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
// }
// initialized = true;
// }
@ -1554,7 +1554,7 @@ namespace cv { namespace gpu { namespace impl
// {
// const int Lscale = (116*255+50)/100;
// const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
// const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
// const unsigned short* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
// int i, scn = srccn;
// int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
// C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
@ -2103,8 +2103,8 @@ namespace cv { namespace gpu { namespace impl
// const int brows = 3, bcn = 7;
// int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7;
// int i, bufstep = N7*bcn;
// cv::AutoBuffer<ushort> _buf(bufstep*brows);
// ushort* buf = (ushort*)_buf;
// cv::AutoBuffer<unsigned short> _buf(bufstep*brows);
// unsigned short* buf = (unsigned short*)_buf;
//
// bayer += bstep*2;
//
@ -2120,7 +2120,7 @@ namespace cv { namespace gpu { namespace impl
//
// for( int dy = (y == 2 ? -1 : 1); dy <= 1; dy++ )
// {
// ushort* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;
// unsigned short* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;
// srow = bayer + (y+dy)*bstep + 1;
//
// for( i = 0; i < bcn; i++ )
@ -2179,25 +2179,25 @@ namespace cv { namespace gpu { namespace impl
//
// for( ; i < N-1; i++, srow++, brow++ )
// {
// brow[0] = (ushort)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +
// brow[0] = (unsigned short)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +
// std::abs(srow[-bstep] - srow[+bstep])*2 +
// std::abs(srow[1-bstep] - srow[1+bstep]));
// brow[N] = (ushort)(std::abs(srow[-1-bstep] - srow[1-bstep]) +
// brow[N] = (unsigned short)(std::abs(srow[-1-bstep] - srow[1-bstep]) +
// std::abs(srow[-1] - srow[1])*2 +
// std::abs(srow[-1+bstep] - srow[1+bstep]));
// brow[N2] = (ushort)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);
// brow[N3] = (ushort)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);
// brow[N4] = (ushort)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +
// brow[N2] = (unsigned short)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);
// brow[N3] = (unsigned short)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);
// brow[N4] = (unsigned short)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +
// std::abs(srow[+bstep] - srow[1]));
// brow[N5] = (ushort)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +
// brow[N5] = (unsigned short)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +
// std::abs(srow[+bstep] - srow[-1]));
// brow[N6] = (ushort)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);
// brow[N6] = (unsigned short)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);
// }
// }
//
// const ushort* brow0 = buf + ((y - 2) % brows)*bufstep + 2;
// const ushort* brow1 = buf + ((y - 1) % brows)*bufstep + 2;
// const ushort* brow2 = buf + (y % brows)*bufstep + 2;
// const unsigned short* brow0 = buf + ((y - 2) % brows)*bufstep + 2;
// const unsigned short* brow1 = buf + ((y - 1) % brows)*bufstep + 2;
// const unsigned short* brow2 = buf + (y % brows)*bufstep + 2;
// static const float scale[] = { 0.f, 0.5f, 0.25f, 0.1666666666667f, 0.125f, 0.1f, 0.08333333333f, 0.0714286f, 0.0625f };
// srow = bayer + y*bstep + 2;
// bool greenCell = greenCell0;
@ -2380,8 +2380,7 @@ namespace cv { namespace gpu { namespace impl
// z = _mm_setzero_si128();
// __m128 _0_5 = _mm_set1_ps(0.5f);
//
// #define _mm_merge_epi16(a, b) \
// _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask))
// #define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask))
// #define _mm_cvtloepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))
// #define _mm_cvthiepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))
//

Loading…
Cancel
Save