KinectFusion big update: OpenCL support, etc. (#1798)
* KinFu demo: idle mode added * undistort added * KinFu demo: depth recorder added * TSDFVolume gets voxelSize, voxelSizeInv, truncDist members; decorative fixes * TSDFVolumeGPU::integrate(): host code compiles * TSDFVolume: truncDist fixed * TSDFVolume::integrate(): initial OCL version is done * TSDFVolume::integrate(): OCL: minor fixes * kinfu: small fixes * TSDFVolume::raycast(): initial GPU version is done * USE_INTRINSICS directive for centralized enable/disable opt. code * TSDF Volume supports 3 different sizes/resolutions on each dimension * TSDFVolume: serviceMembers moved to parent class * TSDFVolumeGPU: mem order changed, gave perf boost 4x * Frame: fixed UMat as InputArray; TSDF: comments, TODOs, minor fixes * Frame::getPointsNormals(); minors * FrameGPU: initial version (not working) * minor * FrameGPU: several fixes * added OCL "fast" options * ICP OCL: initial slow version is done (host-side reduce) * ICP OCL: reduce is done, buggy * KinFu Frame: more args fixes * ICP OCL: small fixes to kernel * ICP OCL reduce works * OCL code refactored * ICP OCL: less allocations, better speed * ICP OCL: pose matrix made arg * Render OCL: small fix * Demo: using UMats everywhere * TSDF integrate OCL: put const arg into kernel arg * Platform parameter partially removed, implementation choice is done through OCL availability check * Frame and FrameGenerator removed (other code is in following commits) * CPU render: 4b instead of 3b * ICP: no Frame class use, one class for both CPU and GPU code * demo: fix for UMat chain * TSDF: no Frame or FrameGenerator use * KinFu: no Frame or FrameGenerator, parametrized for Mat or UMat based on OCL availability * KinFu::setParams() removed since it has no effect anyway * TSDF::raycast OCL: fixed normals rendering * ScopeTime -> CV_TRACE * 3-dims resolution and size passed to API * fixed unexpected fails of ICP OCL * voxels made cubic again * args fixed a little * fixed volSize calculation * Tests: inequal, OCL, unified scene test function * kinfu_frame: input types fixed * fixed for Intel HD Graphics * KinFu demo: setUseOptimized instead of setUseOpenCL * tsdf: data types fixed * TSDF OCL: fetch normals implemented * roundDownPow2 -> utils.hpp * TSDF OCL: fetch points+normals implemented * TSDF OCL: NoSize, other fixes for kernel args * Frame OCL: HAVE_OPENCL, NoSize, other kernel args fixed * ICP OCL: HAVE_OPENCL, NoSize and other kernel fixes * KinFu demo fixes: volume size and too long delay * whitespace fix * TSDF: allowed sizes not divisable by 32 * TSDF: fixed type traits; added optimization TODOs * KinFu made non-free * minor fixes: cast and whitespace * fixed FastICP test * warnings fixed: implicit type conversions * OCL kernels: local args made through KernelArg::Local(lsz) call * MSVC warnings fixed * a workaround for broken OCV's bilateral * KinFu tests made a bit faster * build fixed until 3.4 isn't merged to master * warnings fixed, test time shortenedpull/1822/head
parent
2f014f6f40
commit
75bcd397af
20 changed files with 2906 additions and 794 deletions
@ -0,0 +1,239 @@ |
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html |
||||
|
||||
// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory |
||||
|
||||
#define UTSIZE 27 |
||||
|
||||
typedef float4 ptype; |
||||
|
||||
/* |
||||
Calculate an upper triangle of Ab matrix then reduce it across workgroup |
||||
*/ |
||||
|
||||
inline void calcAb7(__global const char * oldPointsptr, |
||||
int oldPoints_step, int oldPoints_offset, |
||||
__global const char * oldNormalsptr, |
||||
int oldNormals_step, int oldNormals_offset, |
||||
const int2 oldSize, |
||||
__global const char * newPointsptr, |
||||
int newPoints_step, int newPoints_offset, |
||||
__global const char * newNormalsptr, |
||||
int newNormals_step, int newNormals_offset, |
||||
const int2 newSize, |
||||
const float16 poseMatrix, |
||||
const float2 fxy, |
||||
const float2 cxy, |
||||
const float sqDistanceThresh, |
||||
const float minCos, |
||||
float* ab7 |
||||
) |
||||
{ |
||||
const int x = get_global_id(0); |
||||
const int y = get_global_id(1); |
||||
|
||||
if(x >= newSize.x || y >= newSize.y) |
||||
return; |
||||
|
||||
// coord-independent constants |
||||
|
||||
const float3 poseRot0 = poseMatrix.s012; |
||||
const float3 poseRot1 = poseMatrix.s456; |
||||
const float3 poseRot2 = poseMatrix.s89a; |
||||
const float3 poseTrans = poseMatrix.s37b; |
||||
|
||||
const float2 oldEdge = (float2)(oldSize.x - 1, oldSize.y - 1); |
||||
|
||||
__global const ptype* newPtsRow = (__global const ptype*)(newPointsptr + |
||||
newPoints_offset + |
||||
y*newPoints_step); |
||||
|
||||
__global const ptype* newNrmRow = (__global const ptype*)(newNormalsptr + |
||||
newNormals_offset + |
||||
y*newNormals_step); |
||||
|
||||
float3 newP = newPtsRow[x].xyz; |
||||
float3 newN = newNrmRow[x].xyz; |
||||
|
||||
if(any(isnan(newP)) || any(isnan(newN))) |
||||
return; |
||||
|
||||
//transform to old coord system |
||||
newP = (float3)(dot(newP, poseRot0), |
||||
dot(newP, poseRot1), |
||||
dot(newP, poseRot2)) + poseTrans; |
||||
newN = (float3)(dot(newN, poseRot0), |
||||
dot(newN, poseRot1), |
||||
dot(newN, poseRot2)); |
||||
|
||||
//find correspondence by projecting the point |
||||
float2 oldCoords = (newP.xy/newP.z)*fxy+cxy; |
||||
|
||||
if(!(all(oldCoords >= 0.f) && all(oldCoords < oldEdge))) |
||||
return; |
||||
|
||||
// bilinearly interpolate oldPts and oldNrm under oldCoords point |
||||
float3 oldP, oldN; |
||||
float2 ip = floor(oldCoords); |
||||
float2 t = oldCoords - ip; |
||||
int xi = ip.x, yi = ip.y; |
||||
|
||||
__global const ptype* prow0 = (__global const ptype*)(oldPointsptr + |
||||
oldPoints_offset + |
||||
(yi+0)*oldPoints_step); |
||||
__global const ptype* prow1 = (__global const ptype*)(oldPointsptr + |
||||
oldPoints_offset + |
||||
(yi+1)*oldPoints_step); |
||||
float3 p00 = prow0[xi+0].xyz; |
||||
float3 p01 = prow0[xi+1].xyz; |
||||
float3 p10 = prow1[xi+0].xyz; |
||||
float3 p11 = prow1[xi+1].xyz; |
||||
|
||||
// NaN check is done later |
||||
|
||||
__global const ptype* nrow0 = (__global const ptype*)(oldNormalsptr + |
||||
oldNormals_offset + |
||||
(yi+0)*oldNormals_step); |
||||
__global const ptype* nrow1 = (__global const ptype*)(oldNormalsptr + |
||||
oldNormals_offset + |
||||
(yi+1)*oldNormals_step); |
||||
|
||||
float3 n00 = nrow0[xi+0].xyz; |
||||
float3 n01 = nrow0[xi+1].xyz; |
||||
float3 n10 = nrow1[xi+0].xyz; |
||||
float3 n11 = nrow1[xi+1].xyz; |
||||
|
||||
// NaN check is done later |
||||
|
||||
float3 p0 = mix(p00, p01, t.x); |
||||
float3 p1 = mix(p10, p11, t.x); |
||||
oldP = mix(p0, p1, t.y); |
||||
|
||||
float3 n0 = mix(n00, n01, t.x); |
||||
float3 n1 = mix(n10, n11, t.x); |
||||
oldN = mix(n0, n1, t.y); |
||||
|
||||
if(any(isnan(oldP)) || any(isnan(oldN))) |
||||
return; |
||||
|
||||
//filter by distance |
||||
float3 diff = newP - oldP; |
||||
if(dot(diff, diff) > sqDistanceThresh) |
||||
return; |
||||
|
||||
//filter by angle |
||||
if(fabs(dot(newN, oldN)) < minCos) |
||||
return; |
||||
|
||||
// build point-wise vector ab = [ A | b ] |
||||
|
||||
float3 VxN = cross(newP, oldN); |
||||
float ab[7] = {VxN.x, VxN.y, VxN.z, oldN.x, oldN.y, oldN.z, -dot(oldN, diff)}; |
||||
|
||||
for(int i = 0; i < 7; i++) |
||||
ab7[i] = ab[i]; |
||||
} |
||||
|
||||
|
||||
__kernel void getAb(__global const char * oldPointsptr, |
||||
int oldPoints_step, int oldPoints_offset, |
||||
__global const char * oldNormalsptr, |
||||
int oldNormals_step, int oldNormals_offset, |
||||
const int2 oldSize, |
||||
__global const char * newPointsptr, |
||||
int newPoints_step, int newPoints_offset, |
||||
__global const char * newNormalsptr, |
||||
int newNormals_step, int newNormals_offset, |
||||
const int2 newSize, |
||||
const float16 poseMatrix, |
||||
const float2 fxy, |
||||
const float2 cxy, |
||||
const float sqDistanceThresh, |
||||
const float minCos, |
||||
__local float * reducebuf, |
||||
__global char* groupedSumptr, |
||||
int groupedSum_step, int groupedSum_offset |
||||
) |
||||
{ |
||||
const int x = get_global_id(0); |
||||
const int y = get_global_id(1); |
||||
|
||||
if(x >= newSize.x || y >= newSize.y) |
||||
return; |
||||
|
||||
const int gx = get_group_id(0); |
||||
const int gy = get_group_id(1); |
||||
const int gw = get_num_groups(0); |
||||
const int gh = get_num_groups(1); |
||||
|
||||
const int lx = get_local_id(0); |
||||
const int ly = get_local_id(1); |
||||
const int lw = get_local_size(0); |
||||
const int lh = get_local_size(1); |
||||
const int lsz = lw*lh; |
||||
const int lid = lx + ly*lw; |
||||
|
||||
float ab[7]; |
||||
for(int i = 0; i < 7; i++) |
||||
ab[i] = 0; |
||||
|
||||
calcAb7(oldPointsptr, |
||||
oldPoints_step, oldPoints_offset, |
||||
oldNormalsptr, |
||||
oldNormals_step, oldNormals_offset, |
||||
oldSize, |
||||
newPointsptr, |
||||
newPoints_step, newPoints_offset, |
||||
newNormalsptr, |
||||
newNormals_step, newNormals_offset, |
||||
newSize, |
||||
poseMatrix, |
||||
fxy, cxy, |
||||
sqDistanceThresh, |
||||
minCos, |
||||
ab); |
||||
|
||||
// build point-wise upper-triangle matrix [ab^T * ab] w/o last row |
||||
// which is [A^T*A | A^T*b] |
||||
// and gather sum |
||||
|
||||
__local float* upperTriangle = reducebuf + lid*UTSIZE; |
||||
|
||||
int pos = 0; |
||||
for(int i = 0; i < 6; i++) |
||||
{ |
||||
for(int j = i; j < 7; j++) |
||||
{ |
||||
upperTriangle[pos++] = ab[i]*ab[j]; |
||||
} |
||||
} |
||||
|
||||
// reduce upperTriangle to local mem |
||||
|
||||
// maxStep = ctz(lsz), ctz isn't supported on CUDA devices |
||||
const int c = clz(lsz & -lsz); |
||||
const int maxStep = c ? 31 - c : c; |
||||
for(int nstep = 1; nstep <= maxStep; nstep++) |
||||
{ |
||||
if(lid % (1 << nstep) == 0) |
||||
{ |
||||
__local float* rto = reducebuf + UTSIZE*lid; |
||||
__local float* rfrom = reducebuf + UTSIZE*(lid+(1 << (nstep-1))); |
||||
for(int i = 0; i < UTSIZE; i++) |
||||
rto[i] += rfrom[i]; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
// here group sum should be in reducebuf[0...UTSIZE] |
||||
if(lid == 0) |
||||
{ |
||||
__global float* groupedRow = (__global float*)(groupedSumptr + |
||||
groupedSum_offset + |
||||
gy*groupedSum_step); |
||||
|
||||
for(int i = 0; i < UTSIZE; i++) |
||||
groupedRow[gx*UTSIZE + i] = reducebuf[i]; |
||||
} |
||||
} |
@ -0,0 +1,287 @@ |
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html |
||||
|
||||
// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory |
||||
|
||||
inline float3 reproject(float3 p, float2 fxyinv, float2 cxy) |
||||
{ |
||||
float2 pp = p.z*(p.xy - cxy)*fxyinv; |
||||
return (float3)(pp, p.z); |
||||
} |
||||
|
||||
typedef float4 ptype; |
||||
|
||||
__kernel void computePointsNormals(__global char * pointsptr, |
||||
int points_step, int points_offset, |
||||
__global char * normalsptr, |
||||
int normals_step, int normals_offset, |
||||
__global const char * depthptr, |
||||
int depth_step, int depth_offset, |
||||
int depth_rows, int depth_cols, |
||||
const float2 fxyinv, |
||||
const float2 cxy, |
||||
const float dfac |
||||
) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if(x >= depth_cols || y >= depth_rows) |
||||
return; |
||||
|
||||
__global const float* row0 = (__global const float*)(depthptr + depth_offset + |
||||
(y+0)*depth_step); |
||||
__global const float* row1 = (__global const float*)(depthptr + depth_offset + |
||||
(y+1)*depth_step); |
||||
|
||||
float d00 = row0[x]; |
||||
float z00 = d00*dfac; |
||||
float3 p00 = (float3)(convert_float2((int2)(x, y)), z00); |
||||
float3 v00 = reproject(p00, fxyinv, cxy); |
||||
|
||||
float3 p = nan((uint)0), n = nan((uint)0); |
||||
|
||||
if(x < depth_cols - 1 && y < depth_rows - 1) |
||||
{ |
||||
float d01 = row0[x+1]; |
||||
float d10 = row1[x]; |
||||
|
||||
float z01 = d01*dfac; |
||||
float z10 = d10*dfac; |
||||
|
||||
if(z00 != 0 && z01 != 0 && z10 != 0) |
||||
{ |
||||
float3 p01 = (float3)(convert_float2((int2)(x+1, y+0)), z01); |
||||
float3 p10 = (float3)(convert_float2((int2)(x+0, y+1)), z10); |
||||
float3 v01 = reproject(p01, fxyinv, cxy); |
||||
float3 v10 = reproject(p10, fxyinv, cxy); |
||||
|
||||
float3 vec = cross(v01 - v00, v10 - v00); |
||||
n = - normalize(vec); |
||||
p = v00; |
||||
} |
||||
} |
||||
|
||||
__global float* pts = (__global float*)(pointsptr + points_offset + y*points_step + x*sizeof(ptype)); |
||||
__global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype)); |
||||
vstore4((ptype)(p, 0), 0, pts); |
||||
vstore4((ptype)(n, 0), 0, nrm); |
||||
} |
||||
|
||||
__kernel void pyrDownBilateral(__global const char * depthptr, |
||||
int depth_step, int depth_offset, |
||||
int depth_rows, int depth_cols, |
||||
__global char * depthDownptr, |
||||
int depthDown_step, int depthDown_offset, |
||||
int depthDown_rows, int depthDown_cols, |
||||
const float sigma |
||||
) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if(x >= depthDown_cols || y >= depthDown_rows) |
||||
return; |
||||
|
||||
const float sigma3 = sigma*3; |
||||
const int D = 5; |
||||
|
||||
__global const float* srcCenterRow = (__global const float*)(depthptr + depth_offset + |
||||
(2*y)*depth_step); |
||||
|
||||
float center = srcCenterRow[2*x]; |
||||
|
||||
int sx = max(0, 2*x - D/2), ex = min(2*x - D/2 + D, depth_cols-1); |
||||
int sy = max(0, 2*y - D/2), ey = min(2*y - D/2 + D, depth_rows-1); |
||||
|
||||
float sum = 0; |
||||
int count = 0; |
||||
|
||||
for(int iy = sy; iy < ey; iy++) |
||||
{ |
||||
__global const float* srcRow = (__global const float*)(depthptr + depth_offset + |
||||
(iy)*depth_step); |
||||
for(int ix = sx; ix < ex; ix++) |
||||
{ |
||||
float val = srcRow[ix]; |
||||
if(fabs(val - center) < sigma3) |
||||
{ |
||||
sum += val; count++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
__global float* downRow = (__global float*)(depthDownptr + depthDown_offset + |
||||
y*depthDown_step + x*sizeof(float)); |
||||
|
||||
*downRow = (count == 0) ? 0 : sum/convert_float(count); |
||||
} |
||||
|
||||
//TODO: remove bilateral when OpenCV performs 32f bilat with OpenCL |
||||
|
||||
__kernel void customBilateral(__global const char * srcptr, |
||||
int src_step, int src_offset, |
||||
__global char * dstptr, |
||||
int dst_step, int dst_offset, |
||||
const int2 frameSize, |
||||
const int kernelSize, |
||||
const float sigma_spatial2_inv_half, |
||||
const float sigma_depth2_inv_half |
||||
) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if(x >= frameSize.x || y >= frameSize.y) |
||||
return; |
||||
|
||||
__global const float* srcCenterRow = (__global const float*)(srcptr + src_offset + |
||||
y*src_step); |
||||
float value = srcCenterRow[x]; |
||||
|
||||
int tx = min (x - kernelSize / 2 + kernelSize, frameSize.x - 1); |
||||
int ty = min (y - kernelSize / 2 + kernelSize, frameSize.y - 1); |
||||
|
||||
float sum1 = 0; |
||||
float sum2 = 0; |
||||
|
||||
for (int cy = max (y - kernelSize / 2, 0); cy < ty; ++cy) |
||||
{ |
||||
__global const float* srcRow = (__global const float*)(srcptr + src_offset + |
||||
cy*src_step); |
||||
for (int cx = max (x - kernelSize / 2, 0); cx < tx; ++cx) |
||||
{ |
||||
float depth = srcRow[cx]; |
||||
|
||||
float space2 = convert_float((x - cx) * (x - cx) + (y - cy) * (y - cy)); |
||||
float color2 = (value - depth) * (value - depth); |
||||
|
||||
float weight = native_exp (-(space2 * sigma_spatial2_inv_half + |
||||
color2 * sigma_depth2_inv_half)); |
||||
|
||||
sum1 += depth * weight; |
||||
sum2 += weight; |
||||
} |
||||
} |
||||
|
||||
__global float* dst = (__global float*)(dstptr + dst_offset + |
||||
y*dst_step + x*sizeof(float)); |
||||
*dst = sum1/sum2; |
||||
} |
||||
|
||||
__kernel void pyrDownPointsNormals(__global const char * pptr, |
||||
int p_step, int p_offset, |
||||
__global const char * nptr, |
||||
int n_step, int n_offset, |
||||
__global char * pdownptr, |
||||
int pdown_step, int pdown_offset, |
||||
__global char * ndownptr, |
||||
int ndown_step, int ndown_offset, |
||||
const int2 downSize |
||||
) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if(x >= downSize.x || y >= downSize.y) |
||||
return; |
||||
|
||||
float3 point = nan((uint)0), normal = nan((uint)0); |
||||
|
||||
__global const ptype* pUpRow0 = (__global const ptype*)(pptr + p_offset + (2*y )*p_step); |
||||
__global const ptype* pUpRow1 = (__global const ptype*)(pptr + p_offset + (2*y+1)*p_step); |
||||
|
||||
float3 d00 = pUpRow0[2*x ].xyz; |
||||
float3 d01 = pUpRow0[2*x+1].xyz; |
||||
float3 d10 = pUpRow1[2*x ].xyz; |
||||
float3 d11 = pUpRow1[2*x+1].xyz; |
||||
|
||||
if(!(any(isnan(d00)) || any(isnan(d01)) || |
||||
any(isnan(d10)) || any(isnan(d11)))) |
||||
{ |
||||
point = (d00 + d01 + d10 + d11)*0.25f; |
||||
|
||||
__global const ptype* nUpRow0 = (__global const ptype*)(nptr + n_offset + (2*y )*n_step); |
||||
__global const ptype* nUpRow1 = (__global const ptype*)(nptr + n_offset + (2*y+1)*n_step); |
||||
|
||||
float3 n00 = nUpRow0[2*x ].xyz; |
||||
float3 n01 = nUpRow0[2*x+1].xyz; |
||||
float3 n10 = nUpRow1[2*x ].xyz; |
||||
float3 n11 = nUpRow1[2*x+1].xyz; |
||||
|
||||
normal = (n00 + n01 + n10 + n11)*0.25f; |
||||
} |
||||
|
||||
__global ptype* pts = (__global ptype*)(pdownptr + pdown_offset + y*pdown_step); |
||||
__global ptype* nrm = (__global ptype*)(ndownptr + ndown_offset + y*ndown_step); |
||||
pts[x] = (ptype)(point, 0); |
||||
nrm[x] = (ptype)(normal, 0); |
||||
} |
||||
|
||||
typedef char4 pixelType; |
||||
|
||||
// 20 is fixed power |
||||
float specPow20(float x) |
||||
{ |
||||
float x2 = x*x; |
||||
float x5 = x2*x2*x; |
||||
float x10 = x5*x5; |
||||
float x20 = x10*x10; |
||||
return x20; |
||||
} |
||||
|
||||
__kernel void render(__global const char * pointsptr, |
||||
int points_step, int points_offset, |
||||
__global const char * normalsptr, |
||||
int normals_step, int normals_offset, |
||||
__global char * imgptr, |
||||
int img_step, int img_offset, |
||||
const int2 frameSize, |
||||
const float4 lightPt |
||||
) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if(x >= frameSize.x || y >= frameSize.y) |
||||
return; |
||||
|
||||
__global const ptype* ptsRow = (__global const ptype*)(pointsptr + points_offset + y*points_step + x*sizeof(ptype)); |
||||
__global const ptype* nrmRow = (__global const ptype*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype)); |
||||
|
||||
float3 p = (*ptsRow).xyz; |
||||
float3 n = (*nrmRow).xyz; |
||||
|
||||
pixelType color; |
||||
|
||||
if(any(isnan(p))) |
||||
{ |
||||
color = (pixelType)(0, 32, 0, 0); |
||||
} |
||||
else |
||||
{ |
||||
const float Ka = 0.3f; //ambient coeff |
||||
const float Kd = 0.5f; //diffuse coeff |
||||
const float Ks = 0.2f; //specular coeff |
||||
//const int sp = 20; //specular power, fixed in specPow20() |
||||
|
||||
const float Ax = 1.f; //ambient color, can be RGB |
||||
const float Dx = 1.f; //diffuse color, can be RGB |
||||
const float Sx = 1.f; //specular color, can be RGB |
||||
const float Lx = 1.f; //light color |
||||
|
||||
float3 l = normalize(lightPt.xyz - p); |
||||
float3 v = normalize(-p); |
||||
float3 r = normalize(2.f*n*dot(n, l) - l); |
||||
|
||||
float val = (Ax*Ka*Dx + Lx*Kd*Dx*max(0.f, dot(n, l)) + |
||||
Lx*Ks*Sx*specPow20(max(0.f, dot(r, v)))); |
||||
|
||||
uchar ix = convert_uchar(val*255.f); |
||||
color = (pixelType)(ix, ix, ix, 0); |
||||
} |
||||
|
||||
__global char* imgRow = (__global char*)(imgptr + img_offset + y*img_step + x*sizeof(pixelType)); |
||||
vstore4(color, 0, imgRow); |
||||
} |
@ -0,0 +1,825 @@ |
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html |
||||
|
||||
// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory |
||||
|
||||
__kernel void integrate(__global const char * depthptr, |
||||
int depth_step, int depth_offset, |
||||
int depth_rows, int depth_cols, |
||||
__global float2 * volumeptr, |
||||
const float16 vol2camMatrix, |
||||
const float voxelSize, |
||||
const int4 volResolution4, |
||||
const int4 volDims4, |
||||
const float2 fxy, |
||||
const float2 cxy, |
||||
const float dfac, |
||||
const float truncDist, |
||||
const int maxWeight) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
const int3 volResolution = volResolution4.xyz; |
||||
|
||||
if(x >= volResolution.x || y >= volResolution.y) |
||||
return; |
||||
|
||||
// coord-independent constants |
||||
const int3 volDims = volDims4.xyz; |
||||
const float2 limits = (float2)(depth_cols-1, depth_rows-1); |
||||
|
||||
const float4 vol2cam0 = vol2camMatrix.s0123; |
||||
const float4 vol2cam1 = vol2camMatrix.s4567; |
||||
const float4 vol2cam2 = vol2camMatrix.s89ab; |
||||
|
||||
const float truncDistInv = 1.f/truncDist; |
||||
|
||||
// optimization of camSpace transformation (vector addition instead of matmul at each z) |
||||
float4 inPt = (float4)(x*voxelSize, y*voxelSize, 0, 1); |
||||
float3 basePt = (float3)(dot(vol2cam0, inPt), |
||||
dot(vol2cam1, inPt), |
||||
dot(vol2cam2, inPt)); |
||||
|
||||
float3 camSpacePt = basePt; |
||||
|
||||
// zStep == vol2cam*(float3(x, y, 1)*voxelSize) - basePt; |
||||
float3 zStep = ((float3)(vol2cam0.z, vol2cam1.z, vol2cam2.z))*voxelSize; |
||||
|
||||
int volYidx = x*volDims.x + y*volDims.y; |
||||
|
||||
int startZ, endZ; |
||||
if(fabs(zStep.z) > 1e-5) |
||||
{ |
||||
int baseZ = convert_int(-basePt.z / zStep.z); |
||||
if(zStep.z > 0) |
||||
{ |
||||
startZ = baseZ; |
||||
endZ = volResolution.z; |
||||
} |
||||
else |
||||
{ |
||||
startZ = 0; |
||||
endZ = baseZ; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
if(basePt.z > 0) |
||||
{ |
||||
startZ = 0; endZ = volResolution.z; |
||||
} |
||||
else |
||||
{ |
||||
// z loop shouldn't be performed |
||||
//startZ = endZ = 0; |
||||
return; |
||||
} |
||||
} |
||||
|
||||
startZ = max(0, startZ); |
||||
endZ = min(volResolution.z, endZ); |
||||
|
||||
for(int z = startZ; z < endZ; z++) |
||||
{ |
||||
// optimization of the following: |
||||
//float3 camSpacePt = vol2cam * ((float3)(x, y, z)*voxelSize); |
||||
camSpacePt += zStep; |
||||
|
||||
if(camSpacePt.z <= 0) |
||||
continue; |
||||
|
||||
float3 camPixVec = camSpacePt / camSpacePt.z; |
||||
float2 projected = mad(camPixVec.xy, fxy, cxy); |
||||
|
||||
float v; |
||||
// bilinearly interpolate depth at projected |
||||
if(all(projected >= 0) && all(projected < limits)) |
||||
{ |
||||
float2 ip = floor(projected); |
||||
int xi = ip.x, yi = ip.y; |
||||
|
||||
__global const float* row0 = (__global const float*)(depthptr + depth_offset + |
||||
(yi+0)*depth_step); |
||||
__global const float* row1 = (__global const float*)(depthptr + depth_offset + |
||||
(yi+1)*depth_step); |
||||
|
||||
float v00 = row0[xi+0]; |
||||
float v01 = row0[xi+1]; |
||||
float v10 = row1[xi+0]; |
||||
float v11 = row1[xi+1]; |
||||
float4 vv = (float4)(v00, v01, v10, v11); |
||||
|
||||
// assume correct depth is positive |
||||
if(all(vv > 0)) |
||||
{ |
||||
float2 t = projected - ip; |
||||
float2 vf = mix(vv.xz, vv.yw, t.x); |
||||
v = mix(vf.s0, vf.s1, t.y); |
||||
} |
||||
else |
||||
continue; |
||||
} |
||||
else |
||||
continue; |
||||
|
||||
if(v == 0) |
||||
continue; |
||||
|
||||
float pixNorm = length(camPixVec); |
||||
|
||||
// difference between distances of point and of surface to camera |
||||
float sdf = pixNorm*(v*dfac - camSpacePt.z); |
||||
// possible alternative is: |
||||
// float sdf = length(camSpacePt)*(v*dfac/camSpacePt.z - 1.0); |
||||
|
||||
if(sdf >= -truncDist) |
||||
{ |
||||
float tsdf = fmin(1.0f, sdf * truncDistInv); |
||||
int volIdx = volYidx + z*volDims.z; |
||||
|
||||
float2 voxel = volumeptr[volIdx]; |
||||
float value = voxel.s0; |
||||
int weight = as_int(voxel.s1); |
||||
|
||||
// update TSDF |
||||
value = (value*weight + tsdf) / (weight + 1); |
||||
weight = min(weight + 1, maxWeight); |
||||
|
||||
voxel.s0 = value; |
||||
voxel.s1 = as_float(weight); |
||||
volumeptr[volIdx] = voxel; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
inline float interpolateVoxel(float3 p, __global const float2* volumePtr, |
||||
int3 volDims, int8 neighbourCoords) |
||||
{ |
||||
float3 fip = floor(p); |
||||
int3 ip = convert_int3(fip); |
||||
float3 t = p - fip; |
||||
|
||||
int3 cmul = volDims*ip; |
||||
int coordBase = cmul.x + cmul.y + cmul.z; |
||||
int nco[8]; |
||||
vstore8(neighbourCoords + coordBase, 0, nco); |
||||
|
||||
float vaz[8]; |
||||
for(int i = 0; i < 8; i++) |
||||
vaz[i] = volumePtr[nco[i]].s0; |
||||
|
||||
float8 vz = vload8(0, vaz); |
||||
|
||||
float4 vy = mix(vz.s0246, vz.s1357, t.z); |
||||
float2 vx = mix(vy.s02, vy.s13, t.y); |
||||
return mix(vx.s0, vx.s1, t.x); |
||||
} |
||||
|
||||
inline float3 getNormalVoxel(float3 p, __global const float2* volumePtr, |
||||
int3 volResolution, int3 volDims, int8 neighbourCoords) |
||||
{ |
||||
if(any(p < 1) || any(p >= convert_float3(volResolution - 2))) |
||||
return nan((uint)0); |
||||
|
||||
float3 fip = floor(p); |
||||
int3 ip = convert_int3(fip); |
||||
float3 t = p - fip; |
||||
|
||||
int3 cmul = volDims*ip; |
||||
int coordBase = cmul.x + cmul.y + cmul.z; |
||||
int nco[8]; |
||||
vstore8(neighbourCoords + coordBase, 0, nco); |
||||
|
||||
int arDims[3]; |
||||
vstore3(volDims, 0, arDims); |
||||
float an[3]; |
||||
for(int c = 0; c < 3; c++) |
||||
{ |
||||
int dim = arDims[c]; |
||||
|
||||
float vaz[8]; |
||||
for(int i = 0; i < 8; i++) |
||||
vaz[i] = volumePtr[nco[i] + dim].s0 - |
||||
volumePtr[nco[i] - dim].s0; |
||||
|
||||
float8 vz = vload8(0, vaz); |
||||
|
||||
float4 vy = mix(vz.s0246, vz.s1357, t.z); |
||||
float2 vx = mix(vy.s02, vy.s13, t.y); |
||||
|
||||
an[c] = mix(vx.s0, vx.s1, t.x); |
||||
} |
||||
|
||||
//gradientDeltaFactor is fixed at 1.0 of voxel size |
||||
return fast_normalize(vload3(0, an)); |
||||
} |
||||
|
||||
typedef float4 ptype; |
||||
|
||||
__kernel void raycast(__global char * pointsptr, |
||||
int points_step, int points_offset, |
||||
__global char * normalsptr, |
||||
int normals_step, int normals_offset, |
||||
const int2 frameSize, |
||||
__global const float2 * volumeptr, |
||||
__global const float * vol2camptr, |
||||
__global const float * cam2volptr, |
||||
const float2 fixy, |
||||
const float2 cxy, |
||||
const float4 boxDown4, |
||||
const float4 boxUp4, |
||||
const float tstep, |
||||
const float voxelSize, |
||||
const int4 volResolution4, |
||||
const int4 volDims4, |
||||
const int8 neighbourCoords |
||||
) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if(x >= frameSize.x || y >= frameSize.y) |
||||
return; |
||||
|
||||
// coordinate-independent constants |
||||
|
||||
__global const float* cm = cam2volptr; |
||||
const float3 camRot0 = vload4(0, cm).xyz; |
||||
const float3 camRot1 = vload4(1, cm).xyz; |
||||
const float3 camRot2 = vload4(2, cm).xyz; |
||||
const float3 camTrans = (float3)(cm[3], cm[7], cm[11]); |
||||
|
||||
__global const float* vm = vol2camptr; |
||||
const float3 volRot0 = vload4(0, vm).xyz; |
||||
const float3 volRot1 = vload4(1, vm).xyz; |
||||
const float3 volRot2 = vload4(2, vm).xyz; |
||||
const float3 volTrans = (float3)(vm[3], vm[7], vm[11]); |
||||
|
||||
const float3 boxDown = boxDown4.xyz; |
||||
const float3 boxUp = boxUp4.xyz; |
||||
const int3 volDims = volDims4.xyz; |
||||
|
||||
const int3 volResolution = volResolution4.xyz; |
||||
|
||||
const float invVoxelSize = native_recip(voxelSize); |
||||
|
||||
// kernel itself |
||||
|
||||
float3 point = nan((uint)0); |
||||
float3 normal = nan((uint)0); |
||||
|
||||
float3 orig = camTrans; |
||||
|
||||
// get direction through pixel in volume space: |
||||
// 1. reproject (x, y) on projecting plane where z = 1.f |
||||
float3 planed = (float3)(((float2)(x, y) - cxy)*fixy, 1.f); |
||||
|
||||
// 2. rotate to volume space |
||||
planed = (float3)(dot(planed, camRot0), |
||||
dot(planed, camRot1), |
||||
dot(planed, camRot2)); |
||||
|
||||
// 3. normalize |
||||
float3 dir = fast_normalize(planed); |
||||
|
||||
// compute intersection of ray with all six bbox planes |
||||
float3 rayinv = native_recip(dir); |
||||
float3 tbottom = rayinv*(boxDown - orig); |
||||
float3 ttop = rayinv*(boxUp - orig); |
||||
|
||||
// re-order intersections to find smallest and largest on each axis |
||||
float3 minAx = min(ttop, tbottom); |
||||
float3 maxAx = max(ttop, tbottom); |
||||
|
||||
// near clipping plane |
||||
const float clip = 0.f; |
||||
float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip); |
||||
float tmax = min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z)); |
||||
|
||||
// precautions against getting coordinates out of bounds |
||||
tmin = tmin + tstep; |
||||
tmax = tmax - tstep; |
||||
|
||||
if(tmin < tmax) |
||||
{ |
||||
// interpolation optimized a little |
||||
orig *= invVoxelSize; |
||||
dir *= invVoxelSize; |
||||
|
||||
float3 rayStep = dir*tstep; |
||||
float3 next = (orig + dir*tmin); |
||||
float f = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); |
||||
float fnext = f; |
||||
|
||||
// raymarch |
||||
int steps = 0; |
||||
int nSteps = floor(native_divide(tmax - tmin, tstep)); |
||||
bool stop = false; |
||||
for(int i = 0; i < nSteps; i++) |
||||
{ |
||||
// fix for wrong steps counting |
||||
if(!stop) |
||||
{ |
||||
next += rayStep; |
||||
|
||||
// fetch voxel |
||||
int3 ip = convert_int3(round(next)); |
||||
int3 cmul = ip*volDims; |
||||
int idx = cmul.x + cmul.y + cmul.z; |
||||
fnext = volumeptr[idx].s0; |
||||
|
||||
if(fnext != f) |
||||
{ |
||||
fnext = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); |
||||
|
||||
// when ray crosses a surface |
||||
if(signbit(f) != signbit(fnext)) |
||||
{ |
||||
stop = true; continue; |
||||
} |
||||
|
||||
f = fnext; |
||||
} |
||||
steps++; |
||||
} |
||||
} |
||||
|
||||
// if ray penetrates a surface from outside |
||||
// linearly interpolate t between two f values |
||||
if(f > 0 && fnext < 0) |
||||
{ |
||||
float3 tp = next - rayStep; |
||||
float ft = interpolateVoxel(tp, volumeptr, volDims, neighbourCoords); |
||||
float ftdt = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); |
||||
// float t = tmin + steps*tstep; |
||||
// float ts = t - tstep*ft/(ftdt - ft); |
||||
float ts = tmin + tstep*(steps - native_divide(ft, ftdt - ft)); |
||||
|
||||
// avoid division by zero |
||||
if(!isnan(ts) && !isinf(ts)) |
||||
{ |
||||
float3 pv = orig + dir*ts; |
||||
float3 nv = getNormalVoxel(pv, volumeptr, volResolution, volDims, neighbourCoords); |
||||
|
||||
if(!any(isnan(nv))) |
||||
{ |
||||
//convert pv and nv to camera space |
||||
normal = (float3)(dot(nv, volRot0), |
||||
dot(nv, volRot1), |
||||
dot(nv, volRot2)); |
||||
// interpolation optimized a little |
||||
pv *= voxelSize; |
||||
point = (float3)(dot(pv, volRot0), |
||||
dot(pv, volRot1), |
||||
dot(pv, volRot2)) + volTrans; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
__global float* pts = (__global float*)(pointsptr + points_offset + y*points_step + x*sizeof(ptype)); |
||||
__global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype)); |
||||
vstore4((float4)(point, 0), 0, pts); |
||||
vstore4((float4)(normal, 0), 0, nrm); |
||||
} |
||||
|
||||
|
||||
__kernel void getNormals(__global const char * pointsptr, |
||||
int points_step, int points_offset, |
||||
__global char * normalsptr, |
||||
int normals_step, int normals_offset, |
||||
const int2 frameSize, |
||||
__global const float2* volumeptr, |
||||
__global const float * volPoseptr, |
||||
__global const float * invPoseptr, |
||||
const float voxelSizeInv, |
||||
const int4 volResolution4, |
||||
const int4 volDims4, |
||||
const int8 neighbourCoords |
||||
) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if(x >= frameSize.x || y >= frameSize.y) |
||||
return; |
||||
|
||||
// coordinate-independent constants |
||||
|
||||
__global const float* vp = volPoseptr; |
||||
const float3 volRot0 = vload4(0, vp).xyz; |
||||
const float3 volRot1 = vload4(1, vp).xyz; |
||||
const float3 volRot2 = vload4(2, vp).xyz; |
||||
const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); |
||||
|
||||
__global const float* iv = invPoseptr; |
||||
const float3 invRot0 = vload4(0, iv).xyz; |
||||
const float3 invRot1 = vload4(1, iv).xyz; |
||||
const float3 invRot2 = vload4(2, iv).xyz; |
||||
const float3 invTrans = (float3)(iv[3], iv[7], iv[11]); |
||||
|
||||
const int3 volResolution = volResolution4.xyz; |
||||
const int3 volDims = volDims4.xyz; |
||||
|
||||
// kernel itself |
||||
|
||||
__global const ptype* ptsRow = (__global const ptype*)(pointsptr + |
||||
points_offset + |
||||
y*points_step); |
||||
float3 p = ptsRow[x].xyz; |
||||
float3 n = nan((uint)0); |
||||
if(!any(isnan(p))) |
||||
{ |
||||
float3 voxPt = (float3)(dot(p, invRot0), |
||||
dot(p, invRot1), |
||||
dot(p, invRot2)) + invTrans; |
||||
voxPt = voxPt * voxelSizeInv; |
||||
n = getNormalVoxel(voxPt, volumeptr, volResolution, volDims, neighbourCoords); |
||||
n = (float3)(dot(n, volRot0), |
||||
dot(n, volRot1), |
||||
dot(n, volRot2)); |
||||
} |
||||
|
||||
__global float* nrm = (__global float*)(normalsptr + |
||||
normals_offset + |
||||
y*normals_step + |
||||
x*sizeof(ptype)); |
||||
|
||||
vstore4((float4)(n, 0), 0, nrm); |
||||
} |
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable |
||||
|
||||
struct CoordReturn |
||||
{ |
||||
bool result; |
||||
float3 point; |
||||
float3 normal; |
||||
}; |
||||
|
||||
inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axis, |
||||
__global const float2* volumeptr, |
||||
int3 volResolution, int3 volDims, |
||||
int8 neighbourCoords, |
||||
float voxelSize, float voxelSizeInv, |
||||
const float3 volRot0, |
||||
const float3 volRot1, |
||||
const float3 volRot2, |
||||
const float3 volTrans, |
||||
bool needNormals, |
||||
bool scan |
||||
) |
||||
{ |
||||
struct CoordReturn cr; |
||||
|
||||
// 0 for x, 1 for y, 2 for z |
||||
bool limits = false; |
||||
int3 shift; |
||||
float Vc = 0.f; |
||||
if(axis == 0) |
||||
{ |
||||
shift = (int3)(1, 0, 0); |
||||
limits = (x + 1 < volResolution.x); |
||||
Vc = V.x; |
||||
} |
||||
if(axis == 1) |
||||
{ |
||||
shift = (int3)(0, 1, 0); |
||||
limits = (y + 1 < volResolution.y); |
||||
Vc = V.y; |
||||
} |
||||
if(axis == 2) |
||||
{ |
||||
shift = (int3)(0, 0, 1); |
||||
limits = (z + 1 < volResolution.z); |
||||
Vc = V.z; |
||||
} |
||||
|
||||
if(limits) |
||||
{ |
||||
int3 ip = ((int3)(x, y, z)) + shift; |
||||
int3 cmul = ip*volDims; |
||||
int idx = cmul.x + cmul.y + cmul.z; |
||||
float2 voxel = volumeptr[idx].s0; |
||||
float vd = voxel.s0; |
||||
int weight = as_int(voxel.s1); |
||||
|
||||
if(weight != 0 && vd != 1.f) |
||||
{ |
||||
if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0)) |
||||
{ |
||||
// calc actual values or estimate amount of space |
||||
if(!scan) |
||||
{ |
||||
// linearly interpolate coordinate |
||||
float Vn = Vc + voxelSize; |
||||
float dinv = 1.f/(fabs(v0)+fabs(vd)); |
||||
float inter = (Vc*fabs(vd) + Vn*fabs(v0))*dinv; |
||||
|
||||
float3 p = (float3)(shift.x ? inter : V.x, |
||||
shift.y ? inter : V.y, |
||||
shift.z ? inter : V.z); |
||||
|
||||
cr.point = (float3)(dot(p, volRot0), |
||||
dot(p, volRot1), |
||||
dot(p, volRot2)) + volTrans; |
||||
|
||||
if(needNormals) |
||||
{ |
||||
float3 nv = getNormalVoxel(p * voxelSizeInv, |
||||
volumeptr, volResolution, volDims, neighbourCoords); |
||||
|
||||
cr.normal = (float3)(dot(nv, volRot0), |
||||
dot(nv, volRot1), |
||||
dot(nv, volRot2)); |
||||
} |
||||
} |
||||
|
||||
cr.result = true; |
||||
return cr; |
||||
} |
||||
} |
||||
} |
||||
|
||||
cr.result = false; |
||||
return cr; |
||||
} |
||||
|
||||
|
||||
__kernel void scanSize(__global const float2* volumeptr, |
||||
const int4 volResolution4, |
||||
const int4 volDims4, |
||||
const int8 neighbourCoords, |
||||
__global const float * volPoseptr, |
||||
const float voxelSize, |
||||
const float voxelSizeInv, |
||||
__local int* reducebuf, |
||||
__global char* groupedSumptr, |
||||
int groupedSum_slicestep, |
||||
int groupedSum_step, int groupedSum_offset |
||||
) |
||||
{ |
||||
const int3 volDims = volDims4.xyz; |
||||
const int3 volResolution = volResolution4.xyz; |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
int z = get_global_id(2); |
||||
|
||||
bool validVoxel = true; |
||||
if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z) |
||||
validVoxel = false; |
||||
|
||||
const int gx = get_group_id(0); |
||||
const int gy = get_group_id(1); |
||||
const int gz = get_group_id(2); |
||||
|
||||
const int lx = get_local_id(0); |
||||
const int ly = get_local_id(1); |
||||
const int lz = get_local_id(2); |
||||
const int lw = get_local_size(0); |
||||
const int lh = get_local_size(1); |
||||
const int ld = get_local_size(2); |
||||
const int lsz = lw*lh*ld; |
||||
const int lid = lx + ly*lw + lz*lw*lh; |
||||
|
||||
// coordinate-independent constants |
||||
|
||||
__global const float* vp = volPoseptr; |
||||
const float3 volRot0 = vload4(0, vp).xyz; |
||||
const float3 volRot1 = vload4(1, vp).xyz; |
||||
const float3 volRot2 = vload4(2, vp).xyz; |
||||
const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); |
||||
|
||||
// kernel itself |
||||
int npts = 0; |
||||
if(validVoxel) |
||||
{ |
||||
int3 ip = (int3)(x, y, z); |
||||
int3 cmul = ip*volDims; |
||||
int idx = cmul.x + cmul.y + cmul.z; |
||||
float2 voxel = volumeptr[idx].s0; |
||||
float value = voxel.s0; |
||||
int weight = as_int(voxel.s1); |
||||
|
||||
// if voxel is not empty |
||||
if(weight != 0 && value != 1.f) |
||||
{ |
||||
float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize; |
||||
|
||||
#pragma unroll |
||||
for(int i = 0; i < 3; i++) |
||||
{ |
||||
struct CoordReturn cr; |
||||
cr = coord(x, y, z, V, value, i, |
||||
volumeptr, volResolution, volDims, |
||||
neighbourCoords, |
||||
voxelSize, voxelSizeInv, |
||||
volRot0, volRot1, volRot2, volTrans, |
||||
false, true); |
||||
if(cr.result) |
||||
{ |
||||
npts++; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
// reducebuf keeps counters for each thread |
||||
reducebuf[lid] = npts; |
||||
|
||||
// reduce counter to local mem |
||||
|
||||
// maxStep = ctz(lsz), ctz isn't supported on CUDA devices |
||||
const int c = clz(lsz & -lsz); |
||||
const int maxStep = c ? 31 - c : c; |
||||
for(int nstep = 1; nstep <= maxStep; nstep++) |
||||
{ |
||||
if(lid % (1 << nstep) == 0) |
||||
{ |
||||
int rto = lid; |
||||
int rfrom = lid + (1 << (nstep-1)); |
||||
reducebuf[rto] += reducebuf[rfrom]; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if(lid == 0) |
||||
{ |
||||
__global int* groupedRow = (__global int*)(groupedSumptr + |
||||
groupedSum_offset + |
||||
gy*groupedSum_step + |
||||
gz*groupedSum_slicestep); |
||||
|
||||
groupedRow[gx] = reducebuf[0]; |
||||
} |
||||
} |
||||
|
||||
|
||||
__kernel void fillPtsNrm(__global const float2* volumeptr, |
||||
const int4 volResolution4, |
||||
const int4 volDims4, |
||||
const int8 neighbourCoords, |
||||
__global const float * volPoseptr, |
||||
const float voxelSize, |
||||
const float voxelSizeInv, |
||||
const int needNormals, |
||||
__local float* localbuf, |
||||
volatile __global int* atomicCtr, |
||||
__global const char* groupedSumptr, |
||||
int groupedSum_slicestep, |
||||
int groupedSum_step, int groupedSum_offset, |
||||
__global char * pointsptr, |
||||
int points_step, int points_offset, |
||||
__global char * normalsptr, |
||||
int normals_step, int normals_offset |
||||
) |
||||
{ |
||||
const int3 volDims = volDims4.xyz; |
||||
const int3 volResolution = volResolution4.xyz; |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
int z = get_global_id(2); |
||||
|
||||
bool validVoxel = true; |
||||
if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z) |
||||
validVoxel = false; |
||||
|
||||
const int gx = get_group_id(0); |
||||
const int gy = get_group_id(1); |
||||
const int gz = get_group_id(2); |
||||
|
||||
__global int* groupedRow = (__global int*)(groupedSumptr + |
||||
groupedSum_offset + |
||||
gy*groupedSum_step + |
||||
gz*groupedSum_slicestep); |
||||
|
||||
// this group contains 0 pts, skip it |
||||
int nptsGroup = groupedRow[gx]; |
||||
if(nptsGroup == 0) |
||||
return; |
||||
|
||||
const int lx = get_local_id(0); |
||||
const int ly = get_local_id(1); |
||||
const int lz = get_local_id(2); |
||||
const int lw = get_local_size(0); |
||||
const int lh = get_local_size(1); |
||||
const int ld = get_local_size(2); |
||||
const int lsz = lw*lh*ld; |
||||
const int lid = lx + ly*lw + lz*lw*lh; |
||||
|
||||
// coordinate-independent constants |
||||
|
||||
__global const float* vp = volPoseptr; |
||||
const float3 volRot0 = vload4(0, vp).xyz; |
||||
const float3 volRot1 = vload4(1, vp).xyz; |
||||
const float3 volRot2 = vload4(2, vp).xyz; |
||||
const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); |
||||
|
||||
// kernel itself |
||||
int npts = 0; |
||||
float3 parr[3], narr[3]; |
||||
if(validVoxel) |
||||
{ |
||||
int3 ip = (int3)(x, y, z); |
||||
int3 cmul = ip*volDims; |
||||
int idx = cmul.x + cmul.y + cmul.z; |
||||
float2 voxel = volumeptr[idx].s0; |
||||
float value = voxel.s0; |
||||
int weight = as_int(voxel.s1); |
||||
|
||||
// if voxel is not empty |
||||
if(weight != 0 && value != 1.f) |
||||
{ |
||||
float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize; |
||||
|
||||
#pragma unroll |
||||
for(int i = 0; i < 3; i++) |
||||
{ |
||||
struct CoordReturn cr; |
||||
cr = coord(x, y, z, V, value, i, |
||||
volumeptr, volResolution, volDims, |
||||
neighbourCoords, |
||||
voxelSize, voxelSizeInv, |
||||
volRot0, volRot1, volRot2, volTrans, |
||||
needNormals, false); |
||||
|
||||
if(cr.result) |
||||
{ |
||||
parr[npts] = cr.point; |
||||
narr[npts] = cr.normal; |
||||
npts++; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
// 4 floats per point or normal |
||||
const int elemStep = 4; |
||||
|
||||
__local float* normAddr; |
||||
__local int localCtr; |
||||
if(lid == 0) |
||||
localCtr = 0; |
||||
|
||||
// push all pts (and nrm) from private array to local mem |
||||
int privateCtr = 0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
privateCtr = atomic_add(&localCtr, npts); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
for(int i = 0; i < npts; i++) |
||||
{ |
||||
__local float* addr = localbuf + (privateCtr+i)*elemStep; |
||||
vstore4((float4)(parr[i], 0), 0, addr); |
||||
} |
||||
|
||||
if(needNormals) |
||||
{ |
||||
normAddr = localbuf + localCtr*elemStep; |
||||
|
||||
for(int i = 0; i < npts; i++) |
||||
{ |
||||
__local float* addr = normAddr + (privateCtr+i)*elemStep; |
||||
vstore4((float4)(narr[i], 0), 0, addr); |
||||
} |
||||
} |
||||
|
||||
// debugging purposes |
||||
if(lid == 0) |
||||
{ |
||||
if(localCtr != nptsGroup) |
||||
{ |
||||
printf("!!! fetchPointsNormals result may be incorrect, npts != localCtr at %3d %3d %3d: %3d vs %3d\n", |
||||
gx, gy, gz, localCtr, nptsGroup); |
||||
} |
||||
} |
||||
|
||||
// copy local buffer to global mem |
||||
__local int whereToWrite; |
||||
if(lid == 0) |
||||
whereToWrite = atomic_add(atomicCtr, localCtr); |
||||
barrier(CLK_GLOBAL_MEM_FENCE); |
||||
|
||||
event_t ev[2]; |
||||
int evn = 0; |
||||
// points and normals are 1-column matrices |
||||
__global float* pts = (__global float*)(pointsptr + |
||||
points_offset + |
||||
whereToWrite*points_step); |
||||
ev[evn++] = async_work_group_copy(pts, localbuf, localCtr*elemStep, 0); |
||||
|
||||
if(needNormals) |
||||
{ |
||||
__global float* nrm = (__global float*)(normalsptr + |
||||
normals_offset + |
||||
whereToWrite*normals_step); |
||||
ev[evn++] = async_work_group_copy(nrm, normAddr, localCtr*elemStep, 0); |
||||
} |
||||
|
||||
wait_group_events(evn, ev); |
||||
} |
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue