Merge pull request #2698 from DumDereDum:new_HashTSDF_implementation

New HashTSDF implementation * create new variables * rewrite reset() * first valid version of new HasHTSDF * some warning fixes * create lambda raycast * reduce time raycast * minor fix * minor fix volDims * changed _atVolumeUnit, reduce memory consumption * delete older inmplemetation of atVolumeUnit * changes _at * AAA, I want to cry! * it works! * it works twice o_o * minor fix * new adding to volumes * delete volDims at strust VolumeUnit * new names of vars * rename one var * minor fix * new resize volumes * rename volUnitsMatrix * minor fix in at function * add tsdf_functions.hpp * minor fix * remove two args at _at function signature * solved the link problem with tsdf_functions * build fix * build fix 1 * build fix 2 * build fix 3 * build fix 4 * replace integrateVolumeUnit to tsdf_functions and fix 2 warnings * docs fix * remove extra args at atVolumeUnit signature * change frame params checking * move volStrides to CPU class * inline convertion functions in tsdf_functions * minor fix * add SIMD version of integrateVolumeUnit * fix something :) * docs fix * warning fix * add degub asserts * replace vars initialization with reset() * remove volDims var * new resize buffer * minor vars name fix * docs fix * warning fix * minor fix * minor fix 1 * remove dbg asserts Co-authored-by: arsaratovtsev <artem.saratovtsev@intel.com>
5 years ago · 0b5ded4637
parent 7022f4e3e0
commit 0b5ded4637
5 changed files with 674 additions and 345 deletions
--- a/modules/rgbd/src/hash_tsdf.cpp
+++ b/modules/rgbd/src/hash_tsdf.cpp
@ -17,34 +17,22 @@
 #include "utils.hpp"

 #define USE_INTERPOLATION_IN_GETNORMAL 1
-
+#define VOLUMES_SIZE 1024

 namespace cv
 {
 namespace kinfu
 {

-static inline TsdfType floatToTsdf(float num)
-{
-    //CV_Assert(-1 < num <= 1);
-    int8_t res = int8_t(num * (-128.f));
-    res = res ? res : (num < 0 ? 1 : -1);
-    return res;
-}
-
-static inline float tsdfToFloat(TsdfType num)
-{
-    return float(num) * (-1.f / 128.f);
-}
-
-HashTSDFVolume::HashTSDFVolume(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist,
-                                        int _maxWeight, float _truncateThreshold, int _volumeUnitRes, bool _zFirstMemOrder)
+HashTSDFVolume::HashTSDFVolume(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor,
+    float _truncDist, int _maxWeight, float _truncateThreshold,
+    int _volumeUnitRes, bool _zFirstMemOrder)
    : Volume(_voxelSize, _pose, _raycastStepFactor),
-      maxWeight(_maxWeight),
-      truncateThreshold(_truncateThreshold),
-      volumeUnitResolution(_volumeUnitRes),
-      volumeUnitSize(voxelSize * volumeUnitResolution),
-      zFirstMemOrder(_zFirstMemOrder)
+    maxWeight(_maxWeight),
+    truncateThreshold(_truncateThreshold),
+    volumeUnitResolution(_volumeUnitRes),
+    volumeUnitSize(voxelSize* volumeUnitResolution),
+    zFirstMemOrder(_zFirstMemOrder)
 {
    truncDist = std::max(_truncDist, 4.0f * voxelSize);
 }
@ -54,6 +42,22 @@ HashTSDFVolumeCPU::HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, flo
    :HashTSDFVolume(_voxelSize, _pose, _raycastStepFactor, _truncDist, _maxWeight, _truncateThreshold, _volumeUnitRes,
           _zFirstMemOrder)
 {
+    int xdim, ydim, zdim;
+    if (zFirstMemOrder)
+    {
+        xdim = volumeUnitResolution * volumeUnitResolution;
+        ydim = volumeUnitResolution;
+        zdim = 1;
+    }
+    else
+    {
+        xdim = 1;
+        ydim = volumeUnitResolution;
+        zdim = volumeUnitResolution * volumeUnitResolution;
+    }
+    volStrides = Vec4i(xdim, ydim, zdim);
+
+    reset();
 }

 HashTSDFVolumeCPU::HashTSDFVolumeCPU(const VolumeParams& _params, bool _zFirstMemOrder)
@ -65,7 +69,8 @@ HashTSDFVolumeCPU::HashTSDFVolumeCPU(const VolumeParams& _params, bool _zFirstMe
 void HashTSDFVolumeCPU::reset()
 {
    CV_TRACE_FUNCTION();
-    volumeUnits.clear();
+    lastVolIndex = 0;
+    volUnitsData = cv::Mat(VOLUMES_SIZE, volumeUnitResolution * volumeUnitResolution * volumeUnitResolution, rawType<TsdfVoxel>());
 }

 void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose, const Intr& intrinsics, const int frameId)
@ -84,6 +89,7 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
    VolumeUnitIndexSet newIndices;
    Mutex mutex;
    Range allocateRange(0, depth.rows);
+
    auto AllocateVolumeUnitsInvoker = [&](const Range& range) {
        VolumeUnitIndexSet localAccessVolUnits;
        for (int y = range.start; y < range.end; y += depthStride)
@ -130,13 +136,22 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
    parallel_for_(allocateRange, AllocateVolumeUnitsInvoker);

    //! Perform the allocation
-    int res = volumeUnitResolution;
-    Point3i volumeDims(res, res, res);
    for (auto idx : newIndices)
    {
        VolumeUnit& vu = volumeUnits[idx];
        Matx44f subvolumePose = pose.translate(volumeUnitIdxToVolume(idx)).matrix;
-        vu.pVolume = makePtr<TSDFVolumeCPU>(voxelSize, subvolumePose, raycastStepFactor, truncDist, maxWeight, volumeDims);
+
+        vu.pose = subvolumePose;
+        vu.index = lastVolIndex; lastVolIndex++;
+        if (lastVolIndex > VolumeIndex(volUnitsData.size().height))
+        {
+            volUnitsData.resize((lastVolIndex - 1) * 2);
+        }
+        volUnitsData.row(vu.index).forEach<VecTsdfVoxel>([](VecTsdfVoxel& vv, const int* /* position */)
+            {
+                TsdfVoxel& v = reinterpret_cast<TsdfVoxel&>(vv);
+                v.tsdf = floatToTsdf(0.0f); v.weight = 0;
+            });
        //! This volume unit will definitely be required for current integration
        vu.lastVisibleIndex = frameId;
        vu.isActive = true;
@ -158,7 +173,7 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
        for (int i = range.start; i < range.end; ++i)
        {
            Vec3i tsdf_idx = totalVolUnits[i];
-            VolumeUnitMap::iterator it = volumeUnits.find(tsdf_idx);
+            VolumeUnitIndexes::iterator it = volumeUnits.find(tsdf_idx);
            if (it == volumeUnits.end())
                return;

@ -179,12 +194,21 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
        }
        });

+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+        intrinsics.fx, intrinsics.fy,
+        intrinsics.cx, intrinsics.cy);
+    if ( !(frameParams==newParams) )
+    {
+        frameParams = newParams;
+        pixNorms = preCalculationPixNorm(depth, intrinsics);
+    }
+
    //! Integrate the correct volumeUnits
    parallel_for_(Range(0, (int)totalVolUnits.size()), [&](const Range& range) {
        for (int i = range.start; i < range.end; i++)
        {
            Vec3i tsdf_idx = totalVolUnits[i];
-            VolumeUnitMap::iterator it = volumeUnits.find(tsdf_idx);
+            VolumeUnitIndexes::iterator it = volumeUnits.find(tsdf_idx);
            if (it == volumeUnits.end())
                return;

@ -192,7 +216,9 @@ void HashTSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Ma
            if (volumeUnit.isActive)
            {
                //! The volume unit should already be added into the Volume from the allocator
-                volumeUnit.pVolume->integrate(depth, depthFactor, cameraPose, intrinsics);
+                integrateVolumeUnit(truncDist, voxelSize, maxWeight, volumeUnit.pose, volumeUnitResolution, volStrides, depth,
+                    depthFactor, cameraPose, intrinsics, pixNorms, volUnitsData.row(volumeUnit.index));
+
                //! Ensure all active volumeUnits are set to inactive for next integration
                volumeUnit.isActive = false;
            }
@ -223,47 +249,70 @@ cv::Vec3i HashTSDFVolumeCPU::volumeToVoxelCoord(const cv::Point3f& point) const
                     cvFloor(point.z * voxelSizeInv));
 }

-TsdfVoxel HashTSDFVolumeCPU::at(const Vec3i& volumeIdx) const
+inline TsdfVoxel HashTSDFVolumeCPU::_at(const cv::Vec3i& volumeIdx, VolumeIndex indx) const
+{
+    //! Out of bounds
+    if ((volumeIdx[0] >= volumeUnitResolution || volumeIdx[0] < 0) ||
+        (volumeIdx[1] >= volumeUnitResolution || volumeIdx[1] < 0) ||
+        (volumeIdx[2] >= volumeUnitResolution || volumeIdx[2] < 0))
+    {
+        TsdfVoxel dummy;
+        dummy.tsdf = floatToTsdf(1.0f);
+        dummy.weight = 0;
+        return dummy;
+    }
+
+    const TsdfVoxel* volData = volUnitsData.ptr<TsdfVoxel>(indx);
+    int coordBase =
+        volumeIdx[0] * volStrides[0] + volumeIdx[1] * volStrides[1] + volumeIdx[2] * volStrides[2];
+    return volData[coordBase];
+}
+
+inline TsdfVoxel HashTSDFVolumeCPU::at(const cv::Vec3i& volumeIdx) const
+
 {
    Vec3i volumeUnitIdx = Vec3i(cvFloor(volumeIdx[0] / volumeUnitResolution),
                                cvFloor(volumeIdx[1] / volumeUnitResolution),
                                cvFloor(volumeIdx[2] / volumeUnitResolution));

-    VolumeUnitMap::const_iterator it = volumeUnits.find(volumeUnitIdx);
+    VolumeUnitIndexes::const_iterator it = volumeUnits.find(volumeUnitIdx);
+
    if (it == volumeUnits.end())
    {
        TsdfVoxel dummy;
-        dummy.tsdf   = floatToTsdf(1.f);
+        dummy.tsdf = floatToTsdf(1.f);
        dummy.weight = 0;
        return dummy;
    }
-    Ptr<TSDFVolumeCPU> volumeUnit = std::dynamic_pointer_cast<TSDFVolumeCPU>(it->second.pVolume);

-    Vec3i volUnitLocalIdx = volumeIdx - Vec3i(volumeUnitIdx[0] * volumeUnitResolution,
-                                              volumeUnitIdx[1] * volumeUnitResolution,
-                                              volumeUnitIdx[2] * volumeUnitResolution);
+    cv::Vec3i volUnitLocalIdx = volumeIdx - cv::Vec3i(volumeUnitIdx[0] * volumeUnitResolution,
+                                                      volumeUnitIdx[1] * volumeUnitResolution,
+                                                      volumeUnitIdx[2] * volumeUnitResolution);
+
+    volUnitLocalIdx =
+        cv::Vec3i(abs(volUnitLocalIdx[0]), abs(volUnitLocalIdx[1]), abs(volUnitLocalIdx[2]));
+    return _at(volUnitLocalIdx, it->second.index);

-    volUnitLocalIdx = Vec3i(abs(volUnitLocalIdx[0]), abs(volUnitLocalIdx[1]), abs(volUnitLocalIdx[2]));
-    return volumeUnit->at(volUnitLocalIdx);
 }

 TsdfVoxel HashTSDFVolumeCPU::at(const Point3f& point) const
 {
-    Vec3i volumeUnitIdx              = volumeToVolumeUnitIdx(point);
-    VolumeUnitMap::const_iterator it = volumeUnits.find(volumeUnitIdx);
+    cv::Vec3i volumeUnitIdx          = volumeToVolumeUnitIdx(point);
+    VolumeUnitIndexes::const_iterator it = volumeUnits.find(volumeUnitIdx);
+
    if (it == volumeUnits.end())
    {
        TsdfVoxel dummy;
-        dummy.tsdf   = floatToTsdf(1.f);
+        dummy.tsdf = floatToTsdf(1.f);
        dummy.weight = 0;
        return dummy;
    }
-    Ptr<TSDFVolumeCPU> volumeUnit = std::dynamic_pointer_cast<TSDFVolumeCPU>(it->second.pVolume);

-    Point3f volumeUnitPos = volumeUnitIdxToVolume(volumeUnitIdx);
-    Vec3i volUnitLocalIdx = volumeToVoxelCoord(point - volumeUnitPos);
-    volUnitLocalIdx       = Vec3i(abs(volUnitLocalIdx[0]), abs(volUnitLocalIdx[1]), abs(volUnitLocalIdx[2]));
-    return volumeUnit->at(volUnitLocalIdx);
+    cv::Point3f volumeUnitPos = volumeUnitIdxToVolume(volumeUnitIdx);
+    cv::Vec3i volUnitLocalIdx = volumeToVoxelCoord(point - volumeUnitPos);
+    volUnitLocalIdx =
+        cv::Vec3i(abs(volUnitLocalIdx[0]), abs(volUnitLocalIdx[1]), abs(volUnitLocalIdx[2]));
+    return _at(volUnitLocalIdx, it->second.index);
 }

 static inline Vec3i voxelToVolumeUnitIdx(const Vec3i& pt, const int vuRes)
@ -282,27 +331,25 @@ static inline Vec3i voxelToVolumeUnitIdx(const Vec3i& pt, const int vuRes)
    }
 }

-inline TsdfVoxel atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitMap::const_iterator it,
-    VolumeUnitMap::const_iterator vend, int unitRes)
+TsdfVoxel HashTSDFVolumeCPU::atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitIndexes::const_iterator it) const
 {
-    if (it == vend)
+    if (it == volumeUnits.end())
    {
        TsdfVoxel dummy;
        dummy.tsdf = floatToTsdf(1.f);
        dummy.weight = 0;
        return dummy;
    }
-    Ptr<TSDFVolumeCPU> volumeUnit = std::dynamic_pointer_cast<TSDFVolumeCPU>(it->second.pVolume);
-
-    Vec3i volUnitLocalIdx = point - volumeUnitIdx * unitRes;
+    Vec3i volUnitLocalIdx = point - volumeUnitIdx * volumeUnitResolution;

    // expanding at(), removing bounds check
-    const TsdfVoxel* volData = volumeUnit->volume.ptr<TsdfVoxel>();
-    Vec4i volDims = volumeUnit->volDims;
-    int coordBase = volUnitLocalIdx[0] * volDims[0] + volUnitLocalIdx[1] * volDims[1] + volUnitLocalIdx[2] * volDims[2];
+    const TsdfVoxel* volData = volUnitsData.ptr<TsdfVoxel>(it->second.index);
+    int coordBase = volUnitLocalIdx[0] * volStrides[0] + volUnitLocalIdx[1] * volStrides[1] + volUnitLocalIdx[2] * volStrides[2];
    return volData[coordBase];
 }

+
+
 #if USE_INTRINSICS
 inline float interpolate(float tx, float ty, float tz, float vx[8])
 {
@ -344,7 +391,7 @@ float HashTSDFVolumeCPU::interpolateVoxelPoint(const Point3f& point) const

    // A small hash table to reduce a number of find() calls
    bool queried[8];
-    VolumeUnitMap::const_iterator iterMap[8];
+    VolumeUnitIndexes::const_iterator iterMap[8];
    for (int i = 0; i < 8; i++)
    {
        iterMap[i] = volumeUnits.end();
@ -374,9 +421,8 @@ float HashTSDFVolumeCPU::interpolateVoxelPoint(const Point3f& point) const
            iterMap[dictIdx] = it;
            queried[dictIdx] = true;
        }
-        //VolumeUnitMap::const_iterator it = volumeUnits.find(volumeUnitIdx);

-        vx[i] = atVolumeUnit(pt, volumeUnitIdx, it, volumeUnits.end(), volumeUnitResolution).tsdf;
+        vx[i] = atVolumeUnit(pt, volumeUnitIdx, it).tsdf;
    }

    return interpolate(tx, ty, tz, vx);
@ -397,7 +443,7 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const

    // A small hash table to reduce a number of find() calls
    bool queried[8];
-    VolumeUnitMap::const_iterator iterMap[8];
+    VolumeUnitIndexes::const_iterator iterMap[8];
    for (int i = 0; i < 8; i++)
    {
        iterMap[i] = volumeUnits.end();
@ -438,9 +484,8 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
            iterMap[dictIdx] = it;
            queried[dictIdx] = true;
        }
-        //VolumeUnitMap::const_iterator it = volumeUnits.find(volumeUnitIdx);

-        vals[i] = tsdfToFloat(atVolumeUnit(pt, volumeUnitIdx, it, volumeUnits.end(), volumeUnitResolution).tsdf);
+        vals[i] = tsdfToFloat(atVolumeUnit(pt, volumeUnitIdx, it).tsdf);
    }

 #if !USE_INTERPOLATION_IN_GETNORMAL
@ -527,26 +572,33 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
    return nv < 0.0001f ? nan3 : normal / nv;
 }

-struct HashRaycastInvoker : ParallelLoopBody
+void HashTSDFVolumeCPU::raycast(const Matx44f& cameraPose, const kinfu::Intr& intrinsics, const Size& frameSize,
+                                OutputArray _points, OutputArray _normals) const
 {
-    HashRaycastInvoker(Points& _points, Normals& _normals, const Matx44f& cameraPose, const Intr& intrinsics,
-                       const HashTSDFVolumeCPU& _volume)
-        : ParallelLoopBody(),
-          points(_points),
-          normals(_normals),
-          volume(_volume),
-          tstep(_volume.truncDist * _volume.raycastStepFactor),
-          cam2vol(volume.pose.inv() * Affine3f(cameraPose)),
-          vol2cam(Affine3f(cameraPose.inv()) * volume.pose),
-          reproj(intrinsics.makeReprojector())
-    {
-    }
+    CV_TRACE_FUNCTION();
+    CV_Assert(frameSize.area() > 0);
+
+    _points.create(frameSize, POINT_TYPE);
+    _normals.create(frameSize, POINT_TYPE);
+
+    Points points1   = _points.getMat();
+    Normals normals1 = _normals.getMat();
+
+    Points& points(points1);
+    Normals& normals(normals1);
+    const HashTSDFVolumeCPU& volume(*this);
+    const float tstep(volume.truncDist * volume.raycastStepFactor);
+    const Affine3f cam2vol(volume.pose.inv() * Affine3f(cameraPose));
+    const Affine3f vol2cam(Affine3f(cameraPose.inv()) * volume.pose);
+    const Intr::Reprojector reproj(intrinsics.makeReprojector());
+
+    const int nstripes = -1;

-    virtual void operator()(const Range& range) const override
+    auto _HashRaycastInvoker = [&](const Range& range)
    {
        const Point3f cam2volTrans = cam2vol.translation();
-        const Matx33f cam2volRot   = cam2vol.rotation();
-        const Matx33f vol2camRot   = vol2cam.rotation();
+        const Matx33f cam2volRot = cam2vol.rotation();
+        const Matx33f vol2camRot = vol2cam.rotation();

        const float blockSize = volume.volumeUnitSize;

@ -564,41 +616,45 @@ struct HashRaycastInvoker : ParallelLoopBody
                Point3f orig    = cam2volTrans;
                Point3f rayDirV = normalize(Vec3f(cam2volRot * reproj(Point3f(float(x), float(y), 1.f))));

-                float tmin  = 0;
-                float tmax  = volume.truncateThreshold;
+                float tmin = 0;
+                float tmax = volume.truncateThreshold;
                float tcurr = tmin;

-                Vec3i prevVolumeUnitIdx =
-                    Vec3i(std::numeric_limits<int>::min(), std::numeric_limits<int>::min(),
-                          std::numeric_limits<int>::min());
+                cv::Vec3i prevVolumeUnitIdx =
+                    cv::Vec3i(std::numeric_limits<int>::min(), std::numeric_limits<int>::min(),
+                        std::numeric_limits<int>::min());

-                float tprev       = tcurr;
+
+                float tprev = tcurr;
                float prevTsdf = volume.truncDist;
                Ptr<TSDFVolumeCPU> currVolumeUnit;
                while (tcurr < tmax)
                {
-                    Point3f currRayPos      = orig + tcurr * rayDirV;
-                    Vec3i currVolumeUnitIdx = volume.volumeToVolumeUnitIdx(currRayPos);
+                    Point3f currRayPos = orig + tcurr * rayDirV;
+                    cv::Vec3i currVolumeUnitIdx = volume.volumeToVolumeUnitIdx(currRayPos);
+

-                    VolumeUnitMap::const_iterator it = volume.volumeUnits.find(currVolumeUnitIdx);
+                    VolumeUnitIndexes::const_iterator it = volume.volumeUnits.find(currVolumeUnitIdx);

                    float currTsdf = prevTsdf;
-                    int currWeight    = 0;
-                    float stepSize    = 0.5f * blockSize;
-                    Vec3i volUnitLocalIdx;
+                    int currWeight = 0;
+                    float stepSize = 0.5f * blockSize;
+                    cv::Vec3i volUnitLocalIdx;
+

                    //! The subvolume exists in hashtable
                    if (it != volume.volumeUnits.end())
                    {
-                        currVolumeUnit         = std::dynamic_pointer_cast<TSDFVolumeCPU>(it->second.pVolume);
-                        Point3f currVolUnitPos = volume.volumeUnitIdxToVolume(currVolumeUnitIdx);
-                        volUnitLocalIdx        = volume.volumeToVoxelCoord(currRayPos - currVolUnitPos);
+                        cv::Point3f currVolUnitPos =
+                            volume.volumeUnitIdxToVolume(currVolumeUnitIdx);
+                        volUnitLocalIdx = volume.volumeToVoxelCoord(currRayPos - currVolUnitPos);
+

                        //! TODO: Figure out voxel interpolation
-                        TsdfVoxel currVoxel = currVolumeUnit->at(volUnitLocalIdx);
-                        currTsdf            = tsdfToFloat(currVoxel.tsdf);
-                        currWeight          = currVoxel.weight;
-                        stepSize            = tstep;
+                        TsdfVoxel currVoxel = _at(volUnitLocalIdx, it->second.index);
+                        currTsdf = tsdfToFloat(currVoxel.tsdf);
+                        currWeight = currVoxel.weight;
+                        stepSize = tstep;
                    }
                    //! Surface crossing
                    if (prevTsdf > 0.f && currTsdf <= 0.f && currWeight > 0)
@ -612,129 +668,92 @@ struct HashRaycastInvoker : ParallelLoopBody
                            if (!isNaN(nv))
                            {
                                normal = vol2camRot * nv;
-                                point  = vol2cam * pv;
+                                point = vol2cam * pv;
                            }
                        }
                        break;
                    }
                    prevVolumeUnitIdx = currVolumeUnitIdx;
-                    prevTsdf          = currTsdf;
-                    tprev             = tcurr;
+                    prevTsdf = currTsdf;
+                    tprev = tcurr;
                    tcurr += stepSize;
                }
                ptsRow[x] = toPtype(point);
                nrmRow[x] = toPtype(normal);
            }
        }
-    }
+    };

-    Points& points;
-    Normals& normals;
-    const HashTSDFVolumeCPU& volume;
-    const float tstep;
-    const Affine3f cam2vol;
-    const Affine3f vol2cam;
-    const Intr::Reprojector reproj;
-};
+    parallel_for_(Range(0, points.rows), _HashRaycastInvoker, nstripes);
+}

-void HashTSDFVolumeCPU::raycast(const Matx44f& cameraPose, const kinfu::Intr& intrinsics, const Size& frameSize,
-                                 OutputArray _points, OutputArray _normals) const
+void HashTSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const
 {
    CV_TRACE_FUNCTION();
-    CV_Assert(frameSize.area() > 0);

-    _points.create(frameSize, POINT_TYPE);
-    _normals.create(frameSize, POINT_TYPE);
-
-    Points points   = _points.getMat();
-    Normals normals = _normals.getMat();
+    if (_points.needed())
+    {
+        std::vector<std::vector<ptype>> pVecs, nVecs;

-    HashRaycastInvoker ri(points, normals, cameraPose, intrinsics, *this);
+        std::vector<Vec3i> totalVolUnits;
+        for (const auto& keyvalue : volumeUnits)
+        {
+            totalVolUnits.push_back(keyvalue.first);
+        }
+        Range fetchRange(0, (int)totalVolUnits.size());
+        const int nstripes = -1;

-    const int nstripes = -1;
-    parallel_for_(Range(0, points.rows), ri, nstripes);
-}
+        const HashTSDFVolumeCPU& volume(*this);
+        bool needNormals(_normals.needed());
+        Mutex mutex;

-struct HashFetchPointsNormalsInvoker : ParallelLoopBody
-{
-    HashFetchPointsNormalsInvoker(const HashTSDFVolumeCPU& _volume, const std::vector<Vec3i>& _totalVolUnits,
-                                  std::vector<std::vector<ptype>>& _pVecs, std::vector<std::vector<ptype>>& _nVecs,
-                                  bool _needNormals)
-        : ParallelLoopBody(),
-          volume(_volume),
-          totalVolUnits(_totalVolUnits),
-          pVecs(_pVecs),
-          nVecs(_nVecs),
-          needNormals(_needNormals)
-    {
-    }

-    virtual void operator()(const Range& range) const override
-    {
-        std::vector<ptype> points, normals;
-        for (int i = range.start; i < range.end; i++)
+        auto HashFetchPointsNormalsInvoker = [&](const Range& range)
        {
-            Vec3i tsdf_idx = totalVolUnits[i];

-            VolumeUnitMap::const_iterator it = volume.volumeUnits.find(tsdf_idx);
-            Point3f base_point               = volume.volumeUnitIdxToVolume(tsdf_idx);
-            if (it != volume.volumeUnits.end())
+
+            std::vector<ptype> points, normals;
+            for (int i = range.start; i < range.end; i++)
            {
-                Ptr<TSDFVolumeCPU> volumeUnit = std::dynamic_pointer_cast<TSDFVolumeCPU>(it->second.pVolume);
-                std::vector<ptype> localPoints;
-                std::vector<ptype> localNormals;
-                for (int x = 0; x < volume.volumeUnitResolution; x++)
-                    for (int y = 0; y < volume.volumeUnitResolution; y++)
-                        for (int z = 0; z < volume.volumeUnitResolution; z++)
-                        {
-                            Vec3i voxelIdx(x, y, z);
-                            TsdfVoxel voxel = volumeUnit->at(voxelIdx);
+                cv::Vec3i tsdf_idx = totalVolUnits[i];
+

-                            if (voxel.tsdf != -128 && voxel.weight != 0)
+                VolumeUnitIndexes::const_iterator it = volume.volumeUnits.find(tsdf_idx);
+                Point3f base_point = volume.volumeUnitIdxToVolume(tsdf_idx);
+                if (it != volume.volumeUnits.end())
+                {
+                    std::vector<ptype> localPoints;
+                    std::vector<ptype> localNormals;
+                    for (int x = 0; x < volume.volumeUnitResolution; x++)
+                        for (int y = 0; y < volume.volumeUnitResolution; y++)
+                            for (int z = 0; z < volume.volumeUnitResolution; z++)
                            {
-                                Point3f point = base_point + volume.voxelCoordToVolume(voxelIdx);
-                                localPoints.push_back(toPtype(point));
-                                if (needNormals)
+                                cv::Vec3i voxelIdx(x, y, z);
+                                TsdfVoxel voxel = _at(voxelIdx, it->second.index);
+
+                                if (voxel.tsdf != -128 && voxel.weight != 0)
                                {
-                                    Point3f normal = volume.getNormalVoxel(point);
-                                    localNormals.push_back(toPtype(normal));
+                                    Point3f point = base_point + volume.voxelCoordToVolume(voxelIdx);
+                                    localPoints.push_back(toPtype(point));
+                                    if (needNormals)
+                                    {
+                                        Point3f normal = volume.getNormalVoxel(point);
+                                        localNormals.push_back(toPtype(normal));
+                                    }
                                }
                            }
-                        }

-                AutoLock al(mutex);
-                pVecs.push_back(localPoints);
-                nVecs.push_back(localNormals);
+                    AutoLock al(mutex);
+                    pVecs.push_back(localPoints);
+                    nVecs.push_back(localNormals);
+                }
            }
-        }
-    }
+        };

-    const HashTSDFVolumeCPU& volume;
-    std::vector<Vec3i> totalVolUnits;
-    std::vector<std::vector<ptype>>& pVecs;
-    std::vector<std::vector<ptype>>& nVecs;
-    const TsdfVoxel* volDataStart;
-    bool needNormals;
-    mutable Mutex mutex;
-};
+        parallel_for_(fetchRange, HashFetchPointsNormalsInvoker, nstripes);

-void HashTSDFVolumeCPU::fetchPointsNormals(OutputArray _points, OutputArray _normals) const
-{
-    CV_TRACE_FUNCTION();

-    if (_points.needed())
-    {
-        std::vector<std::vector<ptype>> pVecs, nVecs;

-        std::vector<Vec3i> totalVolUnits;
-        for (const auto& keyvalue : volumeUnits)
-        {
-            totalVolUnits.push_back(keyvalue.first);
-        }
-        HashFetchPointsNormalsInvoker fi(*this, totalVolUnits, pVecs, nVecs, _normals.needed());
-        Range range(0, (int)totalVolUnits.size());
-        const int nstripes = -1;
-        parallel_for_(range, fi, nstripes);
        std::vector<ptype> points, normals;
        for (size_t i = 0; i < pVecs.size(); i++)
        {
--- a/modules/rgbd/src/hash_tsdf.hpp
+++ b/modules/rgbd/src/hash_tsdf.hpp
@ -9,20 +9,30 @@
 #include <unordered_map>
 #include <unordered_set>

-#include "tsdf.hpp"
+#include "tsdf_functions.hpp"

 namespace cv
 {
 namespace kinfu
 {
-struct VolumeUnit
+class HashTSDFVolume : public Volume
 {
-    VolumeUnit() : pVolume(nullptr){};
-    ~VolumeUnit() = default;
+   public:
+    // dimension in voxels, size in meters
+    //! Use fixed volume cuboid
+    HashTSDFVolume(float _voxelSize, cv::Matx44f _pose, float _raycastStepFactor, float _truncDist,
+                   int _maxWeight, float _truncateThreshold, int _volumeUnitRes,
+                   bool zFirstMemOrder = true);

-    Ptr<TSDFVolume> pVolume;
-    int lastVisibleIndex = 0;
-    bool isActive;
+    virtual ~HashTSDFVolume() = default;
+
+   public:
+    int maxWeight;
+    float truncDist;
+    float truncateThreshold;
+    int volumeUnitResolution;
+    float volumeUnitSize;
+    bool zFirstMemOrder;
 };

 //! Spatial hashing
@ -40,32 +50,21 @@ struct tsdf_hash
    }
 };

-typedef std::unordered_set<Vec3i, tsdf_hash> VolumeUnitIndexSet;
-typedef std::unordered_map<Vec3i, VolumeUnit, tsdf_hash> VolumeUnitMap;
-
-class HashTSDFVolume : public Volume
+typedef unsigned int VolumeIndex;
+struct VolumeUnit
 {
-   public:
-    // dimension in voxels, size in meters
-    //! Use fixed volume cuboid
-    HashTSDFVolume(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist,
-                   int _maxWeight, float _truncateThreshold, int _volumeUnitRes,
-                   bool zFirstMemOrder = true);
-
-    virtual ~HashTSDFVolume() = default;
-
-   public:
-    int maxWeight;
-    float truncDist;
-    float truncateThreshold;
-    int volumeUnitResolution;
-    float volumeUnitSize;
-    bool zFirstMemOrder;
+    cv::Vec3i coord;
+    VolumeIndex index;
+    cv::Matx44f pose;
+    int lastVisibleIndex = 0;
+    bool isActive;
 };

+typedef std::unordered_set<cv::Vec3i, tsdf_hash> VolumeUnitIndexSet;
+typedef std::unordered_map<cv::Vec3i, VolumeUnit, tsdf_hash> VolumeUnitIndexes;
+
 class HashTSDFVolumeCPU : public HashTSDFVolume
 {
-
   public:
    // dimension in voxels, size in meters
    HashTSDFVolumeCPU(float _voxelSize, const Matx44f& _pose, float _raycastStepFactor, float _truncDist, int _maxWeight,
@ -90,7 +89,11 @@ class HashTSDFVolumeCPU : public HashTSDFVolume

    //! Return the voxel given the point in volume coordinate system i.e., (metric scale 1 unit =
    //! 1m)
-    TsdfVoxel at(const Point3f& point) const;
+    virtual TsdfVoxel at(const cv::Point3f& point) const;
+    virtual TsdfVoxel _at(const cv::Vec3i& volumeIdx, VolumeIndex indx) const;
+
+    TsdfVoxel atVolumeUnit(const Vec3i& point, const Vec3i& volumeUnitIdx, VolumeUnitIndexes::const_iterator it) const;
+

    float interpolateVoxelPoint(const Point3f& point) const;
    float interpolateVoxel(const cv::Point3f& point) const;
@ -104,8 +107,12 @@ class HashTSDFVolumeCPU : public HashTSDFVolume
    Vec3i volumeToVoxelCoord(const Point3f& point) const;

   public:
-    //! Hashtable of individual smaller volume units
-    VolumeUnitMap volumeUnits;
+       Vec4i volStrides;
+       Vec6f frameParams;
+       Mat pixNorms;
+       VolumeUnitIndexes volumeUnits;
+       cv::Mat volUnitsData;
+       VolumeIndex lastVolIndex;
 };

 template<typename T>
--- a/modules/rgbd/src/tsdf.cpp
+++ b/modules/rgbd/src/tsdf.cpp
@ -5,32 +5,14 @@
 // This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory

 #include "precomp.hpp"
-#include "tsdf.hpp"
+//#include "tsdf.hpp"
+#include "tsdf_functions.hpp"
 #include "opencl_kernels_rgbd.hpp"

 namespace cv {

 namespace kinfu {

-static inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
-{
-    v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
-    return v_cvt_f32(num) * num128;
-}
-
-static inline TsdfType floatToTsdf(float num)
-{
-    //CV_Assert(-1 < num <= 1);
-    int8_t res = int8_t(num * (-128.f));
-    res = res ? res : (num < 0 ? 1 : -1);
-    return res;
-}
-
-static inline float tsdfToFloat(TsdfType num)
-{
-    return float(num) * (-1.f / 128.f);
-}
-
 TSDFVolume::TSDFVolume(float _voxelSize, Matx44f _pose, float _raycastStepFactor, float _truncDist,
                       int _maxWeight, Point3i _resolution, bool zFirstMemOrder)
    : Volume(_voxelSize, _pose, _raycastStepFactor),
@ -117,85 +99,6 @@ TsdfVoxel TSDFVolumeCPU::at(const Vec3i& volumeIdx) const
    return volData[coordBase];
 }

-// SIMD version of that code is manually inlined
-#if !USE_INTRINSICS
-static const bool fixMissingData = false;
-
-static inline depthType bilinearDepth(const Depth& m, Point2f pt)
-{
-    const depthType defaultValue = qnan;
-    if(pt.x < 0 || pt.x >= m.cols-1 ||
-       pt.y < 0 || pt.y >= m.rows-1)
-        return defaultValue;
-
-    int xi = cvFloor(pt.x), yi = cvFloor(pt.y);
-
-    const depthType* row0 = m[yi+0];
-    const depthType* row1 = m[yi+1];
-
-    depthType v00 = row0[xi+0];
-    depthType v01 = row0[xi+1];
-    depthType v10 = row1[xi+0];
-    depthType v11 = row1[xi+1];
-
-    // assume correct depth is positive
-    bool b00 = v00 > 0;
-    bool b01 = v01 > 0;
-    bool b10 = v10 > 0;
-    bool b11 = v11 > 0;
-
-    if(!fixMissingData)
-    {
-        if(!(b00 && b01 && b10 && b11))
-            return defaultValue;
-        else
-        {
-            float tx = pt.x - xi, ty = pt.y - yi;
-            depthType v0 = v00 + tx*(v01 - v00);
-            depthType v1 = v10 + tx*(v11 - v10);
-            return v0 + ty*(v1 - v0);
-        }
-    }
-    else
-    {
-        int nz = b00 + b01 + b10 + b11;
-        if(nz == 0)
-        {
-            return defaultValue;
-        }
-        if(nz == 1)
-        {
-            if(b00) return v00;
-            if(b01) return v01;
-            if(b10) return v10;
-            if(b11) return v11;
-        }
-        if(nz == 2)
-        {
-            if(b00 && b10) v01 = v00, v11 = v10;
-            if(b01 && b11) v00 = v01, v10 = v11;
-            if(b00 && b01) v10 = v00, v11 = v01;
-            if(b10 && b11) v00 = v10, v01 = v11;
-            if(b00 && b11) v01 = v10 = (v00 + v11)*0.5f;
-            if(b01 && b10) v00 = v11 = (v01 + v10)*0.5f;
-        }
-        if(nz == 3)
-        {
-            if(!b00) v00 = v10 + v01 - v11;
-            if(!b01) v01 = v00 + v11 - v10;
-            if(!b10) v10 = v00 + v11 - v01;
-            if(!b11) v11 = v01 + v10 - v00;
-        }
-
-        float tx = pt.x - xi, ty = pt.y - yi;
-        depthType v0 = v00 + tx*(v01 - v00);
-        depthType v1 = v10 + tx*(v11 - v10);
-        return v0 + ty*(v1 - v0);
-    }
-}
-#endif
-
-

 struct IntegrateInvoker : ParallelLoopBody
 {
@ -462,30 +365,6 @@ struct IntegrateInvoker : ParallelLoopBody
    Mat pixNorms;
 };

-static cv::Mat preCalculationPixNorm(Depth depth, const Intr& intrinsics)
-{
-    int height = depth.rows;
-    int widht = depth.cols;
-    Point2f fl(intrinsics.fx, intrinsics.fy);
-    Point2f pp(intrinsics.cx, intrinsics.cy);
-    Mat pixNorm (height, widht, CV_32F);
-    std::vector<float> x(widht);
-    std::vector<float> y(height);
-    for (int i = 0; i < widht; i++)
-        x[i] = (i - pp.x) / fl.x;
-    for (int i = 0; i < height; i++)
-        y[i] = (i - pp.y) / fl.y;
-
-    for (int i = 0; i < height; i++)
-    {
-        for (int j = 0; j < widht; j++)
-        {
-            pixNorm.at<float>(i, j) = sqrtf(x[j] * x[j] + y[i] * y[i] + 1.0f);
-        }
-    }
-    return pixNorm;
-}
-
 // use depth instead of distance (optimization)
 void TSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Matx44f& cameraPose,
                              const Intr& intrinsics, const int frameId)
@ -495,14 +374,13 @@ void TSDFVolumeCPU::integrate(InputArray _depth, float depthFactor, const Matx44
    CV_Assert(_depth.type() == DEPTH_TYPE);
    CV_Assert(!_depth.empty());
    Depth depth = _depth.getMat();
-    if (!(frameParams[0] == depth.rows  && frameParams[1] == depth.cols &&
-        frameParams[2] == intrinsics.fx && frameParams[3] == intrinsics.fy &&
-        frameParams[4] == intrinsics.cx && frameParams[5] == intrinsics.cy))
-    {
-        frameParams[0] = (float)depth.rows; frameParams[1] = (float)depth.cols;
-        frameParams[2] = intrinsics.fx;     frameParams[3] = intrinsics.fy;
-        frameParams[4] = intrinsics.cx;     frameParams[5] = intrinsics.cy;

+    Vec6f newParams((float)depth.rows, (float)depth.cols,
+        intrinsics.fx, intrinsics.fy,
+        intrinsics.cx, intrinsics.cy);
+    if (!(frameParams == newParams))
+    {
+        frameParams = newParams;
        pixNorms = preCalculationPixNorm(depth, intrinsics);
    }

--- a/modules/rgbd/src/tsdf_functions.cpp
+++ b/modules/rgbd/src/tsdf_functions.cpp
@ -0,0 +1,377 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory
+
+#include "precomp.hpp"
+#include "tsdf_functions.hpp"
+
+namespace cv {
+
+namespace kinfu {
+
+cv::Mat preCalculationPixNorm(Depth depth, const Intr& intrinsics)
+{
+    int height = depth.rows;
+    int widht = depth.cols;
+    Point2f fl(intrinsics.fx, intrinsics.fy);
+    Point2f pp(intrinsics.cx, intrinsics.cy);
+    Mat pixNorm(height, widht, CV_32F);
+    std::vector<float> x(widht);
+    std::vector<float> y(height);
+    for (int i = 0; i < widht; i++)
+        x[i] = (i - pp.x) / fl.x;
+    for (int i = 0; i < height; i++)
+        y[i] = (i - pp.y) / fl.y;
+
+    for (int i = 0; i < height; i++)
+    {
+        for (int j = 0; j < widht; j++)
+        {
+            pixNorm.at<float>(i, j) = sqrtf(x[j] * x[j] + y[i] * y[i] + 1.0f);
+        }
+    }
+    return pixNorm;
+}
+
+const bool fixMissingData = false;
+depthType bilinearDepth(const Depth& m, cv::Point2f pt)
+{
+    const depthType defaultValue = qnan;
+    if (pt.x < 0 || pt.x >= m.cols - 1 ||
+        pt.y < 0 || pt.y >= m.rows - 1)
+        return defaultValue;
+
+    int xi = cvFloor(pt.x), yi = cvFloor(pt.y);
+
+    const depthType* row0 = m[yi + 0];
+    const depthType* row1 = m[yi + 1];
+
+    depthType v00 = row0[xi + 0];
+    depthType v01 = row0[xi + 1];
+    depthType v10 = row1[xi + 0];
+    depthType v11 = row1[xi + 1];
+
+    // assume correct depth is positive
+    bool b00 = v00 > 0;
+    bool b01 = v01 > 0;
+    bool b10 = v10 > 0;
+    bool b11 = v11 > 0;
+
+    if (!fixMissingData)
+    {
+        if (!(b00 && b01 && b10 && b11))
+            return defaultValue;
+        else
+        {
+            float tx = pt.x - xi, ty = pt.y - yi;
+            depthType v0 = v00 + tx * (v01 - v00);
+            depthType v1 = v10 + tx * (v11 - v10);
+            return v0 + ty * (v1 - v0);
+        }
+    }
+    else
+    {
+        int nz = b00 + b01 + b10 + b11;
+        if (nz == 0)
+        {
+            return defaultValue;
+        }
+        if (nz == 1)
+        {
+            if (b00) return v00;
+            if (b01) return v01;
+            if (b10) return v10;
+            if (b11) return v11;
+        }
+        if (nz == 2)
+        {
+            if (b00 && b10) v01 = v00, v11 = v10;
+            if (b01 && b11) v00 = v01, v10 = v11;
+            if (b00 && b01) v10 = v00, v11 = v01;
+            if (b10 && b11) v00 = v10, v01 = v11;
+            if (b00 && b11) v01 = v10 = (v00 + v11) * 0.5f;
+            if (b01 && b10) v00 = v11 = (v01 + v10) * 0.5f;
+        }
+        if (nz == 3)
+        {
+            if (!b00) v00 = v10 + v01 - v11;
+            if (!b01) v01 = v00 + v11 - v10;
+            if (!b10) v10 = v00 + v11 - v01;
+            if (!b11) v11 = v01 + v10 - v00;
+        }
+
+        float tx = pt.x - xi, ty = pt.y - yi;
+        depthType v0 = v00 + tx * (v01 - v00);
+        depthType v1 = v10 + tx * (v11 - v10);
+        return v0 + ty * (v1 - v0);
+    }
+}
+
+void integrateVolumeUnit(
+    float truncDist, float voxelSize, int maxWeight,
+    cv::Matx44f _pose, int volResolution, Vec4i volStrides,
+    InputArray _depth, float depthFactor, const cv::Matx44f& cameraPose,
+    const cv::kinfu::Intr& intrinsics, InputArray _pixNorms, InputArray _volume)
+{
+    CV_TRACE_FUNCTION();
+
+    CV_Assert(_depth.type() == DEPTH_TYPE);
+    CV_Assert(!_depth.empty());
+    cv::Affine3f vpose(_pose);
+    Depth depth = _depth.getMat();
+
+    Range integrateRange(0, volResolution);
+
+    Mat volume = _volume.getMat();
+    Mat pixNorms = _pixNorms.getMat();
+    const Intr::Projector proj(intrinsics.makeProjector());
+    const cv::Affine3f vol2cam(Affine3f(cameraPose.inv()) * vpose);
+    const float truncDistInv(1.f / truncDist);
+    const float dfac(1.f / depthFactor);
+    TsdfVoxel* volDataStart = volume.ptr<TsdfVoxel>();;
+
+#if USE_INTRINSICS
+    auto IntegrateInvoker = [&](const Range& range)
+    {
+        // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt;
+        Point3f zStepPt = Point3f(vol2cam.matrix(0, 2),
+            vol2cam.matrix(1, 2),
+            vol2cam.matrix(2, 2)) * voxelSize;
+
+        v_float32x4 zStep(zStepPt.x, zStepPt.y, zStepPt.z, 0);
+        v_float32x4 vfxy(proj.fx, proj.fy, 0.f, 0.f), vcxy(proj.cx, proj.cy, 0.f, 0.f);
+        const v_float32x4 upLimits = v_cvt_f32(v_int32x4(depth.cols - 1, depth.rows - 1, 0, 0));
+
+        for (int x = range.start; x < range.end; x++)
+        {
+            TsdfVoxel* volDataX = volDataStart + x * volStrides[0];
+            for (int y = 0; y < volResolution; y++)
+            {
+                TsdfVoxel* volDataY = volDataX + y * volStrides[1];
+                // optimization of camSpace transformation (vector addition instead of matmul at each z)
+                Point3f basePt = vol2cam * (Point3f((float)x, (float)y, 0) * voxelSize);
+                v_float32x4 camSpacePt(basePt.x, basePt.y, basePt.z, 0);
+
+                int startZ, endZ;
+                if (abs(zStepPt.z) > 1e-5)
+                {
+                    int baseZ = (int)(-basePt.z / zStepPt.z);
+                    if (zStepPt.z > 0)
+                    {
+                        startZ = baseZ;
+                        endZ = volResolution;
+                    }
+                    else
+                    {
+                        startZ = 0;
+                        endZ = baseZ;
+                    }
+                }
+                else
+                {
+                    if (basePt.z > 0)
+                    {
+                        startZ = 0;
+                        endZ = volResolution;
+                    }
+                    else
+                    {
+                        // z loop shouldn't be performed
+                        startZ = endZ = 0;
+                    }
+                }
+                startZ = max(0, startZ);
+                endZ = min(int(volResolution), endZ);
+                for (int z = startZ; z < endZ; z++)
+                {
+                    // optimization of the following:
+                    //Point3f volPt = Point3f(x, y, z)*voxelSize;
+                    //Point3f camSpacePt = vol2cam * volPt;
+                    camSpacePt += zStep;
+
+                    float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0();
+                    if (zCamSpace <= 0.f)
+                        continue;
+
+                    v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace);
+                    v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
+                    // leave only first 2 lanes
+                    projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
+                        v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
+
+                    depthType v;
+                    // bilinearly interpolate depth at projected
+                    {
+                        const v_float32x4& pt = projected;
+                        // check coords >= 0 and < imgSize
+                        v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) |
+                            v_reinterpret_as_u32(pt >= upLimits);
+                        limits = limits | v_rotate_right<1>(limits);
+                        if (limits.get0())
+                            continue;
+
+                        // xi, yi = floor(pt)
+                        v_int32x4 ip = v_floor(pt);
+                        v_int32x4 ipshift = ip;
+                        int xi = ipshift.get0();
+                        ipshift = v_rotate_right<1>(ipshift);
+                        int yi = ipshift.get0();
+
+                        const depthType* row0 = depth[yi + 0];
+                        const depthType* row1 = depth[yi + 1];
+
+                        // v001 = [v(xi + 0, yi + 0), v(xi + 1, yi + 0)]
+                        v_float32x4 v001 = v_load_low(row0 + xi);
+                        // v101 = [v(xi + 0, yi + 1), v(xi + 1, yi + 1)]
+                        v_float32x4 v101 = v_load_low(row1 + xi);
+
+                        v_float32x4 vall = v_combine_low(v001, v101);
+
+                        // assume correct depth is positive
+                        // don't fix missing data
+                        if (v_check_all(vall > v_setzero_f32()))
+                        {
+                            v_float32x4 t = pt - v_cvt_f32(ip);
+                            float tx = t.get0();
+                            t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
+                            v_float32x4 ty = v_setall_f32(t.get0());
+                            // vx is y-interpolated between rows 0 and 1
+                            v_float32x4 vx = v001 + ty * (v101 - v001);
+                            float v0 = vx.get0();
+                            vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
+                            float v1 = vx.get0();
+                            v = v0 + tx * (v1 - v0);
+                        }
+                        else
+                            continue;
+                    }
+
+                    // norm(camPixVec) produces double which is too slow
+                    int _u = (int)projected.get0();
+                    int _v = (int)v_rotate_right<1>(projected).get0();
+                    if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
+                        continue;
+                    float pixNorm = pixNorms.at<float>(_v, _u);
+                    // float pixNorm = sqrt(v_reduce_sum(camPixVec*camPixVec));
+                    // difference between distances of point and of surface to camera
+                    float sdf = pixNorm * (v * dfac - zCamSpace);
+                    // possible alternative is:
+                    // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
+                    if (sdf >= -truncDist)
+                    {
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
+
+                        TsdfVoxel& voxel = volDataY[z * volStrides[2]];
+                        WeightType& weight = voxel.weight;
+                        TsdfType& value = voxel.tsdf;
+
+                        // update TSDF
+                        value = floatToTsdf((tsdfToFloat(value) * weight + tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = (weight + 1) < maxWeight ? (weight + 1) : (WeightType) maxWeight;
+                    }
+                }
+            }
+        }
+    };
+#else
+    auto IntegrateInvoker = [&](const Range& range)
+    {
+        for (int x = range.start; x < range.end; x++)
+        {
+            TsdfVoxel* volDataX = volDataStart + x * volStrides[0];
+            for (int y = 0; y < volResolution; y++)
+            {
+                TsdfVoxel* volDataY = volDataX + y * volStrides[1];
+                // optimization of camSpace transformation (vector addition instead of matmul at each z)
+                Point3f basePt = vol2cam * (Point3f(float(x), float(y), 0.0f) * voxelSize);
+                Point3f camSpacePt = basePt;
+                // zStep == vol2cam*(Point3f(x, y, 1)*voxelSize) - basePt;
+                // zStep == vol2cam*[Point3f(x, y, 1) - Point3f(x, y, 0)]*voxelSize
+                Point3f zStep = Point3f(vol2cam.matrix(0, 2),
+                    vol2cam.matrix(1, 2),
+                    vol2cam.matrix(2, 2)) * voxelSize;
+                int startZ, endZ;
+                if (abs(zStep.z) > 1e-5)
+                {
+                    int baseZ = int(-basePt.z / zStep.z);
+                    if (zStep.z > 0)
+                    {
+                        startZ = baseZ;
+                        endZ = volResolution;
+                    }
+                    else
+                    {
+                        startZ = 0;
+                        endZ = baseZ;
+                    }
+                }
+                else
+                {
+                    if (basePt.z > 0)
+                    {
+                        startZ = 0;
+                        endZ = volResolution;
+                    }
+                    else
+                    {
+                        // z loop shouldn't be performed
+                        startZ = endZ = 0;
+                    }
+                }
+                startZ = max(0, startZ);
+                endZ = min(int(volResolution), endZ);
+
+                for (int z = startZ; z < endZ; z++)
+                {
+                    // optimization of the following:
+                    //Point3f volPt = Point3f(x, y, z)*volume.voxelSize;
+                    //Point3f camSpacePt = vol2cam * volPt;
+
+                    camSpacePt += zStep;
+                    if (camSpacePt.z <= 0)
+                        continue;
+
+                    Point3f camPixVec;
+                    Point2f projected = proj(camSpacePt, camPixVec);
+
+                    depthType v = bilinearDepth(depth, projected);
+                    if (v == 0) {
+                        continue;
+                    }
+
+                    int _u = projected.x;
+                    int _v = projected.y;
+                    if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
+                        continue;
+                    float pixNorm = pixNorms.at<float>(_v, _u);
+
+                    // difference between distances of point and of surface to camera
+                    float sdf = pixNorm * (v * dfac - camSpacePt.z);
+                    // possible alternative is:
+                    // kftype sdf = norm(camSpacePt)*(v*dfac/camSpacePt.z - 1);
+                    if (sdf >= -truncDist)
+                    {
+                        TsdfType tsdf = floatToTsdf(fmin(1.f, sdf * truncDistInv));
+
+                        TsdfVoxel& voxel = volDataY[z * volStrides[2]];
+                        WeightType& weight = voxel.weight;
+                        TsdfType& value = voxel.tsdf;
+
+                        // update TSDF
+                        value = floatToTsdf((tsdfToFloat(value) * weight + tsdfToFloat(tsdf)) / (weight + 1));
+                        weight = min(int(weight + 1), int(maxWeight));
+                    }
+                }
+            }
+        }
+    };
+#endif
+
+    parallel_for_(integrateRange, IntegrateInvoker);
+
+}
+
+} // namespace kinfu
+} // namespace cv
--- a/modules/rgbd/src/tsdf_functions.hpp
+++ b/modules/rgbd/src/tsdf_functions.hpp
@ -0,0 +1,48 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory
+
+#ifndef __OPENCV_TSDF_FUNCTIONS_H__
+#define __OPENCV_TSDF_FUNCTIONS_H__
+
+#include <opencv2/rgbd/volume.hpp>
+#include "tsdf.hpp"
+
+namespace cv
+{
+namespace kinfu
+{
+
+inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
+{
+    v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
+    return v_cvt_f32(num) * num128;
+}
+
+inline TsdfType floatToTsdf(float num)
+{
+    //CV_Assert(-1 < num <= 1);
+    int8_t res = int8_t(num * (-128.f));
+    res = res ? res : (num < 0 ? 1 : -1);
+    return res;
+}
+
+inline float tsdfToFloat(TsdfType num)
+{
+    return float(num) * (-1.f / 128.f);
+}
+
+cv::Mat preCalculationPixNorm(Depth depth, const Intr& intrinsics);
+depthType bilinearDepth(const Depth& m, cv::Point2f pt);
+
+void integrateVolumeUnit(
+    float truncDist, float voxelSize, int maxWeight,
+    cv::Matx44f _pose, int volResolution, Vec4i volStrides,
+    InputArray _depth, float depthFactor, const cv::Matx44f& cameraPose,
+    const cv::kinfu::Intr& intrinsics, InputArray _pixNorms, InputArray _volume);
+
+}  // namespace kinfu
+}  // namespace cv
+#endif