Merge pull request #3614 from hipudding:ascendc

Enable AscendC kernel operator
1 year ago · bbce2ef9d6
parent 97137ec708 09ed18d280
commit bbce2ef9d6
15 changed files with 697 additions and 92 deletions
--- a/modules/cannops/CMakeLists.txt
+++ b/modules/cannops/CMakeLists.txt
@ -15,3 +15,9 @@ ocv_include_directories(${CMAKE_SOURCE_DIR}/modules/ts/include)
 ocv_add_accuracy_tests(DEPENDS_ON opencv_cannops)
 ocv_add_perf_tests(DEPENDS_ON opencv_cannops)
 ocv_add_samples(opencv_cannops)
 # compile ascnedc kernels.
 add_subdirectory(ascendc_kernels)
 ocv_include_directories(${CMAKE_BINARY_DIR}/include/ascendc_kernels)
 ocv_target_link_libraries(opencv_cannops PRIVATE ascendc_kernels)
 ocv_target_link_libraries(opencv_test_cannops PRIVATE ascendc_kernels)
--- a/modules/cannops/ascendc_kernels/CMakeLists.txt
+++ b/modules/cannops/ascendc_kernels/CMakeLists.txt
@ -0,0 +1,17 @@
 set(SOC_VERSION "ascend310p3" CACHE STRING "system on chip type")
 set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
 set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim/cpu")
 if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
 elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
 else()
    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
 endif()
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 ascendc_library(ascendc_kernels STATIC
    threshold_opencv_kernel.cpp
 )
--- a/modules/cannops/ascendc_kernels/kernel_tiling_types.h
+++ b/modules/cannops/ascendc_kernels/kernel_tiling_types.h
@ -0,0 +1,22 @@
 #ifndef KERNEL_TILING_H
 #define KERNEL_TILING_H
 /*
 * threshType:
 *   THRESH_BINARY     = 0,
 *   THRESH_BINARY_INV = 1,
 *   THRESH_TRUNC      = 2,
 *   THRESH_TOZERO     = 3,
 *   THRESH_TOZERO_INV = 4,
 */
 #pragma pack(push, 8)
 struct ThresholdOpencvTilingData
 {
    float maxVal;
    float thresh;
    uint32_t totalLength;
    uint8_t threshType;
    uint8_t dtype;
 };
 #pragma pack(pop)
 #endif // KERNEL_TILING_H
--- a/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
+++ b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
@ -0,0 +1,379 @@
 #include "kernel_operator.h"
 #include "vector_tiling.h"
 #include "kernel_tiling_types.h"
 using namespace AscendC;
 // Make compiler happy. These two function will never be called.
 __aicore__ static inline void Cast(const LocalTensor<half>& dstLocal,
                                   const LocalTensor<half>& srcLocal, const RoundMode& round_mode,
                                   const uint32_t calCount){};
 __aicore__ static inline void Cast(const LocalTensor<float>& dstLocal,
                                   const LocalTensor<float>& srcLocal, const RoundMode& round_mode,
                                   const uint32_t calCount){};
 /**
 * T: input data type.
 * C: data type for calculate.
 * if T != C, data should cast from T to C.
 */
 template <typename T, typename C>
 class KernelThreshold
 {
 public:
    __aicore__ inline KernelThreshold() {}
    __aicore__ inline void Init(ThresholdOpencvTilingData* tiling, GM_ADDR x, GM_ADDR y)
    {
        tilingData = tiling;
        /**
         * Calculate memory use per element.
         * 1. InputQueue: sizeof(T) * BUFFER_NUM
         * 2. OutputQueue: sizeof(T) * BUFFER_NUM
         * 3. maskBuffer: 1 byte at most.
         */
        uint64_t bytesPerElem = sizeof(T) * BUFFER_NUM * 2 + sizeof(uint8_t) * 1;
        /**
         * If need cast, should init two more cast buffers.
         * Memory use per element:
         * 1. InputCastBuffer: sizeof(C)
         * 2. OutputCastBuffer: sizeof(C)
         */
        if (!std::is_same<T, C>::value)
        {
            bytesPerElem += sizeof(C) * 2;
        }
        // Most of AscendC APIs need align to 32 Bytes, but Compare and Select need
        // align to 256 Bytes, 256/sizeof(C) means how many element can be process
        // in one loop.
        vecTiling.calculate(tilingData->totalLength, GetBlockNum(), GetBlockIdx(), bytesPerElem,
                            256 / sizeof(C));
        xGM.SetGlobalBuffer((__gm__ T*)x + vecTiling.blockOffset, vecTiling.blockLength);
        yGM.SetGlobalBuffer((__gm__ T*)y + vecTiling.blockOffset, vecTiling.blockLength);
        // Cast buffer.
        if (!std::is_same<T, C>::value)
        {
            pipe.InitBuffer(InputCastBuffer, vecTiling.loopLength * sizeof(C));
            pipe.InitBuffer(outputCastBuffer, vecTiling.loopLength * sizeof(C));
        }
        pipe.InitBuffer(inputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
        pipe.InitBuffer(outputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
        pipe.InitBuffer(maskBuffer, vecTiling.loopLength * sizeof(uint8_t));
    }
    __aicore__ inline void Run()
    {
        for (uint32_t loop = 0; loop < vecTiling.loopCount; loop++)
        {
            uint32_t offset = loop * vecTiling.loopLength;
            Compute(offset, vecTiling.loopLength);
        }
        if (vecTiling.loopTailLength != 0)
        {
            uint32_t offset = vecTiling.loopCount * vecTiling.loopLength;
            Compute(offset, vecTiling.loopTailLength);
        }
    }
 private:
    __aicore__ inline void Compute(uint32_t offset, uint32_t len)
    {
        CopyIn(offset, len);
        // Get local Tensor, if case is need, local tensors come from
        // cast buffer. otherwise, local tensors come from input/output queue.
        LocalTensor<C> xLocal = CastInput(inputQueue, InputCastBuffer, len);
        LocalTensor<C> yLocal = GetOutput(outputQueue, outputCastBuffer);
        Threshold(xLocal, yLocal, len);
        // Free local input tensor if tensor is not from cast buffer.
        FreeInput(inputQueue, xLocal);
        // Cast output tensor to output queue if output tensor is from cast buffer.
        CastOutput(outputQueue, yLocal, len);
        CopyOut(offset, len);
    }
    /**
     * If need cast:
     * 1. Get data from input queue, this data can't be calculate directly.
     * 2. Get buffer with type C, which satisfied AscendC APIs.
     * 3. Cast data from T to C.
     *
     * If not need cast:
     * 1. Only need get data from queue.
     */
    __aicore__ inline LocalTensor<C> CastInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
                                               TBuf<TPosition::VECCALC>& buffer, uint32_t len)
    {
        LocalTensor<C> xLocal;
        if (std::is_same<T, C>::value)
        {
            xLocal = queue.DeQue<C>();
        }
        else
        {
            xLocal = buffer.Get<C>();
            LocalTensor<T> xCast = queue.DeQue<T>();
            Cast(xLocal, xCast, RoundMode::CAST_NONE, len);
            queue.FreeTensor(xCast);
        }
        return xLocal;
    }
    /**
     * If need cast:
     * 1. Get local tensor from cast buffer.
     *
     * If not need cast:
     * 1. Alloc local tensor from output queue.
     */
    __aicore__ inline LocalTensor<C> GetOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
                                               TBuf<TPosition::VECCALC>& buffer)
    {
        if (std::is_same<T, C>::value)
        {
            return queue.AllocTensor<C>();
        }
        else
        {
            return buffer.Get<C>();
        }
    }
    /**
     * If need cast:
     * 1. Input local tensor are get from cast buffer, which do not need free.
     *
     * If not need cast:
     * 1. Input local tensor are alloced from input queue, which need free.
     */
    __aicore__ inline void FreeInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
                                     LocalTensor<C>& xLocal)
    {
        if (std::is_same<T, C>::value)
        {
            queue.FreeTensor(xLocal);
        }
    }
    /**
     * If need cast:
     * 1. Alloc local tensor from output queue.
     * 2. Cast from C to T.
     * 3. Put casted local tensor in queue.
     *
     * If not need cast:
     * 1. Only put local tensor in queue.
     *
     */
    __aicore__ inline void CastOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
                                      LocalTensor<C>& yLocal, uint32_t len)
    {
        if (std::is_same<T, C>::value)
        {
            queue.EnQue(yLocal);
        }
        else
        {
            LocalTensor<T> yCast = queue.AllocTensor<T>();
            RoundMode roundMode = RoundMode::CAST_NONE;
            // Ref to AscendC cast API.
            if (std::is_same<T, int16_t>::value)
            {
                roundMode = RoundMode::CAST_RINT;
            }
            else if (std::is_same<T, int32_t>::value)
            {
                roundMode = RoundMode::CAST_ROUND;
            }
            Cast(yCast, yLocal, roundMode, len);
            queue.EnQue(yCast);
        }
    }
    __aicore__ inline void CopyIn(uint32_t offset, uint32_t len)
    {
        LocalTensor<T> xLocal = inputQueue.AllocTensor<T>();
        DataCopy(xLocal, xGM[offset], len);
        inputQueue.EnQue(xLocal);
    }
    __aicore__ inline void CopyOut(uint32_t offset, uint32_t len)
    {
        LocalTensor<T> yLocal = outputQueue.DeQue<T>();
        DataCopy(yGM[offset], yLocal, len);
        outputQueue.FreeTensor(yLocal);
    }
    /**
     * AscendC API Compare Warpper.
     * AscendC Compare level2 API need input length align to 256, process
     * tail data by level0 API.
     */
    __aicore__ inline void CompareWrap(const LocalTensor<uint8_t>& dstLocal,
                                       const LocalTensor<C>& src0Local,
                                       const LocalTensor<C>& src1Local, CMPMODE cmpMode,
                                       uint32_t calCount)
    {
        // Elements total count for on loop inside Compare.
        uint32_t batchCount = 256 / sizeof(C);
        // Tail elements count.
        uint32_t tailCount = calCount % batchCount;
        // Level2 API, calCount should align to 256.
        Compare(dstLocal, src0Local, src1Local, cmpMode, calCount - tailCount);
        // Data blocks are already cut align to 256, tail count will be 0 for
        // all process loops except last one.
        if (tailCount != 0)
        {
            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
            uint32_t tailIdx = calCount - tailCount;
            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
            Compare(dstLocal[maskIdx], src0Local[tailIdx], src1Local[tailIdx], cmpMode, tailCount,
                    1, repeatParams);
        }
    }
    /**
     * AscendC API Select Warpper.
     * AscendC Select level2 API need input length align to 256, process
     * tail data by level0 API.
     */
    __aicore__ inline void SelectWrap(const LocalTensor<C>& dstLocal,
                                      const LocalTensor<uint8_t>& selMask,
                                      const LocalTensor<C>& src0Local, C src1Local, SELMODE selMode,
                                      uint32_t calCount)
    {
        uint32_t batchCount = 256 / sizeof(C);
        uint32_t tailCount = calCount % batchCount;
        Select(dstLocal, selMask, src0Local, src1Local, selMode, calCount - tailCount);
        if (tailCount != 0)
        {
            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
            uint32_t tailIdx = calCount - tailCount;
            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
            Select(dstLocal[tailIdx], selMask[maskIdx], src0Local[tailIdx], src1Local, selMode,
                   tailCount, 1, repeatParams);
        }
    }
    __aicore__ inline void Threshold(LocalTensor<C>& xLocal, LocalTensor<C>& yLocal, uint32_t len)
    {
        LocalTensor<uint8_t> mask = maskBuffer.Get<uint8_t>();
        Duplicate(yLocal, static_cast<C>(tilingData->thresh), len);
        switch (tilingData->threshType)
        {
            case 0:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
                Duplicate(yLocal, static_cast<C>(0), len);
                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 1:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
                Duplicate(yLocal, static_cast<C>(0), len);
                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 2:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
                SelectWrap(yLocal, mask, xLocal, static_cast<C>(tilingData->thresh),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 3:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 4:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            default:
                break;
        }
    }
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inputQueue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outputQueue;
    TBuf<TPosition::VECCALC> InputCastBuffer, outputCastBuffer, maskBuffer;
    GlobalTensor<T> xGM, yGM;
    VectorTiling vecTiling;
    ThresholdOpencvTilingData* tilingData;
 };
 #define LAUNCH_THRESHOLD_KERNEL(NAME, T, C)                                                      \
    __aicore__ inline void launch_threshold_kernel_##NAME(ThresholdOpencvTilingData* tilingData, \
                                                          GM_ADDR x, GM_ADDR y)                  \
    {                                                                                            \
        KernelThreshold<T, C> op;                                                                \
        op.Init(tilingData, x, y);                                                               \
        op.Run();                                                                                \
    }
 LAUNCH_THRESHOLD_KERNEL(CV_8U, uint8_t, half)   // CV_8U
 LAUNCH_THRESHOLD_KERNEL(CV_8S, int8_t, half)    // CV_8S
                                                // CV_16U
 LAUNCH_THRESHOLD_KERNEL(CV_16S, int16_t, half)  // CV_16S
 LAUNCH_THRESHOLD_KERNEL(CV_32S, int32_t, float) // CV_32S
 LAUNCH_THRESHOLD_KERNEL(CV_32F, float, float)   // CV_32F
                                                // CV_64F
 LAUNCH_THRESHOLD_KERNEL(CV_16F, half, half)     // CV_16F
 #undef LAUNCH_THRESHOLD_KERNEL
 #define CALL_THRESHOLD_KERNEL(NAME) launch_threshold_kernel_##NAME
 extern "C" __global__ __aicore__ void threshold_opencv(GM_ADDR tilingGM, GM_ADDR x, GM_ADDR y)
 {
    ThresholdOpencvTilingData tilingData;
    auto tempTilingGM = (__gm__ uint8_t*)tilingGM;
    auto tempTiling = (uint8_t*)&tilingData;
    for (int32_t i = 0; i < sizeof(ThresholdOpencvTilingData) / sizeof(uint8_t);
         ++i, ++tempTilingGM, ++tempTiling)
    {
        *tempTiling = *tempTilingGM;
    }
    // AscendC can only call inline functions, function pointer can't be used here.
    // Use Macro and switch case instead.
    switch (tilingData.dtype)
    {
        case 0:
            CALL_THRESHOLD_KERNEL(CV_8U)(&tilingData, x, y);
            break;
        case 1:
            CALL_THRESHOLD_KERNEL(CV_8S)(&tilingData, x, y);
            break;
        case 3:
            CALL_THRESHOLD_KERNEL(CV_16S)(&tilingData, x, y);
            break;
        case 4:
            CALL_THRESHOLD_KERNEL(CV_32S)(&tilingData, x, y);
            break;
        case 5:
            CALL_THRESHOLD_KERNEL(CV_32F)(&tilingData, x, y);
            break;
        case 7:
            CALL_THRESHOLD_KERNEL(CV_16F)(&tilingData, x, y);
            break;
        case 2: case 6: default: // CV_16U, CV_64F
            break;
    }
    // Clear tiling GM cache manually. (cce compiler bug)
    dcci(tilingGM, 1);
 }
--- a/modules/cannops/ascendc_kernels/vector_tiling.h
+++ b/modules/cannops/ascendc_kernels/vector_tiling.h
@ -0,0 +1,77 @@
 #ifndef TILING_KERNEL_H
 #define TILING_KERNEL_H
 #ifdef __CCE_KT_TEST__
 #define __aicore__
 #else
 #define __aicore__ [aicore]
 #endif
 inline __aicore__ int32_t AlignNCeil(int32_t n, int32_t align) { return ((n + align) & ~(align-1)); }
 inline __aicore__ int32_t AlignNFloor(int32_t n, int32_t align) { return (n & ~(align-1)); }
 constexpr int32_t BUFFER_NUM = 2;
 constexpr int32_t UB_BUF_LEN = 248 * 1024;
 struct VectorTiling {
  __aicore__ inline void calculate(uint64_t _totalLength, uint64_t _blockNum,
                                   uint64_t _blockIdx, uint64_t _variableBytesPerElem, uint32_t _align) {
    totalLength = _totalLength;
    blockNum = _blockNum;
    blockIdx = _blockIdx;
    variableBytesPerElem = _variableBytesPerElem;
    blockLength = 0;
    blockOffset = 0;
    align = _align;
    GetBlockLengthAndOffset();
    GetLoopLengthAndCount();
 #ifdef __CCE_KT_TEST__
    std::cout << "Block(" << blockIdx << "): BlockLength = " << blockLength
              << ", BlockOffset = " << blockOffset
              << ", LoopLength = " << loopLength
              << ", LoopCount = " << loopCount
              << ", LoopTailLength = " << loopTailLength << std::endl;
 #endif
  }
  __aicore__ inline void GetBlockLengthAndOffset() {
    // Data should Align by 32B.
    uint32_t fullBlockLength = AlignNCeil(totalLength / blockNum, 32);
    // Some core may get no data after Align32 Ceil.
    uint32_t fullBlockNum = totalLength / fullBlockLength;
    uint32_t blockTailLength = totalLength % fullBlockLength;
    if (blockIdx < fullBlockNum) {
      blockLength = fullBlockLength;
      blockOffset = blockIdx * blockLength;
      // Last block must less than full block num.
    } else if (blockTailLength != 0 && blockIdx == fullBlockNum) {
      blockLength = blockTailLength;
      blockOffset = blockIdx * fullBlockLength;
    }
  }
  /**
   * @brief Get length for one loop and loop count.
   * Use as much UB buf as possible.
   */
  __aicore__ inline void GetLoopLengthAndCount() {
    loopLength = AlignNFloor(UB_BUF_LEN / variableBytesPerElem, align);
    loopCount = blockLength / loopLength;
    loopTailLength = blockLength - (loopLength * loopCount);
  }
  uint64_t totalLength;
  uint64_t blockNum;
  uint64_t blockIdx;
  uint64_t variableBytesPerElem;
  uint32_t blockLength;
  uint32_t blockOffset;
  uint32_t loopLength;
  uint32_t loopCount;
  uint32_t loopTailLength;
  uint32_t align;
 };
 #endif  // TILING_KERNEL_H
--- a/modules/cannops/include/opencv2/ascendc_kernels.hpp
+++ b/modules/cannops/include/opencv2/ascendc_kernels.hpp
@ -0,0 +1,7 @@
 #ifndef ASCENDC_KERNELS_H
 #define ASCENDC_KERNELS_H
 #include "../../ascendc_kernels/kernel_tiling_types.h"
 #include "aclrtlaunch_threshold_opencv.h"
 #endif //ASCENDC_KERNELS_H
--- a/modules/cannops/include/opencv2/cann_call.hpp
+++ b/modules/cannops/include/opencv2/cann_call.hpp
@ -9,7 +9,9 @@
 #include <set>
 #include <string>
 #include <acl/acl_base.h>
-#include "opencv2/cann.hpp"
+#include "cann.hpp"
 #include "stream_accessor.hpp"
 #include "ascendc_kernels.hpp"
 class aclopAttr;
@ -17,6 +19,15 @@ namespace cv
 {
 namespace cann
 {
 CV_EXPORTS void checkAclError(aclError err, const char* file, const int line, const char* func);
 void checkAclPtr(void* ptr, const char* file, const int line, const char* func);
 #define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func)
 #define CV_ACL_SAFE_CALL_PTR(expr)                     \
    ({                                                 \
        auto ptr = (expr);                             \
        checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
        ptr;                                           \
    })
 // Warpper for functions in CANN, callers should not call CANN's api directly, but should call the
 // function provided in cann_call.
 void aclrtMallocWarpper(void** data, size_t size);
@ -39,7 +50,7 @@ void aclrtMemsetWarpper(std::shared_ptr<uchar>& ptr, int32_t value, size_t count
 //! Type mapping between opencv and cann.
 aclDataType getACLType(int opencvdepth);
 //! Malloc and upload raw data to devices.
-std::shared_ptr<uchar> mallocAndUpload(const void* data, size_t size, AscendStream& stream,
+CV_EXPORTS std::shared_ptr<uchar> mallocAndUpload(const void* data, size_t size, AscendStream& stream,
                                       AscendMat::Allocator* allocator);
 /**
 * @brief Warpper of CANN streams.
@ -151,6 +162,19 @@ public:
    OperatorRunner& run(AscendStream& stream);
 };
 template <typename KERNEL_TYPE, typename TILING_TYPE, typename... ARGS>
 void kernel_launch(KERNEL_TYPE kernel, AscendStream& stream, TILING_TYPE& tiling, ARGS... args)
 {
    std::shared_ptr<uchar> tilingDevice =
        mallocAndUpload(&tiling, sizeof(TILING_TYPE), stream, AscendMat::defaultAllocator());
    aclrtStream rawStream = AscendStreamAccessor::getStream(stream);
    CV_ACL_SAFE_CALL(kernel(1, rawStream, tilingDevice.get(), args...));
    if (rawStream == nullptr)
    {
        stream.waitForCompletion();
    }
 }
 } // namespace cann
 } // namespace cv
--- a/modules/cannops/perf/perf_element_operations.cpp
+++ b/modules/cannops/perf/perf_element_operations.cpp
@ -207,5 +207,25 @@ PERF_TEST_P(CPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES,
    SANITY_CHECK_NOTHING();
 }
 PERF_TEST_P(NPU, THRESHOLD_ASCENDC, testing::Combine(TYPICAL_ASCEND_MAT_SIZES,  Values(CV_8U, CV_16S, CV_32F)))
 {
    Mat mat(GET_PARAM(0), GET_PARAM(1));
    AscendMat dst;
    AscendMat src;
    src.upload(mat);
    declare.in(mat, WARMUP_RNG);
    TEST_CYCLE_N(10) { cv::cann::threshold(src, dst, 100.0, 255.0, cv::THRESH_BINARY); }
    SANITY_CHECK_NOTHING();
 }
 PERF_TEST_P(CPU, THRESHOLD, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, Values(CV_8U, CV_16S, CV_32F)))
 {
    Mat mat(GET_PARAM(0), GET_PARAM(1));
    Mat dst;
    declare.in(mat, WARMUP_RNG);
    TEST_CYCLE_N(10) { cv::threshold(mat, dst, 100.0, 255.0, cv::THRESH_BINARY); }
    SANITY_CHECK_NOTHING();
 }
 } // namespace
 } // namespace opencv_test
--- a/modules/cannops/src/ascend_mat.cpp
+++ b/modules/cannops/src/ascend_mat.cpp
@ -23,7 +23,11 @@ std::shared_ptr<uchar> DefaultAllocator::allocate(size_t size)
 bool DefaultAllocator::allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize)
 {
-    mat->data = allocate(elemSize * cols * rows);
+    size_t totalBytes = elemSize * cols * rows;
    // align by 32B.
    totalBytes = ((totalBytes + 32) & ~31);
    mat->data = allocate(totalBytes);
    mat->step = cols * elemSize;
    return true;
--- a/modules/cannops/src/cann_call.cpp
+++ b/modules/cannops/src/cann_call.cpp
@ -11,7 +11,7 @@ namespace cv
 namespace cann
 {
 /*******************************Acl Error Checker*****************************/
-static inline void checkAclError(aclError err, const char* file, const int line, const char* func)
+void checkAclError(aclError err, const char* file, const int line, const char* func)
 {
    if (ACL_SUCCESS != err)
    {
@ -20,7 +20,7 @@ static inline void checkAclError(aclError err, const char* file, const int line,
    }
 }
-static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
+void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
 {
    if (nullptr == ptr)
    {
@ -29,14 +29,6 @@ static inline void checkAclPtr(void* ptr, const char* file, const int line, cons
    }
 }
 #define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func)
 #define CV_ACL_SAFE_CALL_PTR(expr)                     \
    ({                                                 \
        auto ptr = (expr);                             \
        checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
        ptr;                                           \
    })
 /******************************Acl Runtime Warpper****************************/
 void aclrtMallocWarpper(void** data, size_t size)
 {
--- a/modules/cannops/src/element_operations.cpp
+++ b/modules/cannops/src/element_operations.cpp
@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 #include "precomp.hpp"
 namespace cv
 {
 namespace cann
@ -110,8 +111,8 @@ static void convert(const Scalar& src, Scalar& dst, AscendStream& stream)
 }
 template <typename T1, typename T2>
-static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask, float scale,
+static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask,
-                      int dtype, const char* op, AscendStream& stream)
+                      float scale, int dtype, const char* op, AscendStream& stream)
 {
    T1 castedSrc1;
    T2 castedSrc2;
@ -170,8 +171,9 @@ static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const Asce
    }
 }
-static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst, const InputArray _mask,
+static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst,
-                      float scale, int dtype, const char* op, AscendStream& stream)
+                      const InputArray _mask, float scale, int dtype, const char* op,
                      AscendStream& stream)
 {
    const bool isScalar1 = (_src1.kind() == _InputArray::MATX);
    const bool isScalar2 = (_src2.kind() == _InputArray::MATX);
@ -213,56 +215,54 @@ static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArra
 }
 // In order to supply more interfaces, differnet function declaration shoule be done.
-void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
+void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-         AscendStream& stream)
+         int dtype, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
-void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-         AscendStream& stream)
+         int dtype, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
-void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask,
-         AscendStream& stream)
+         int dtype, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
-void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-         AscendStream& stream)
+         int dtype, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
-
+void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
+              int dtype, AscendStream& stream)
              AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
-void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-              AscendStream& stream)
+              int dtype, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
-void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask,
-              AscendStream& stream)
+              int dtype, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
-void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-              AscendStream& stream)
+              int dtype, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 void multiply(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype,
              AscendStream& stream)
 {
@ -287,7 +287,6 @@ void multiply(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float s
    arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream);
 }
 void divide(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype,
            AscendStream& stream)
 {
@ -312,15 +311,14 @@ void divide(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float sca
    arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream);
 }
-
+void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst,
-void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
+                 const InputArray mask, AscendStream& stream)
                 AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
-void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst,
-                 AscendStream& stream)
+                 const AscendMat& mask, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
@ -337,9 +335,8 @@ void bitwise_and(const Scalar& src1, const AscendMat& src2, AscendMat& dst, cons
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
-
+void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst,
-void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
+                const InputArray mask, AscendStream& stream)
                AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
 }
@ -362,15 +359,14 @@ void bitwise_or(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
 }
-
+void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst,
-void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
+                 const InputArray mask, AscendStream& stream)
                 AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
-void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst,
-                 AscendStream& stream)
+                 const AscendMat& mask, AscendStream& stream)
 {
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
@ -387,7 +383,6 @@ void bitwise_xor(const Scalar& src1, const AscendMat& src2, AscendMat& dst, cons
    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
 void bitwise_not(const InputArray src, OutputArray dst, const InputArray mask, AscendStream& stream)
 {
    arithm_op(src, noArray(), dst, mask, 1, -1, "Invert", stream);
@ -398,9 +393,8 @@ void bitwise_not(const AscendMat& src, AscendMat& dst, const AscendMat& mask, As
    arithm_op(src, AscendMat(), dst, mask, 1, -1, "Invert", stream);
 }
-
+void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta,
-void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta, double gamma,
+                 double gamma, AscendMat& dst, int dtype, AscendStream& stream)
                 AscendMat& dst, int dtype, AscendStream& stream)
 {
    if (dtype < 0)
        dtype = src1.depth();
@ -421,8 +415,8 @@ void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, dou
    arithm_op(srcWeightedSumRet, (float)gamma, dst, "Adds", stream);
 }
-void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta, double gamma,
+void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta,
-                 OutputArray _dst, int dtype, AscendStream& stream)
+                 double gamma, OutputArray _dst, int dtype, AscendStream& stream)
 {
    AscendMat src1, src2, dst;
    src1.upload(_src1, stream);
@ -442,45 +436,23 @@ double threshold(const AscendMat& src, AscendMat& dst, double thresh, double max
    dst.create(src.rows, src.cols, src.type());
-    OperatorRunner runner;
+    if (src.depth() == CV_8U || src.depth() == CV_8S || src.depth() == CV_16S ||
-    runner.setOp("Threshold")
+        src.depth() == CV_32S || src.depth() == CV_32F || src.depth() == CV_16F)
        .addInput(src, "x")
        .addOutput(threshMat, "y")
        .addAttr((float)thresh, "threshold")
        .run(stream);
    // THRESH_*_INV, THRESH_TRUNC need a inverse threshMat.
    // THRESH_BINARY_INV = 1, THRESH_TRUNC = 2, THRESH_TOZERO_INV = 4,
    if (type == 1 || type == 2 || type == 4)
    {
-        AscendMat threshInvMat(src.size(), src.type());
+        ThresholdOpencvTilingData tiling;
-        AscendMat ones(src.size(), src.type());
+        tiling.maxVal = maxval;
-        Scalar s(1, 1, 1, 1);
+        tiling.thresh = thresh;
-        ones.setTo(s, stream);
+        // AscendMat memory will be align to 32B, it's safe to set totalLengh a little bigger.
-        arithm_op(ones, threshMat, threshInvMat, "Sub", stream);
+        size_t totalBytes = src.rows * src.cols * src.channels();
-
+        tiling.totalLength = ALIGN_UP(totalBytes, 32);
-        if (type == 1)
+        tiling.threshType = type;
-            arithm_op(threshInvMat, (float)maxval, dst, "Muls", stream);
+        tiling.dtype = src.depth();
-        else if (type == 2)
+
-        {
+        kernel_launch(aclrtlaunch_threshold_opencv, stream, tiling, src.data.get(), dst.data.get());
            AscendMat ToZeroInvMat(src.size(), src.type());
            AscendMat TruncMat(src.size(), src.type());
            arithm_op(threshInvMat, src, ToZeroInvMat, "Mul", stream);
            arithm_op(threshMat, (float)thresh, TruncMat, "Muls", stream);
            arithm_op(ToZeroInvMat, TruncMat, dst, "Add", stream);
        }
        else
            arithm_op(threshInvMat, src, dst, "Mul", stream);
    }
    else
-    {
+        CV_Error(Error::StsUnsupportedFormat, "");
-        if (type == 0) /* THRESH_BINARY = 0 */
+
            arithm_op(threshMat, (float)maxval, dst, "Muls", stream);
        else if (type == 3) /* THRESH_TOZERO = 3 */
            arithm_op(threshMat, src, dst, "Mul", stream);
        else
            CV_Error(Error::StsError, "Unknown/unsupported threshold type");
    }
    return thresh;
 }
--- a/modules/cannops/src/precomp.hpp
+++ b/modules/cannops/src/precomp.hpp
@ -10,5 +10,7 @@
 #include "opencv2/cann_call.hpp"
 #include "opencv2/cann_interface.hpp"
 #include "opencv2/cann_private.hpp"
 #include "opencv2/ascendc_kernels.hpp"
 #define ALIGN_UP(num, align) (((num) + (align) - 1) & ~((align) - 1))
 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/cannops/test/test_element_operations.cpp
+++ b/modules/cannops/test/test_element_operations.cpp
@ -678,7 +678,6 @@ TEST(ELEMENTWISE_OP, MAT_THRESHOLD)
    for (int i = 0; i <= 4; i++)
    {
        cv::threshold(cpuMat, cpuOpRet, 128, 250, i);
        // TODO find the reason empty AscendMat is not continuous.
        cv::cann::threshold(ascendMat16F, aclOpRet, 128, 250, i);
        aclOpRet.convertTo(aclOpRet16S, CV_16S);
        aclOpRet16S.download(checker);
@ -693,5 +692,37 @@ TEST(ELEMENTWISE_OP, MAT_THRESHOLD)
    cv::cann::resetDevice();
 }
 TEST(ELEMENTWISE_OP, MAT_THRESHOLD_ASCENDC)
 {
    cv::cann::setDevice(DEVICE_ID);
    Mat cpuRet, npuRet;
    AscendMat npuImg, npuTmpMat;
    // opencv do not support CV_8S, CV_32S, CV_16F
    // ascend do not support CV_16U, CV_64F
    uint8_t dtypes[] = {CV_8U, CV_16S, CV_32F};
    for (uint i = 0; i <= 4; i++)
    {
        for (uint j = 0; j < sizeof(dtypes) / sizeof(dtypes[0]); j++)
        {
            double thresh = 90.5;
            double maxVal = 85.2;
            Mat img = randomMat(10, 10, CV_MAKETYPE(dtypes[j], 3), 0.0f, 128.0f);
            npuImg.upload(img);
            npuTmpMat.create(npuImg.rows, npuImg.cols, npuImg.type());
            cv::threshold(img, cpuRet, thresh, maxVal, i);
            cv::cann::threshold(npuImg, npuTmpMat, thresh, maxVal, i);
            npuTmpMat.download(npuRet);
            EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f);
        }
    }
    cv::cann::resetDevice();
 }
 } // namespace
 } // namespace opencv_test
--- a/modules/cannops/test/test_kernel.cpp
+++ b/modules/cannops/test/test_kernel.cpp
@ -0,0 +1,51 @@
 #include "test_precomp.hpp"
 #include "opencv2/cann_call.hpp"
 namespace opencv_test
 {
 namespace
 {
 TEST(ASCENDC_KERNEL, THRESHOLD)
 {
    cv::cann::setDevice(DEVICE_ID);
    Mat cpuRet, npuRet;
    AscendMat npuImg, npuTmpMat;
    // opencv do not support CV_8S, CV_32S, CV_16F
    // ascend do not support CV_16U, CV_64F
    uint8_t dtypes[] = {CV_8U, CV_16S, CV_32F};
    for (uint i = 0; i <= 4; i++)
    {
        for (uint j = 0; j < sizeof(dtypes) / sizeof(dtypes[0]); j++)
        {
            double thresh = 90.5;
            double maxVal = 85.2;
            Mat img = randomMat(10, 10, CV_MAKETYPE(dtypes[j], 3), 0.0f, 128.0f);
            npuImg.upload(img);
            npuTmpMat.create(npuImg.rows, npuImg.cols, npuImg.type());
            cv::threshold(img, cpuRet, thresh, maxVal, i);
            ThresholdOpencvTilingData tiling;
            tiling.maxVal = maxVal;
            tiling.thresh = thresh;
            size_t totalBytes = img.rows * img.cols * img.channels();
            // AscendMat memory will be align to 32B, it's safe to set totalLengh a little bigger.
            tiling.totalLength = ((totalBytes + 32) & ~31);
            tiling.threshType = i;
            tiling.dtype = dtypes[j];
            kernel_launch(aclrtlaunch_threshold_opencv, AscendStream::Null(), tiling,
                          npuImg.data.get(), npuTmpMat.data.get());
            npuTmpMat.download(npuRet);
            EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f);
        }
    }
    cv::cann::resetDevice();
 }
 } // namespace
 } // namespace opencv_test
--- a/modules/cannops/test/test_precomp.hpp
+++ b/modules/cannops/test/test_precomp.hpp
@ -9,6 +9,7 @@
 #include "opencv2/cann.hpp"
 #include "opencv2/ts/cuda_test.hpp"
 #include "opencv2/cann_interface.hpp"
 #include "opencv2/ascendc_kernels.hpp"
 using namespace cv;
 using namespace cv::cann;