#include "kernel_operator.h" |
|
#include "vector_tiling.h" |
|
#include "kernel_tiling_types.h" |
|
|
|
using namespace AscendC; |
|
|
|
// Make the compiler happy. These two functions will never be called.
__aicore__ static inline void Cast(const LocalTensor<half>& dstLocal,
                                   const LocalTensor<half>& srcLocal, const RoundMode& round_mode,
                                   const uint32_t calCount){};
__aicore__ static inline void Cast(const LocalTensor<float>& dstLocal,
                                   const LocalTensor<float>& srcLocal, const RoundMode& round_mode,
                                   const uint32_t calCount){};

/**
 * T: input data type.
 * C: data type used for calculation.
 * If T != C, data is cast from T to C before calculation.
 */
template <typename T, typename C>
class KernelThreshold
{
public:
    __aicore__ inline KernelThreshold() {}
    __aicore__ inline void Init(ThresholdOpencvTilingData* tiling, GM_ADDR x, GM_ADDR y)
    {
        tilingData = tiling;

        /**
         * Calculate the memory used per element:
         * 1. inputQueue:  sizeof(T) * BUFFER_NUM
         * 2. outputQueue: sizeof(T) * BUFFER_NUM
         * 3. maskBuffer:  1 byte at most.
         */
        uint64_t bytesPerElem = sizeof(T) * BUFFER_NUM * 2 + sizeof(uint8_t) * 1;

        /**
         * If a cast is needed, two extra cast buffers must be initialized,
         * adding per element:
         * 1. InputCastBuffer:  sizeof(C)
         * 2. outputCastBuffer: sizeof(C)
         */
        if (!std::is_same<T, C>::value)
        {
            bytesPerElem += sizeof(C) * 2;
        }
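        // Illustrative example (assuming BUFFER_NUM == 2 for double buffering):
        // with T = uint8_t and C = half, bytesPerElem = 1 * 2 * 2 + 1 + 2 * 2 = 9.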

        // Most AscendC APIs require 32-byte alignment, but Compare and Select
        // require 256-byte alignment; 256 / sizeof(C) is the number of elements
        // that can be processed in one loop.
        vecTiling.calculate(tilingData->totalLength, GetBlockNum(), GetBlockIdx(), bytesPerElem,
                            256 / sizeof(C));

        xGM.SetGlobalBuffer((__gm__ T*)x + vecTiling.blockOffset, vecTiling.blockLength);
        yGM.SetGlobalBuffer((__gm__ T*)y + vecTiling.blockOffset, vecTiling.blockLength);

        // Cast buffers.
        if (!std::is_same<T, C>::value)
        {
            pipe.InitBuffer(InputCastBuffer, vecTiling.loopLength * sizeof(C));
            pipe.InitBuffer(outputCastBuffer, vecTiling.loopLength * sizeof(C));
        }

        pipe.InitBuffer(inputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
        pipe.InitBuffer(outputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
        pipe.InitBuffer(maskBuffer, vecTiling.loopLength * sizeof(uint8_t));
    }

    __aicore__ inline void Run()
    {
        for (uint32_t loop = 0; loop < vecTiling.loopCount; loop++)
        {
            uint32_t offset = loop * vecTiling.loopLength;
            Compute(offset, vecTiling.loopLength);
        }
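        // Any remainder that does not fill a full loopLength is handled as a
        // tail below. Illustrative example: with blockLength = 1000 and
        // loopLength = 256, loopCount would be 3 and loopTailLength 232
        // (assumed relationship; the actual values come from vecTiling.calculate).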

        if (vecTiling.loopTailLength != 0)
        {
            uint32_t offset = vecTiling.loopCount * vecTiling.loopLength;
            Compute(offset, vecTiling.loopTailLength);
        }
    }

private:
    __aicore__ inline void Compute(uint32_t offset, uint32_t len)
    {
        CopyIn(offset, len);

        // Get the local tensors. If a cast is needed, they come from the cast
        // buffers; otherwise they come from the input/output queues.
        LocalTensor<C> xLocal = CastInput(inputQueue, InputCastBuffer, len);
        LocalTensor<C> yLocal = GetOutput(outputQueue, outputCastBuffer);

        Threshold(xLocal, yLocal, len);

        // Free the local input tensor if it was not taken from the cast buffer.
        FreeInput(inputQueue, xLocal);
        // Cast the output tensor into the output queue if it came from the cast buffer.
        CastOutput(outputQueue, yLocal, len);

        CopyOut(offset, len);
    }

    /**
     * If a cast is needed:
     * 1. Get data from the input queue; this data can't be used for calculation directly.
     * 2. Get a buffer of type C, which satisfies the AscendC APIs.
     * 3. Cast the data from T to C.
     *
     * If no cast is needed:
     * 1. Just get the data from the queue.
     */
    __aicore__ inline LocalTensor<C> CastInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
                                               TBuf<TPosition::VECCALC>& buffer, uint32_t len)
    {
        LocalTensor<C> xLocal;
        if (std::is_same<T, C>::value)
        {
            xLocal = queue.DeQue<C>();
        }
        else
        {
            xLocal = buffer.Get<C>();
            LocalTensor<T> xCast = queue.DeQue<T>();
            Cast(xLocal, xCast, RoundMode::CAST_NONE, len);
            queue.FreeTensor(xCast);
        }
        return xLocal;
    }

    /**
     * If a cast is needed:
     * 1. Get the local tensor from the cast buffer.
     *
     * If no cast is needed:
     * 1. Allocate a local tensor from the output queue.
     */
    __aicore__ inline LocalTensor<C> GetOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
                                               TBuf<TPosition::VECCALC>& buffer)
    {
        if (std::is_same<T, C>::value)
        {
            return queue.AllocTensor<C>();
        }
        else
        {
            return buffer.Get<C>();
        }
    }

    /**
     * If a cast is needed:
     * 1. The input local tensor was taken from the cast buffer and does not need to be freed.
     *
     * If no cast is needed:
     * 1. The input local tensor was allocated from the input queue and must be freed.
     */
    __aicore__ inline void FreeInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
                                     LocalTensor<C>& xLocal)
    {
        if (std::is_same<T, C>::value)
        {
            queue.FreeTensor(xLocal);
        }
    }

    /**
     * If a cast is needed:
     * 1. Allocate a local tensor from the output queue.
     * 2. Cast from C to T.
     * 3. Enqueue the casted local tensor.
     *
     * If no cast is needed:
     * 1. Just enqueue the local tensor.
     */
    __aicore__ inline void CastOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
                                      LocalTensor<C>& yLocal, uint32_t len)
    {
        if (std::is_same<T, C>::value)
        {
            queue.EnQue(yLocal);
        }
        else
        {
            LocalTensor<T> yCast = queue.AllocTensor<T>();
            RoundMode roundMode = RoundMode::CAST_NONE;
            // Refer to the AscendC Cast API for the supported rounding modes.
            if (std::is_same<T, int16_t>::value)
            {
                roundMode = RoundMode::CAST_RINT;
            }
            else if (std::is_same<T, int32_t>::value)
            {
                roundMode = RoundMode::CAST_ROUND;
            }
            Cast(yCast, yLocal, roundMode, len);
            queue.EnQue(yCast);
        }
    }

    __aicore__ inline void CopyIn(uint32_t offset, uint32_t len)
    {
        LocalTensor<T> xLocal = inputQueue.AllocTensor<T>();
        DataCopy(xLocal, xGM[offset], len);
        inputQueue.EnQue(xLocal);
    }

    __aicore__ inline void CopyOut(uint32_t offset, uint32_t len)
    {
        LocalTensor<T> yLocal = outputQueue.DeQue<T>();
        DataCopy(yGM[offset], yLocal, len);
        outputQueue.FreeTensor(yLocal);
    }

    /**
     * AscendC Compare API wrapper.
     * The AscendC Compare level-2 API requires the input length to be aligned
     * to 256 bytes; the tail data is processed with the level-0 API.
     */
    __aicore__ inline void CompareWrap(const LocalTensor<uint8_t>& dstLocal,
                                       const LocalTensor<C>& src0Local,
                                       const LocalTensor<C>& src1Local, CMPMODE cmpMode,
                                       uint32_t calCount)
    {
        // Total number of elements processed in one inner loop of Compare.
        uint32_t batchCount = 256 / sizeof(C);
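        // e.g. 256 / sizeof(half) = 128 elements, 256 / sizeof(float) = 64 elements.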

        // Tail element count.
        uint32_t tailCount = calCount % batchCount;

        // Level-2 API: the element count must correspond to a multiple of 256 bytes.
        Compare(dstLocal, src0Local, src1Local, cmpMode, calCount - tailCount);

        // Data blocks are already cut to 256-byte alignment, so the tail count
        // is 0 for every processing loop except the last one.
        if (tailCount != 0)
        {
            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
            uint32_t tailIdx = calCount - tailCount;
            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
            Compare(dstLocal[maskIdx], src0Local[tailIdx], src1Local[tailIdx], cmpMode, tailCount,
                    1, repeatParams);
        }
    }

    /**
     * AscendC Select API wrapper.
     * The AscendC Select level-2 API requires the input length to be aligned
     * to 256 bytes; the tail data is processed with the level-0 API.
     */
    __aicore__ inline void SelectWrap(const LocalTensor<C>& dstLocal,
                                      const LocalTensor<uint8_t>& selMask,
                                      const LocalTensor<C>& src0Local, C src1Local, SELMODE selMode,
                                      uint32_t calCount)
    {
        uint32_t batchCount = 256 / sizeof(C);
        uint32_t tailCount = calCount % batchCount;

        Select(dstLocal, selMask, src0Local, src1Local, selMode, calCount - tailCount);
        if (tailCount != 0)
        {
            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
            uint32_t tailIdx = calCount - tailCount;
            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
            Select(dstLocal[tailIdx], selMask[maskIdx], src0Local[tailIdx], src1Local, selMode,
                   tailCount, 1, repeatParams);
        }
    }

    __aicore__ inline void Threshold(LocalTensor<C>& xLocal, LocalTensor<C>& yLocal, uint32_t len)
    {
        LocalTensor<uint8_t> mask = maskBuffer.Get<uint8_t>();
        Duplicate(yLocal, static_cast<C>(tilingData->thresh), len);
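        // threshType is expected to follow OpenCV's ThresholdTypes values:
        //   0: THRESH_BINARY      dst = (x > thresh) ? maxVal : 0
        //   1: THRESH_BINARY_INV  dst = (x > thresh) ? 0 : maxVal
        //   2: THRESH_TRUNC       dst = (x > thresh) ? thresh : x
        //   3: THRESH_TOZERO      dst = (x > thresh) ? x : 0
        //   4: THRESH_TOZERO_INV  dst = (x > thresh) ? 0 : x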
        switch (tilingData->threshType)
        {
            case 0:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
                Duplicate(yLocal, static_cast<C>(0), len);
                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 1:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
                Duplicate(yLocal, static_cast<C>(0), len);
                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 2:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
                SelectWrap(yLocal, mask, xLocal, static_cast<C>(tilingData->thresh),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 3:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            case 4:
                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
                break;
            default:
                break;
        }
    }

    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inputQueue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outputQueue;
    TBuf<TPosition::VECCALC> InputCastBuffer, outputCastBuffer, maskBuffer;

    GlobalTensor<T> xGM, yGM;
    VectorTiling vecTiling;
    ThresholdOpencvTilingData* tilingData;
};

#define LAUNCH_THRESHOLD_KERNEL(NAME, T, C)                                                      \
    __aicore__ inline void launch_threshold_kernel_##NAME(ThresholdOpencvTilingData* tilingData, \
                                                          GM_ADDR x, GM_ADDR y)                  \
    {                                                                                            \
        KernelThreshold<T, C> op;                                                                \
        op.Init(tilingData, x, y);                                                               \
        op.Run();                                                                                \
    }

LAUNCH_THRESHOLD_KERNEL(CV_8U, uint8_t, half)   // CV_8U
LAUNCH_THRESHOLD_KERNEL(CV_8S, int8_t, half)    // CV_8S
                                                // CV_16U
LAUNCH_THRESHOLD_KERNEL(CV_16S, int16_t, half)  // CV_16S
LAUNCH_THRESHOLD_KERNEL(CV_32S, int32_t, float) // CV_32S
LAUNCH_THRESHOLD_KERNEL(CV_32F, float, float)   // CV_32F
                                                // CV_64F
LAUNCH_THRESHOLD_KERNEL(CV_16F, half, half)     // CV_16F
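// Note: CV_16U and CV_64F kernels are intentionally not instantiated; the
// corresponding dtype cases in threshold_opencv() below fall through to the
// empty default branch.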

#undef LAUNCH_THRESHOLD_KERNEL

#define CALL_THRESHOLD_KERNEL(NAME) launch_threshold_kernel_##NAME

extern "C" __global__ __aicore__ void threshold_opencv(GM_ADDR tilingGM, GM_ADDR x, GM_ADDR y)
{
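    // Copy the tiling parameters from global memory into a local struct,
    // one byte at a time.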
    ThresholdOpencvTilingData tilingData;
    auto tempTilingGM = (__gm__ uint8_t*)tilingGM;
    auto tempTiling = (uint8_t*)&tilingData;
    for (int32_t i = 0; i < sizeof(ThresholdOpencvTilingData) / sizeof(uint8_t);
         ++i, ++tempTilingGM, ++tempTiling)
    {
        *tempTiling = *tempTilingGM;
    }

    // AscendC can only call inline functions here, so function pointers can't
    // be used. Use a macro and a switch-case instead.
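    // dtype is expected to follow the OpenCV depth constants:
    // CV_8U=0, CV_8S=1, CV_16U=2, CV_16S=3, CV_32S=4, CV_32F=5, CV_64F=6, CV_16F=7.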
    switch (tilingData.dtype)
    {
        case 0:
            CALL_THRESHOLD_KERNEL(CV_8U)(&tilingData, x, y);
            break;
        case 1:
            CALL_THRESHOLD_KERNEL(CV_8S)(&tilingData, x, y);
            break;
        case 3:
            CALL_THRESHOLD_KERNEL(CV_16S)(&tilingData, x, y);
            break;
        case 4:
            CALL_THRESHOLD_KERNEL(CV_32S)(&tilingData, x, y);
            break;
        case 5:
            CALL_THRESHOLD_KERNEL(CV_32F)(&tilingData, x, y);
            break;
        case 7:
            CALL_THRESHOLD_KERNEL(CV_16F)(&tilingData, x, y);
            break;
        case 2: case 6: default: // CV_16U, CV_64F
            break;
    }
    // Clear the tiling GM cache manually (workaround for a cce compiler bug).
    dcci(tilingGM, 1);
}