From d9421ac148f3ad4878b3a75c58da58c24f19ed01 Mon Sep 17 00:00:00 2001
From: Junyan721113 <llh721113@outlook.com>
Date: Tue, 28 May 2024 19:25:53 +0800
Subject: [PATCH] Merge pull request #25167 from plctlab:rvp_3rdparty

3rdparty: NDSRVP - A New 3rdparty Library with Optimizations Based on RISC-V P Extension v0.5.2 - Part 1: Basic Functions #25167

# Summary

### Previous context
From PR #24556:

>> * As you wrote, the P-extension differs from RVV thus can not be easily implemented via Universal Intrinsics mechanism, but there is another HAL mechanism for lower-level CPU optimizations which is used by the [Carotene](https://github.com/opencv/opencv/tree/4.x/3rdparty/carotene) library on ARM platforms. I suggest moving all non-dnn code to similar third-party component. For example, FAST algorithm should allow such optimization-shortcut: see https://github.com/opencv/opencv/blob/4.x/modules/features2d/src/hal_replacement.hpp
>>   Reference documentation is here:
>>
>>   * https://docs.opencv.org/4.x/d1/d1b/group__core__hal__interface.html
>>   * https://docs.opencv.org/4.x/dd/d8b/group__imgproc__hal__interface.html
>>   * https://docs.opencv.org/4.x/db/d47/group__features2d__hal__interface.html
>>   * Carotene library is turned on here: https://github.com/opencv/opencv/blob/8bbf08f0de9c387c12afefdb05af7780d989e4c3/CMakeLists.txt#L906-L911

> As a test outside of this PR, a 3rdparty component called ndsrvp was created, containing one piece of the non-dnn code (integral_SIMD), and it works very well.
> All the non-dnn code in this PR has been removed, so this PR can now focus on dnn optimizations.
> This HAL mechanism is quite suitable for rvp optimizations; all the non-dnn code is expected to be moved into ndsrvp soon.

### Progress

#### Part 1 (This PR)

- [Core](https://docs.opencv.org/4.x/d1/d1b/group__core__hal__interface.html)
- [x] Element-wise add and subtract
- [x] Element-wise minimum or maximum
- [x] Element-wise absolute difference
- [x] Bitwise logical operations
- [x] Element-wise compare
- [ImgProc](https://docs.opencv.org/4.x/dd/d8b/group__imgproc__hal__interface.html)
- [x] Integral
- [x] Threshold
- [x] WarpAffine
- [x] WarpPerspective
- [Features2D](https://docs.opencv.org/4.x/db/d47/group__features2d__hal__interface.html)

#### Part 2 (Next PR)

**Rough Estimate. Todo List May Change.**

- [Core](https://docs.opencv.org/4.x/d1/d1b/group__core__hal__interface.html)
- [ImgProc](https://docs.opencv.org/4.x/dd/d8b/group__imgproc__hal__interface.html)
- smaller remap HAL interface
- AdaptiveThreshold
- BoxFilter
- Canny
- Convert
- Filter
- GaussianBlur
- MedianBlur
- Morph
- Pyrdown
- Resize
- Scharr
- SepFilter
- Sobel
- [Features2D](https://docs.opencv.org/4.x/db/d47/group__features2d__hal__interface.html)
- FAST

### Performance Tests

The optimization does not contain floating point operations.

**Absolute Difference**

Geometric mean (ms)

|Name of Test|opencv perf core Absdiff|opencv perf core Absdiff|opencv perf core Absdiff vs opencv perf core Absdiff (x-factor)|
|---|:-:|:-:|:-:|
|Absdiff::OCL_AbsDiffFixture::(640x480, 8UC1)|23.104|5.972|3.87|
|Absdiff::OCL_AbsDiffFixture::(640x480, 32FC1)|39.500|40.830|0.97|
|Absdiff::OCL_AbsDiffFixture::(640x480, 8UC3)|69.155|15.051|4.59|
|Absdiff::OCL_AbsDiffFixture::(640x480, 32FC3)|118.715|120.509|0.99|
|Absdiff::OCL_AbsDiffFixture::(640x480, 8UC4)|93.001|19.770|4.70|
|Absdiff::OCL_AbsDiffFixture::(640x480, 32FC4)|161.136|160.791|1.00|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 8UC1)|69.211|15.140|4.57|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 32FC1)|118.762|119.263|1.00|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 8UC3)|212.414|44.692|4.75|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 32FC3)|367.512|366.569|1.00|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 8UC4)|285.337|59.708|4.78|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 32FC4)|490.395|491.118|1.00|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 8UC1)|158.827|33.462|4.75|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 32FC1)|273.503|273.668|1.00|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 8UC3)|484.175|100.520|4.82|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 32FC3)|828.758|829.689|1.00|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 8UC4)|648.592|137.195|4.73|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 32FC4)|1116.755|1109.587|1.01|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 8UC1)|648.715|134.875|4.81|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 32FC1)|1115.939|1113.818|1.00|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 8UC3)|1944.791|413.420|4.70|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 32FC3)|3354.193|3324.672|1.01|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 8UC4)|2594.585|553.486|4.69|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 32FC4)|4473.543|4438.453|1.01|

**Bitwise Operation**

Geometric mean (ms)

|Name of Test|opencv perf core Bit|opencv perf core Bit|opencv perf core Bit vs opencv perf core Bit (x-factor)|
|---|:-:|:-:|:-:|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 8UC1)|22.542|4.971|4.53|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 32FC1)|90.210|19.917|4.53|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 8UC3)|68.429|15.037|4.55|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 32FC3)|280.168|59.239|4.73|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 8UC4)|90.565|19.735|4.59|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 32FC4)|374.695|79.257|4.73|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 8UC1)|67.824|14.873|4.56|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 32FC1)|279.514|59.232|4.72|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 8UC3)|208.337|44.234|4.71|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 32FC3)|851.211|182.522|4.66|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 8UC4)|279.529|59.095|4.73|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 32FC4)|1132.065|244.877|4.62|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 8UC1)|155.685|33.078|4.71|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 32FC1)|635.253|137.482|4.62|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 8UC3)|474.494|100.166|4.74|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 32FC3)|1907.340|412.841|4.62|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 8UC4)|635.538|134.544|4.72|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 32FC4)|2552.666|556.397|4.59|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 8UC1)|634.736|136.355|4.66|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 32FC1)|2548.283|561.827|4.54|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 8UC3)|1911.454|421.571|4.53|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 32FC3)|7663.803|1677.289|4.57|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 8UC4)|2543.983|562.780|4.52|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 32FC4)|10211.693|2237.393|4.56|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 8UC1)|22.341|4.811|4.64|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 32FC1)|89.975|19.288|4.66|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 8UC3)|67.237|14.643|4.59|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 32FC3)|276.324|58.609|4.71|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 8UC4)|89.587|19.554|4.58|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 32FC4)|370.986|77.136|4.81|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 8UC1)|67.227|14.541|4.62|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 32FC1)|276.357|58.076|4.76|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 8UC3)|206.752|43.376|4.77|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 32FC3)|841.638|177.787|4.73|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 8UC4)|276.773|57.784|4.79|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 32FC4)|1127.740|237.472|4.75|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 8UC1)|153.808|32.531|4.73|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 32FC1)|627.765|129.990|4.83|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 8UC3)|469.799|98.249|4.78|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 32FC3)|1893.591|403.694|4.69|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 8UC4)|627.724|129.962|4.83|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 32FC4)|2529.967|540.744|4.68|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 8UC1)|628.089|130.277|4.82|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 32FC1)|2521.817|540.146|4.67|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 8UC3)|1905.004|404.704|4.71|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 32FC3)|7567.971|1627.898|4.65|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 8UC4)|2531.476|540.181|4.69|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 32FC4)|10075.594|2181.654|4.62|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 8UC1)|22.566|5.076|4.45|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 32FC1)|90.391|19.928|4.54|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 8UC3)|67.758|14.740|4.60|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 32FC3)|279.253|59.844|4.67|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 8UC4)|90.296|19.802|4.56|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 32FC4)|373.972|79.815|4.69|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 8UC1)|67.815|14.865|4.56|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 32FC1)|279.398|60.054|4.65|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 8UC3)|208.643|45.043|4.63|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 32FC3)|850.042|180.985|4.70|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 8UC4)|279.363|60.385|4.63|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 32FC4)|1134.858|243.062|4.67|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 8UC1)|155.212|33.155|4.68|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 32FC1)|634.985|134.911|4.71|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 8UC3)|474.648|100.407|4.73|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 32FC3)|1912.049|414.184|4.62|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 8UC4)|635.252|132.587|4.79|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 32FC4)|2544.471|560.737|4.54|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 8UC1)|634.574|134.966|4.70|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 32FC1)|2545.129|561.498|4.53|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 8UC3)|1910.900|419.365|4.56|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 32FC3)|7662.603|1685.812|4.55|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 8UC4)|2548.971|560.787|4.55|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 32FC4)|10201.407|2237.552|4.56|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 8UC1)|22.718|4.961|4.58|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 32FC1)|91.496|19.831|4.61|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 8UC3)|67.910|15.151|4.48|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 32FC3)|279.612|59.792|4.68|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 8UC4)|91.073|19.853|4.59|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 32FC4)|374.641|79.155|4.73|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 8UC1)|67.704|15.008|4.51|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 32FC1)|279.229|60.088|4.65|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 8UC3)|208.156|44.426|4.69|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 32FC3)|849.501|180.848|4.70|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 8UC4)|279.642|59.728|4.68|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 32FC4)|1129.826|242.880|4.65|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 8UC1)|155.585|33.354|4.66|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 32FC1)|634.090|134.995|4.70|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 8UC3)|474.931|99.598|4.77|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 32FC3)|1910.519|413.138|4.62|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 8UC4)|635.026|135.155|4.70|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 32FC4)|2560.167|560.838|4.56|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 8UC1)|634.893|134.883|4.71|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 32FC1)|2548.166|560.831|4.54|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 8UC3)|1911.392|419.816|4.55|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 32FC3)|7646.634|1677.988|4.56|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 8UC4)|2560.637|560.805|4.57|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 32FC4)|10227.044|2249.458|4.55|

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 3rdparty/ndsrvp/CMakeLists.txt                |  34 ++
 3rdparty/ndsrvp/include/core.hpp              | 532 ++++++++++++++++++
 3rdparty/ndsrvp/include/features2d.hpp        |   8 +
 3rdparty/ndsrvp/include/imgproc.hpp           |  71 +++
 3rdparty/ndsrvp/ndsrvp_hal.hpp                |  15 +
 3rdparty/ndsrvp/src/integral.cpp              | 210 +++++++
 3rdparty/ndsrvp/src/threshold.cpp             | 177 ++++++
 3rdparty/ndsrvp/src/warpAffine.cpp            | 153 +++++
 3rdparty/ndsrvp/src/warpPerspective.cpp       | 159 ++++++
 CMakeLists.txt                                |  17 +
 .../linux/riscv64-andes-gcc.toolchain.cmake   |  15 +
 11 files changed, 1391 insertions(+)
 create mode 100644 3rdparty/ndsrvp/CMakeLists.txt
 create mode 100644 3rdparty/ndsrvp/include/core.hpp
 create mode 100644 3rdparty/ndsrvp/include/features2d.hpp
 create mode 100644 3rdparty/ndsrvp/include/imgproc.hpp
 create mode 100644 3rdparty/ndsrvp/ndsrvp_hal.hpp
 create mode 100644 3rdparty/ndsrvp/src/integral.cpp
 create mode 100644 3rdparty/ndsrvp/src/threshold.cpp
 create mode 100644 3rdparty/ndsrvp/src/warpAffine.cpp
 create mode 100644 3rdparty/ndsrvp/src/warpPerspective.cpp

diff --git a/3rdparty/ndsrvp/CMakeLists.txt b/3rdparty/ndsrvp/CMakeLists.txt
new file mode 100644
index 0000000000..bc9a3a26dc
--- /dev/null
+++ b/3rdparty/ndsrvp/CMakeLists.txt
@@ -0,0 +1,34 @@
+message(STATUS "##########")
+message(STATUS "# NDSRVP #")
+message(STATUS "##########")
+
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
+
+# project setup: ndsrvp_hal is a static library built from every header/source globbed below
+
+set(NDSRVP_INCLUDE_DIR include)
+set(NDSRVP_SOURCE_DIR src)
+
+file(GLOB ndsrvp_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CMAKE_CURRENT_LIST_DIR}/${NDSRVP_INCLUDE_DIR}/*.hpp")
+file(GLOB ndsrvp_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CMAKE_CURRENT_LIST_DIR}/${NDSRVP_SOURCE_DIR}/*.cpp")
+
+# Sources are passed to add_library() directly: a source-less add_library() needs CMake >= 3.11.
+add_library(ndsrvp_hal STATIC ${ndsrvp_headers} ${ndsrvp_sources})
+
+set_target_properties(ndsrvp_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${3P_LIBRARY_OUTPUT_PATH}")
+if(NOT BUILD_SHARED_LIBS)
+  ocv_install_target(ndsrvp_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+endif()
+target_include_directories(ndsrvp_hal PRIVATE
+  "${CMAKE_CURRENT_SOURCE_DIR}"
+  "${PROJECT_SOURCE_DIR}/modules/core/include"
+  "${PROJECT_SOURCE_DIR}/modules/imgproc/include"
+  "${PROJECT_SOURCE_DIR}/modules/features2d/include")
+
+# project info: stored as INTERNAL cache entries so they stay visible outside this directory scope
+
+set(NDSRVP_HAL_FOUND TRUE CACHE INTERNAL "")
+set(NDSRVP_HAL_VERSION "0.0.1" CACHE INTERNAL "")
+set(NDSRVP_HAL_LIBRARIES "ndsrvp_hal" CACHE INTERNAL "")
+set(NDSRVP_HAL_HEADERS "ndsrvp_hal.hpp" CACHE INTERNAL "")
+set(NDSRVP_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "")
diff --git a/3rdparty/ndsrvp/include/core.hpp b/3rdparty/ndsrvp/include/core.hpp
new file mode 100644
index 0000000000..190a1b926b
--- /dev/null
+++ b/3rdparty/ndsrvp/include/core.hpp
@@ -0,0 +1,532 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_NDSRVP_CORE_HPP
+#define OPENCV_NDSRVP_CORE_HPP
+
+namespace cv {
+
+namespace ndsrvp {
+
+// Element-wise binary kernel used by the cv_hal_* overrides defined in this header.
+// Each row is processed nlane elements at a time through operators.vector(); the
+// remaining row tail goes through operators.scalar(). params... is forwarded to both.
+template <typename srctype, typename dsttype,
+    typename vsrctype, typename vdsttype, int nlane,
+    template <typename src, typename dst> class operators_t, // 'class': 'typename' here is C++17-only
+    typename... params_t>
+int elemwise_binop(const srctype* src1_data, size_t src1_step,
+    const srctype* src2_data, size_t src2_step,
+    dsttype* dst_data, size_t dst_step,
+    int width, int height, params_t... params)
+{
+    src1_step /= sizeof(srctype); // steps are divided down to element units for the row pointer math
+    src2_step /= sizeof(srctype);
+    dst_step /= sizeof(dsttype);
+
+    operators_t<srctype, dsttype> operators;
+
+    for (int i = 0; i < height; ++i) {
+        const srctype* src1_row = src1_data + (src1_step * i);
+        const srctype* src2_row = src2_data + (src2_step * i);
+        dsttype* dst_row = dst_data + (dst_step * i);
+
+        int j = 0;
+        for (; j + nlane <= width; j += nlane) {
+            vsrctype vs1 = *(vsrctype*)(src1_row + j); // plain local: 'register' was removed in C++17
+            vsrctype vs2 = *(vsrctype*)(src2_row + j);
+            *(vdsttype*)(dst_row + j) = operators.vector(vs1, vs2, params...);
+        }
+        for (; j < width; j++) // fewer than nlane elements left in this row
+            dst_row[j] = operators.scalar(src1_row[j], src2_row[j], params...);
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+// Element-wise unary kernel: walks the source row by row, handing full groups of nlane
+// elements to operators.vector() and the remaining row tail to operators.scalar().
+// params... is forwarded unchanged to both calls.
+template <typename srctype, typename dsttype,
+    typename vsrctype, typename vdsttype, int nlane,
+    template <typename src, typename dst> class operators_t, // 'class': 'typename' here is C++17-only
+    typename... params_t>
+int elemwise_unop(const srctype* src_data, size_t src_step,
+    dsttype* dst_data, size_t dst_step,
+    int width, int height, params_t... params)
+{
+    src_step /= sizeof(srctype); // steps are divided down to element units for the row pointer math
+    dst_step /= sizeof(dsttype);
+
+    operators_t<srctype, dsttype> operators;
+
+    for (int i = 0; i < height; ++i) {
+        const srctype* src_row = src_data + (src_step * i);
+        dsttype* dst_row = dst_data + (dst_step * i);
+
+        int j = 0;
+        for (; j + nlane <= width; j += nlane) {
+            vsrctype vs = *(vsrctype*)(src_row + j); // plain local: 'register' was removed in C++17
+            *(vdsttype*)(dst_row + j) = operators.vector(vs, params...);
+        }
+        for (; j < width; j++) // fewer than nlane elements left in this row
+            dst_row[j] = operators.scalar(src_row[j], params...);
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+// ################ add ################
+// Add functor (ukadd*/kadd*: saturating adds per the P-extension mnemonics); one vector()/scalar() pair per type.
+template <typename src, typename dst>
+struct operators_add_t {
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return __nds__v_ukadd8(lhs, rhs); }
+    uchar scalar(uchar lhs, uchar rhs) { return __nds__ukadd8(lhs, rhs); }
+
+    int8x8_t vector(int8x8_t lhs, int8x8_t rhs) { return __nds__v_kadd8(lhs, rhs); }
+    schar scalar(schar lhs, schar rhs) { return __nds__kadd8(lhs, rhs); }
+
+    uint16x4_t vector(uint16x4_t lhs, uint16x4_t rhs) { return __nds__v_ukadd16(lhs, rhs); }
+    ushort scalar(ushort lhs, ushort rhs) { return __nds__ukadd16(lhs, rhs); }
+
+    int16x4_t vector(int16x4_t lhs, int16x4_t rhs) { return __nds__v_kadd16(lhs, rhs); }
+    short scalar(short lhs, short rhs) { return __nds__kadd16(lhs, rhs); }
+
+    int32x2_t vector(int32x2_t lhs, int32x2_t rhs) { return __nds__v_kadd32(lhs, rhs); }
+    int scalar(int lhs, int rhs) { return __nds__kadd32(lhs, rhs); }
+};
+
+#undef cv_hal_add8u
+#define cv_hal_add8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add8s
+#define cv_hal_add8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add16u
+#define cv_hal_add16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add16s
+#define cv_hal_add16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add32s
+#define cv_hal_add32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_add_t>)
+
+// ################ sub ################
+// Subtract functor (uksub*/ksub*: saturating subtracts per the P-extension mnemonics); one vector()/scalar() pair per type.
+template <typename src, typename dst>
+struct operators_sub_t {
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return __nds__v_uksub8(lhs, rhs); }
+    uchar scalar(uchar lhs, uchar rhs) { return __nds__uksub8(lhs, rhs); }
+
+    int8x8_t vector(int8x8_t lhs, int8x8_t rhs) { return __nds__v_ksub8(lhs, rhs); }
+    schar scalar(schar lhs, schar rhs) { return __nds__ksub8(lhs, rhs); }
+
+    uint16x4_t vector(uint16x4_t lhs, uint16x4_t rhs) { return __nds__v_uksub16(lhs, rhs); }
+    ushort scalar(ushort lhs, ushort rhs) { return __nds__uksub16(lhs, rhs); }
+
+    int16x4_t vector(int16x4_t lhs, int16x4_t rhs) { return __nds__v_ksub16(lhs, rhs); }
+    short scalar(short lhs, short rhs) { return __nds__ksub16(lhs, rhs); }
+
+    int32x2_t vector(int32x2_t lhs, int32x2_t rhs) { return __nds__v_ksub32(lhs, rhs); }
+    int scalar(int lhs, int rhs) { return __nds__ksub32(lhs, rhs); }
+};
+
+#undef cv_hal_sub8u
+#define cv_hal_sub8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub8s
+#define cv_hal_sub8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub16u
+#define cv_hal_sub16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub16s
+#define cv_hal_sub16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub32s
+#define cv_hal_sub32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_sub_t>)
+
+// ################ max ################
+// Maximum functor (umax* for unsigned, smax* for signed element types); one vector()/scalar() pair per type.
+template <typename src, typename dst>
+struct operators_max_t {
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return __nds__v_umax8(lhs, rhs); }
+    uchar scalar(uchar lhs, uchar rhs) { return __nds__umax8(lhs, rhs); }
+
+    int8x8_t vector(int8x8_t lhs, int8x8_t rhs) { return __nds__v_smax8(lhs, rhs); }
+    schar scalar(schar lhs, schar rhs) { return __nds__smax8(lhs, rhs); }
+
+    uint16x4_t vector(uint16x4_t lhs, uint16x4_t rhs) { return __nds__v_umax16(lhs, rhs); }
+    ushort scalar(ushort lhs, ushort rhs) { return __nds__umax16(lhs, rhs); }
+
+    int16x4_t vector(int16x4_t lhs, int16x4_t rhs) { return __nds__v_smax16(lhs, rhs); }
+    short scalar(short lhs, short rhs) { return __nds__smax16(lhs, rhs); }
+
+    int32x2_t vector(int32x2_t lhs, int32x2_t rhs) { return __nds__v_smax32(lhs, rhs); }
+    int scalar(int lhs, int rhs) { return __nds__smax32(lhs, rhs); }
+};
+
+#undef cv_hal_max8u
+#define cv_hal_max8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max8s
+#define cv_hal_max8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max16u
+#define cv_hal_max16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max16s
+#define cv_hal_max16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max32s
+#define cv_hal_max32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_max_t>)
+
+// ################ min ################
+// Minimum functor (umin* for unsigned, smin* for signed element types); one vector()/scalar() pair per type.
+template <typename src, typename dst>
+struct operators_min_t {
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return __nds__v_umin8(lhs, rhs); }
+    uchar scalar(uchar lhs, uchar rhs) { return __nds__umin8(lhs, rhs); }
+
+    int8x8_t vector(int8x8_t lhs, int8x8_t rhs) { return __nds__v_smin8(lhs, rhs); }
+    schar scalar(schar lhs, schar rhs) { return __nds__smin8(lhs, rhs); }
+
+    uint16x4_t vector(uint16x4_t lhs, uint16x4_t rhs) { return __nds__v_umin16(lhs, rhs); }
+    ushort scalar(ushort lhs, ushort rhs) { return __nds__umin16(lhs, rhs); }
+
+    int16x4_t vector(int16x4_t lhs, int16x4_t rhs) { return __nds__v_smin16(lhs, rhs); }
+    short scalar(short lhs, short rhs) { return __nds__smin16(lhs, rhs); }
+
+    int32x2_t vector(int32x2_t lhs, int32x2_t rhs) { return __nds__v_smin32(lhs, rhs); }
+    int scalar(int lhs, int rhs) { return __nds__smin32(lhs, rhs); }
+};
+
+#undef cv_hal_min8u
+#define cv_hal_min8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min8s
+#define cv_hal_min8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min16u
+#define cv_hal_min16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min16s
+#define cv_hal_min16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min32s
+#define cv_hal_min32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_min_t>)
+
+// ################ absdiff ################
+// Absolute-difference functor: computed as max(lhs, rhs) - min(lhs, rhs) using the *ksub* subtract intrinsics.
+template <typename src, typename dst>
+struct operators_absdiff_t {
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return __nds__v_uksub8(__nds__v_umax8(lhs, rhs), __nds__v_umin8(lhs, rhs)); }
+    uchar scalar(uchar lhs, uchar rhs) { return __nds__uksub8(__nds__umax8(lhs, rhs), __nds__umin8(lhs, rhs)); }
+
+    int8x8_t vector(int8x8_t lhs, int8x8_t rhs) { return __nds__v_ksub8(__nds__v_smax8(lhs, rhs), __nds__v_smin8(lhs, rhs)); }
+    schar scalar(schar lhs, schar rhs) { return __nds__ksub8(__nds__smax8(lhs, rhs), __nds__smin8(lhs, rhs)); }
+
+    uint16x4_t vector(uint16x4_t lhs, uint16x4_t rhs) { return __nds__v_uksub16(__nds__v_umax16(lhs, rhs), __nds__v_umin16(lhs, rhs)); }
+    ushort scalar(ushort lhs, ushort rhs) { return __nds__uksub16(__nds__umax16(lhs, rhs), __nds__umin16(lhs, rhs)); }
+
+    int16x4_t vector(int16x4_t lhs, int16x4_t rhs) { return __nds__v_ksub16(__nds__v_smax16(lhs, rhs), __nds__v_smin16(lhs, rhs)); }
+    short scalar(short lhs, short rhs) { return __nds__ksub16(__nds__smax16(lhs, rhs), __nds__smin16(lhs, rhs)); }
+
+    int32x2_t vector(int32x2_t lhs, int32x2_t rhs) { return __nds__v_ksub32(__nds__v_smax32(lhs, rhs), __nds__v_smin32(lhs, rhs)); }
+    int scalar(int lhs, int rhs) { return __nds__ksub32(__nds__smax32(lhs, rhs), __nds__smin32(lhs, rhs)); }
+};
+
+#undef cv_hal_absdiff8u
+#define cv_hal_absdiff8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff8s
+#define cv_hal_absdiff8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff16u
+#define cv_hal_absdiff16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff16s
+#define cv_hal_absdiff16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff32s
+#define cv_hal_absdiff32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_absdiff_t>)
+
+// ################ bitwise ################
+
+template <typename src, typename dst>
+struct operators_and_t { // bitwise AND: vector() on a packed uint8x8_t, scalar() on a single uchar
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return lhs & rhs; }
+    uchar scalar(uchar lhs, uchar rhs) { return lhs & rhs; }
+};
+
+#undef cv_hal_and8u
+#define cv_hal_and8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_and_t>)
+
+template <typename src, typename dst>
+struct operators_or_t { // bitwise OR: vector() on a packed uint8x8_t, scalar() on a single uchar
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return lhs | rhs; }
+    uchar scalar(uchar lhs, uchar rhs) { return lhs | rhs; }
+};
+
+#undef cv_hal_or8u
+#define cv_hal_or8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_or_t>)
+
+template <typename src, typename dst>
+struct operators_xor_t { // bitwise XOR: vector() on a packed uint8x8_t, scalar() on a single uchar
+    uint8x8_t vector(uint8x8_t lhs, uint8x8_t rhs) { return lhs ^ rhs; }
+    uchar scalar(uchar lhs, uchar rhs) { return lhs ^ rhs; }
+};
+
+#undef cv_hal_xor8u
+#define cv_hal_xor8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_xor_t>)
+
+template <typename src, typename dst>
+struct operators_not_t { // bitwise NOT (unary): vector() on a packed uint8x8_t, scalar() on a single uchar
+    uint8x8_t vector(uint8x8_t value) { return ~value; }
+    uchar scalar(uchar value) { return ~value; }
+};
+
+#undef cv_hal_not8u
+#define cv_hal_not8u (cv::ndsrvp::elemwise_unop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_not_t>)
+
+// ################ cmp ################
+
+template <typename src, typename dst>
+struct operators_cmp_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__v_ucmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__v_ucmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__v_ucmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__v_ucmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__v_ucmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__v_ucmpeq8(a, b);
+        default:
+            return uint8x8_t();
+        }
+    }
+    inline uchar scalar(uchar a, uchar b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__cmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__ucmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__ucmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__ucmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__ucmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__cmpeq8(a, b);
+        default:
+            return 0;
+        }
+    }
+
+    inline uint8x8_t vector(int8x8_t a, int8x8_t b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__v_scmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__v_scmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__v_scmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__v_scmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__v_scmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__v_scmpeq8(a, b);
+        default:
+            return uint8x8_t();
+        }
+    }
+    inline uchar scalar(schar a, schar b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__cmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__scmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__scmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__scmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__scmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__cmpeq8(a, b);
+        default:
+            return 0;
+        }
+    }
+
+    inline uint8x4_t vector(uint16x4_t a, uint16x4_t b, int operation)
+    {
+        register unsigned long cmp;
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            cmp = (unsigned long)__nds__v_ucmpeq16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_GT:
+            cmp = (unsigned long)__nds__v_ucmplt16(b, a) >> 8;
+            break;
+        case CV_HAL_CMP_GE:
+            cmp = (unsigned long)__nds__v_ucmple16(b, a) >> 8;
+            break;
+        case CV_HAL_CMP_LT:
+            cmp = (unsigned long)__nds__v_ucmplt16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_LE:
+            cmp = (unsigned long)__nds__v_ucmple16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_NE:
+            cmp = ~(unsigned long)__nds__v_ucmpeq16(a, b) >> 8;
+            break;
+        default:
+            return uint8x4_t();
+        }
+        return (uint8x4_t)(unsigned int)__nds__pkbb16(cmp >> 32, cmp);
+    }
+    // Scalar fallback for one pair of unsigned 16-bit values; the uchar return keeps the low byte of the compare result.
+    inline uchar scalar(ushort a, ushort b, int operation)
+    {
+        unsigned long mask = 0; // stays 0 for an unrecognised operation
+        if (operation == CV_HAL_CMP_EQ)
+            mask = __nds__cmpeq16(a, b);
+        else if (operation == CV_HAL_CMP_GT)
+            mask = __nds__ucmplt16(b, a); // a > b is evaluated as b < a
+        else if (operation == CV_HAL_CMP_GE)
+            mask = __nds__ucmple16(b, a); // a >= b is evaluated as b <= a
+        else if (operation == CV_HAL_CMP_LT)
+            mask = __nds__ucmplt16(a, b);
+        else if (operation == CV_HAL_CMP_LE)
+            mask = __nds__ucmple16(a, b);
+        else if (operation == CV_HAL_CMP_NE)
+            mask = ~__nds__cmpeq16(a, b); // NE is the bitwise complement of EQ
+        // The conversion to uchar discards everything above bit 7.
+        return (uchar)mask;
+    }
+
+    // Lane-wise compare of four signed 16-bit values, narrowed to one result byte per lane.
+    inline uint8x4_t vector(int16x4_t a, int16x4_t b, int operation)
+    {
+        unsigned long cmp; // plain local: `register` is deprecated in C++11 and removed in C++17
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            cmp = (unsigned long)__nds__v_scmpeq16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_GT:
+            cmp = (unsigned long)__nds__v_scmplt16(b, a) >> 8; // a > b is evaluated as b < a
+            break;
+        case CV_HAL_CMP_GE:
+            cmp = (unsigned long)__nds__v_scmple16(b, a) >> 8; // a >= b is evaluated as b <= a
+            break;
+        case CV_HAL_CMP_LT:
+            cmp = (unsigned long)__nds__v_scmplt16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_LE:
+            cmp = (unsigned long)__nds__v_scmple16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_NE:
+            cmp = ~(unsigned long)__nds__v_scmpeq16(a, b) >> 8; // the complement is applied before the shift
+            break;
+        default: return uint8x4_t(); // unknown operation: value-initialised result
+        }
+        return (uint8x4_t)(unsigned int)__nds__pkbb16(cmp >> 32, cmp); // NOTE(review): assumes pkbb16 gathers the low halfwords, leaving one mask byte per lane - confirm against the P-extension spec
+    }
+    // Scalar fallback for one pair of signed 16-bit values; the uchar return keeps the low byte of the compare result.
+    inline uchar scalar(short a, short b, int operation)
+    {
+        unsigned long mask = 0; // stays 0 for an unrecognised operation
+        if (operation == CV_HAL_CMP_EQ)
+            mask = __nds__cmpeq16(a, b);
+        else if (operation == CV_HAL_CMP_GT)
+            mask = __nds__scmplt16(b, a); // a > b is evaluated as b < a
+        else if (operation == CV_HAL_CMP_GE)
+            mask = __nds__scmple16(b, a); // a >= b is evaluated as b <= a
+        else if (operation == CV_HAL_CMP_LT)
+            mask = __nds__scmplt16(a, b);
+        else if (operation == CV_HAL_CMP_LE)
+            mask = __nds__scmple16(a, b);
+        else if (operation == CV_HAL_CMP_NE)
+            mask = ~__nds__cmpeq16(a, b); // NE is the bitwise complement of EQ
+        // The conversion to uchar discards everything above bit 7.
+        return (uchar)mask;
+    }
+};
+
+#undef cv_hal_cmp8u // drop any earlier definition, then point the 8-bit unsigned compare hook at operators_cmp_t
+#define cv_hal_cmp8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_cmp_t>)
+
+#undef cv_hal_cmp8s // 8-bit signed input; NOTE(review): template arguments look like <src, dst, src vector, dst vector, lanes, operators> - confirm against elemwise_binop
+#define cv_hal_cmp8s (cv::ndsrvp::elemwise_binop<schar, uchar, int8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_cmp_t>)
+
+#undef cv_hal_cmp16u // 16-bit unsigned input, four lanes per step, result narrowed to uchar
+#define cv_hal_cmp16u (cv::ndsrvp::elemwise_binop<ushort, uchar, uint16x4_t, uint8x4_t, 4, cv::ndsrvp::operators_cmp_t>)
+
+#undef cv_hal_cmp16s // 16-bit signed input, four lanes per step, result narrowed to uchar
+#define cv_hal_cmp16s (cv::ndsrvp::elemwise_binop<short, uchar, int16x4_t, uint8x4_t, 4, cv::ndsrvp::operators_cmp_t>)
+
+// ################ split ################
+
+/*template <typename srctype, typename vsrctype, int nlane>
+int split(const srctype* src_data, srctype** dst_data, int len, int cn)
+{
+    int i, j;
+    for (i = 0; i < len; i++) {
+        for (j = 0; j < cn; j++) {
+            dst_data[j][i] = src_data[i * cn + j];
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+#undef cv_hal_split8u
+#define cv_hal_split8u (cv::ndsrvp::split<uchar, uint8x8_t, 8>)
+
+#undef cv_hal_split16u
+#define cv_hal_split16u (cv::ndsrvp::split<ushort, uint16x4_t, 4>)
+
+#undef cv_hal_split32s
+#define cv_hal_split32s (cv::ndsrvp::split<int, int32x2_t, 2>)*/
+
+// ################ merge ################
+
+/*template <typename srctype, typename vsrctype, int nlane>
+int merge(const srctype** src_data, srctype* dst_data, int len, int cn)
+{
+    int i, j;
+    for (i = 0; i < len; i++) {
+        for (j = 0; j < cn; j++) {
+            dst_data[i * cn + j] = src_data[j][i];
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+#undef cv_hal_merge8u
+#define cv_hal_merge8u (cv::ndsrvp::merge<uchar, uint8x8_t, 8>)
+
+#undef cv_hal_merge16u
+#define cv_hal_merge16u (cv::ndsrvp::merge<ushort, uint16x4_t, 4>)
+
+#undef cv_hal_merge32s
+#define cv_hal_merge32s (cv::ndsrvp::merge<int, int32x2_t, 2>)*/
+
+} // namespace ndsrvp
+
+} // namespace cv
+
+#endif
diff --git a/3rdparty/ndsrvp/include/features2d.hpp b/3rdparty/ndsrvp/include/features2d.hpp
new file mode 100644
index 0000000000..1f6180a795
--- /dev/null
+++ b/3rdparty/ndsrvp/include/features2d.hpp
@@ -0,0 +1,8 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_FEATURES2D_HPP
+#define OPENCV_NDSRVP_FEATURES2D_HPP
+
+#endif
diff --git a/3rdparty/ndsrvp/include/imgproc.hpp b/3rdparty/ndsrvp/include/imgproc.hpp
new file mode 100644
index 0000000000..3a572172a8
--- /dev/null
+++ b/3rdparty/ndsrvp/include/imgproc.hpp
@@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_IMGPROC_HPP
+#define OPENCV_NDSRVP_IMGPROC_HPP
+
+namespace cv {
+
+// ################ remap ################
+// Forward declaration of cv::remap; src/warpAffine.cpp calls it for every block it maps.
+void remap(InputArray _src, OutputArray _dst,
+    InputArray _map1, InputArray _map2,
+    int interpolation, int borderType, const Scalar& borderValue);
+
+namespace ndsrvp {
+// Constants for the fixed-point coordinate maps built by the warp functions.
+enum InterpolationMasks {
+    INTER_BITS = 5, // fractional bits kept per coordinate
+    INTER_BITS2 = INTER_BITS * 2,
+    INTER_TAB_SIZE = 1 << INTER_BITS, // 32 sub-pixel positions per axis
+    INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE
+};
+
+// ################ integral ################
+// Integral image; src/integral.cpp only accepts depth == CV_8U with sdepth == CV_32S and declines everything else.
+int integral(int depth, int sdepth, int sqdepth,
+    const uchar* src, size_t _srcstep,
+    uchar* sum, size_t _sumstep,
+    uchar* sqsum, size_t,
+    uchar* tilted, size_t,
+    int width, int height, int cn);
+
+#undef cv_hal_integral // replace any earlier definition of the hook with the function above
+#define cv_hal_integral (cv::ndsrvp::integral)
+
+// ################ warpAffine ################
+// Affine warp; M holds the six coefficients of the mapping, borderValue one value per channel (up to four).
+int warpAffine(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[6], int interpolation, int borderType, const double borderValue[4]);
+
+#undef cv_hal_warpAffine // replace any earlier definition of the hook with the function above
+#define cv_hal_warpAffine (cv::ndsrvp::warpAffine)
+
+// ################ warpPerspective ################
+// Perspective warp; same parameter layout as warpAffine except that M holds nine coefficients.
+int warpPerspective(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[9], int interpolation, int borderType, const double borderValue[4]);
+
+#undef cv_hal_warpPerspective // replace any earlier definition of the hook with the function above
+#define cv_hal_warpPerspective (cv::ndsrvp::warpPerspective)
+
+// ################ threshold ################
+// Fixed-level threshold; src/threshold.cpp handles CV_8U, CV_16S and CV_16U and declines images whose width and height are both <= 255.
+int threshold(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int depth, int cn,
+    double thresh, double maxValue, int thresholdType);
+
+#undef cv_hal_threshold // replace any earlier definition of the hook with the function above
+#define cv_hal_threshold (cv::ndsrvp::threshold)
+
+} // namespace ndsrvp
+
+} // namespace cv
+
+#endif
diff --git a/3rdparty/ndsrvp/ndsrvp_hal.hpp b/3rdparty/ndsrvp/ndsrvp_hal.hpp
new file mode 100644
index 0000000000..7f12636520
--- /dev/null
+++ b/3rdparty/ndsrvp/ndsrvp_hal.hpp
@@ -0,0 +1,15 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_HAL_HPP
+#define OPENCV_NDSRVP_HAL_HPP
+
+#include "opencv2/core/mat.hpp"
+#include <nds_intrinsic.h>
+
+#include "include/core.hpp"
+#include "include/imgproc.hpp"
+#include "include/features2d.hpp"
+
+#endif
diff --git a/3rdparty/ndsrvp/src/integral.cpp b/3rdparty/ndsrvp/src/integral.cpp
new file mode 100644
index 0000000000..37030a8d4c
--- /dev/null
+++ b/3rdparty/ndsrvp/src/integral.cpp
@@ -0,0 +1,210 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "ndsrvp_hal.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// HAL integral image for 8-bit input and 32-bit integer sums with 1, 2 or 4 channels.
+// Squared/tilted sums, other depths and other channel counts return CV_HAL_ERROR_NOT_IMPLEMENTED.
+int integral(int depth, int sdepth, int sqdepth,
+    const uchar* src, size_t _srcstep, uchar* _sum, size_t _sumstep,
+    uchar* _sqsum, size_t, uchar* _tilted, size_t,
+    int width, int height, int cn)
+{
+    // 8-bit unsigned integer, 32-bit signed integer only
+    if (!(depth == CV_8U && sdepth == CV_32S))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // too small image: declined unless width or height has bits above the low 8, or cn == 4
+    if (!(width >> 8 || height >> 8 || cn == 4))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    int* sum = (int*)_sum;
+    double* sqsum = (double*)_sqsum;
+    int* tilted = (int*)_tilted;
+    // decline before the output is touched: sqsum/tilted are unsupported and cn == 3 has no kernel below
+    if (sqsum || tilted || cn == 3 || cn > 4)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    (void)sqdepth; // unused; a cast to void replaces the former self-assignment
+    width *= cn; // from here on width counts interleaved elements per row
+
+    memset(sum, 0, (width + cn) * sizeof(int)); // first row of the sum image is all zeros
+
+    if (cn == 1) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = 0; // left border element of the output row
+
+            int32x2_t prev = { 0, 0 }; // running total of this source row, same value in both lanes
+            int j = 0;
+            // eight source bytes per iteration, processed as two groups of four
+            for (; j + 8 <= width; j += 8) {
+                unsigned long vs8x8 = *(unsigned long*)(src_row + j);
+
+                unsigned long vs810 = __nds__zunpkd810(vs8x8);
+                unsigned long vs832 = __nds__zunpkd832(vs8x8);
+
+                int16x4_t vs16x4 = (int16x4_t)__nds__pkbb32(vs832, vs810);
+                // in-register prefix sum over the four 16-bit lanes
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 16); // gcc vector extension
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 32); // '+' is add16
+
+                //*(int32x2_t*)(sum_row + j) = (int32x2_t) { vs16x4[0], vs16x4[1] } + *(int32x2_t*)(prev_sum_row + j) + prev;
+                //*(int32x2_t*)(sum_row + j + 2) = (int32x2_t) { vs16x4[2], vs16x4[3] } + *(int32x2_t*)(prev_sum_row + j + 2) + prev;
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                sum_row[j] = prev_sum_row[j] + prev[0] + vs16x4[0];
+                sum_row[j + 1] = prev_sum_row[j + 1] + prev[1] + vs16x4[1];
+                sum_row[j + 2] = prev_sum_row[j + 2] + prev[0] + vs16x4[2];
+                sum_row[j + 3] = prev_sum_row[j + 3] + prev[1] + vs16x4[3];
+
+                prev += vs16x4[3]; // prev += (int32x2_t){vs16x4[3], vs16x4[3]};
+
+                vs16x4 = (int16x4_t)__nds__pktt32(vs832, vs810);
+
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 16);
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 32);
+
+                //*(int32x2_t*)(sum_row + j + 4) = (int32x2_t) { vs16x4[0], vs16x4[1] } + *(int32x2_t*)(prev_sum_row + j + 4) + prev;
+                //*(int32x2_t*)(sum_row + j + 6) = (int32x2_t) { vs16x4[2], vs16x4[3] } + *(int32x2_t*)(prev_sum_row + j + 6) + prev;
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                sum_row[j + 4] = prev_sum_row[j + 4] + prev[0] + vs16x4[0];
+                sum_row[j + 5] = prev_sum_row[j + 5] + prev[1] + vs16x4[1];
+                sum_row[j + 6] = prev_sum_row[j + 6] + prev[0] + vs16x4[2];
+                sum_row[j + 7] = prev_sum_row[j + 7] + prev[1] + vs16x4[3];
+
+                prev += vs16x4[3];
+            }
+            // scalar tail: v restarts from the row total already reflected in the previous output element
+            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
+                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
+        }
+    } else if (cn == 2) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = sum_row[-2] = 0;
+
+            int32x2_t prev = { 0, 0 }; // running totals of this source row, one lane per channel
+            int j = 0;
+            for (; j + 8 <= width; j += 8) {
+                uint8x8_t vs8x8 = *(uint8x8_t*)(src_row + j);
+
+                uint16x4_t vs16x4_1 = __nds__v_zunpkd820(vs8x8); // feeds the even output slots below
+                uint16x4_t vs16x4_2 = __nds__v_zunpkd831(vs8x8); // feeds the odd output slots below
+
+                vs16x4_1 += (int16x4_t)((unsigned long)vs16x4_1 << 16);
+                vs16x4_1 += (int16x4_t)((unsigned long)vs16x4_1 << 32);
+
+                vs16x4_2 += (int16x4_t)((unsigned long)vs16x4_2 << 16);
+                vs16x4_2 += (int16x4_t)((unsigned long)vs16x4_2 << 32);
+
+                *(int32x2_t*)(sum_row + j) = (int32x2_t) { vs16x4_1[0], vs16x4_2[0] } + *(int32x2_t*)(prev_sum_row + j) + prev;
+                *(int32x2_t*)(sum_row + j + 2) = (int32x2_t) { vs16x4_1[1], vs16x4_2[1] } + *(int32x2_t*)(prev_sum_row + j + 2) + prev;
+                *(int32x2_t*)(sum_row + j + 2 * 2) = (int32x2_t) { vs16x4_1[2], vs16x4_2[2] } + *(int32x2_t*)(prev_sum_row + j + 2 * 2) + prev;
+                *(int32x2_t*)(sum_row + j + 2 * 3) = (int32x2_t) { vs16x4_1[3], vs16x4_2[3] } + *(int32x2_t*)(prev_sum_row + j + 2 * 3) + prev;
+
+                prev += (int32x2_t) { vs16x4_1[3], vs16x4_2[3] };
+            }
+            // scalar tail, one pixel (two elements) per step
+            for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1],
+                     v1 = sum_row[j - 2] - prev_sum_row[j - 2];
+                 j < width; j += 2) {
+                sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
+                sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
+            }
+        }
+    } else if (cn == 3) { // not reached: cn == 3 is declined above; the draft below is kept for reference
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+        /* disabled because of unaligned memory access, difficulty in vectorization, etc.
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
+
+            int32x2_t prev_ptr[2] = { { 0, 0 }, { 0, 0 } };
+            int j = 0;
+            for (; j + 3 <= width; j += 3) {
+                //uint8x4_t vs8x4 = *(uint8x4_t*)(src_row + j);
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                uint8x4_t vs8x4 = (uint8x4_t){ src_row[j], src_row[j + 1], src_row[j + 2], 0};
+
+                // [ 0 | 2 | 1 | 3 ]
+                int16x4_t vs16x4 = (int16x4_t)__nds__pkbb32(__nds__zunpkd831((unsigned int)vs8x4), __nds__zunpkd820((unsigned int)vs8x4));
+
+                // [ b | t | b | t ]
+                prev_ptr[0] += (int32x2_t)__nds__pkbb16(0, (unsigned long)vs16x4);
+                prev_ptr[1] += (int32x2_t)__nds__pktt16(0, (unsigned long)vs16x4);
+
+                //*(int32x4_t*)(sum_row + j) = *(int32x4_t*)(prev_sum_row + j) + *(int32x4_t*)prev_ptr;
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                sum_row[j] = prev_sum_row[j] + prev_ptr[0][0];
+                sum_row[j + 1] = prev_sum_row[j + 1] + prev_ptr[0][1];
+                sum_row[j + 2] = prev_sum_row[j + 2] + prev_ptr[1][0];
+            }
+
+            for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1],
+                     v2 = sum_row[j - 2] - prev_sum_row[j - 2],
+                     v1 = sum_row[j - 3] - prev_sum_row[j - 3];
+                 j < width; j += 3) {
+                sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
+                sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
+                sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
+            }
+        }*/
+    } else if (cn == 4) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0;
+
+            int32x2_t prev_ptr[2] = { { 0, 0 }, { 0, 0 } }; // four running channel totals, read back as one int32x4_t
+            int j = 0;
+            for (; j + 4 <= width; j += 4) {
+                uint8x4_t vs8x4 = *(uint8x4_t*)(src_row + j);
+
+                // [ 0 | 2 | 1 | 3 ]
+                int16x4_t vs16x4 = (int16x4_t)__nds__pkbb32(__nds__zunpkd831((unsigned int)vs8x4), __nds__zunpkd820((unsigned int)vs8x4));
+
+                // [ b | t | b | t ]
+                prev_ptr[0] += (int32x2_t)__nds__pkbb16(0, (unsigned long)vs16x4);
+                prev_ptr[1] += (int32x2_t)__nds__pktt16(0, (unsigned long)vs16x4);
+
+                *(int32x4_t*)(sum_row + j) = *(int32x4_t*)(prev_sum_row + j) + *(int32x4_t*)prev_ptr;
+            }
+            // scalar tail, one pixel (four elements) per step
+            for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1],
+                     v3 = sum_row[j - 2] - prev_sum_row[j - 2],
+                     v2 = sum_row[j - 3] - prev_sum_row[j - 3],
+                     v1 = sum_row[j - 4] - prev_sum_row[j - 4];
+                 j < width; j += 4) {
+                sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
+                sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
+                sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
+                sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3];
+            }
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/threshold.cpp b/3rdparty/ndsrvp/src/threshold.cpp
new file mode 100644
index 0000000000..06de591fef
--- /dev/null
+++ b/3rdparty/ndsrvp/src/threshold.cpp
@@ -0,0 +1,177 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// Interface implemented by every threshold mode. `vector` transforms one SIMD register
+// worth of pixels, `scalar` transforms a single pixel. Both take (src, thresh, maxval).
+// The defaults here ignore their arguments and raise StsBadArg through CV_Error.
+template <typename type, typename vtype>
+class operators_threshold_t {
+public:
+    virtual ~operators_threshold_t() {};
+    // SIMD hook: each argument carries one value per lane.
+    virtual inline vtype vector(const vtype& /*src*/, const vtype& /*thresh*/, const vtype& /*maxval*/)
+    {
+        CV_Error(cv::Error::StsBadArg, "");
+        return vtype(); // value-initialised result after the error report
+    }
+    // Scalar hook: same argument order as `vector`, one element at a time.
+    virtual inline type scalar(const type& /*src*/, const type& /*thresh*/, const type& /*maxval*/)
+    {
+        CV_Error(cv::Error::StsBadArg, "");
+        return type(); // value-initialised result after the error report
+    }
+    // Concrete modes override both hooks.
+};
+
+// CV_HAL_THRESH_BINARY operator: maxval where src > thresh, 0 elsewhere.
+template <typename type, typename vtype>
+class opThreshBinary : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+    {
+        const auto above = src > thresh; // per-lane compare result, used as the bit-select mask
+        return (vtype)__nds__bpick((long)maxval, (long)0, (long)above);
+    }
+    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    { return (src > thresh) ? maxval : 0; }
+};
+
+// CV_HAL_THRESH_BINARY_INV operator: 0 where src > thresh, maxval elsewhere.
+template <typename type, typename vtype>
+class opThreshBinaryInv : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+    {
+        const auto above = src > thresh; // per-lane compare result, used as the bit-select mask
+        return (vtype)__nds__bpick((long)0, (long)maxval, (long)above);
+    }
+    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    { return (src > thresh) ? 0 : maxval; }
+};
+
+// CV_HAL_THRESH_TRUNC operator: thresh where src > thresh, src elsewhere; maxval is not used.
+template <typename type, typename vtype>
+class opThreshTrunc : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& /*maxval*/) override
+    {
+        const auto above = src > thresh; // per-lane compare result, used as the bit-select mask
+        return (vtype)__nds__bpick((long)thresh, (long)src, (long)above);
+    }
+    inline type scalar(const type& src, const type& thresh, const type& /*maxval*/) override
+    {
+        return (src > thresh) ? thresh : src;
+    }
+};
+
+// CV_HAL_THRESH_TOZERO operator: src where src > thresh, 0 elsewhere; maxval is not used.
+template <typename type, typename vtype>
+class opThreshToZero : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& /*maxval*/) override
+    {
+        const auto above = src > thresh; // per-lane compare result, used as the bit-select mask
+        return (vtype)__nds__bpick((long)src, (long)0, (long)above);
+    }
+    inline type scalar(const type& src, const type& thresh, const type& /*maxval*/) override
+    {
+        return (src > thresh) ? src : 0;
+    }
+};
+
+// CV_HAL_THRESH_TOZERO_INV operator: 0 where src > thresh, src elsewhere; maxval is not used.
+template <typename type, typename vtype>
+class opThreshToZeroInv : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& /*maxval*/) override
+    {
+        const auto above = src > thresh; // per-lane compare result, used as the bit-select mask
+        return (vtype)__nds__bpick((long)0, (long)src, (long)above);
+    }
+    inline type scalar(const type& src, const type& thresh, const type& /*maxval*/) override
+    {
+        return (src > thresh) ? 0 : src;
+    }
+};
+
+// Applies one threshold mode to `height` rows of `width * cn` elements, `nlane` elements per SIMD
+// step plus a scalar tail per row. Steps are given in bytes. An unknown thtype raises StsBadArg.
+template <typename type, typename vtype, int nlane>
+static void threshold_op(const type* src_data, size_t src_step,
+    type* dst_data, size_t dst_step,
+    int width, int height, int cn,
+    type thresh, type maxval, int thtype)
+{
+    width *= cn; // channels are interleaved, so every element is treated alike
+    src_step /= sizeof(type); // byte steps -> element steps for the pointer arithmetic below
+    dst_step /= sizeof(type);
+
+    // Broadcast the scalar parameters into every SIMD lane.
+    vtype vthresh;
+    vtype vmaxval;
+    for (int lane = 0; lane < nlane; lane++) {
+        vthresh[lane] = thresh;
+        vmaxval[lane] = maxval;
+    }
+
+    // One operator of each kind lives on the stack and is selected by address, so no
+    // new/delete pair is needed: there is no allocation that can fail and nothing to free.
+    opThreshBinary<type, vtype> opBinary;
+    opThreshBinaryInv<type, vtype> opBinaryInv;
+    opThreshTrunc<type, vtype> opTrunc;
+    opThreshToZero<type, vtype> opToZero;
+    opThreshToZeroInv<type, vtype> opToZeroInv;
+    operators_threshold_t<type, vtype>* op;
+    switch (thtype) {
+    case CV_HAL_THRESH_BINARY: op = &opBinary; break;
+    case CV_HAL_THRESH_BINARY_INV: op = &opBinaryInv; break;
+    case CV_HAL_THRESH_TRUNC: op = &opTrunc; break;
+    case CV_HAL_THRESH_TOZERO: op = &opToZero; break;
+    case CV_HAL_THRESH_TOZERO_INV: op = &opToZeroInv; break;
+    default:
+        CV_Error(cv::Error::StsBadArg, "");
+        return;
+    }
+
+    for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) {
+        int j = 0;
+        // Full SIMD steps: load nlane elements, transform, store.
+        for (; j <= width - nlane; j += nlane) {
+            vtype vs = *(vtype*)(src_data + j);
+            *(vtype*)(dst_data + j) = op->vector(vs, vthresh, vmaxval);
+        }
+        // Elements of the row that do not fill a whole SIMD step.
+        for (; j < width; j++) {
+            dst_data[j] = op->scalar(src_data[j], thresh, maxval);
+        }
+    }
+}
+
+// HAL threshold entry point: picks the threshold_op instantiation for the depth; other depths are not implemented.
+int threshold(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int depth, int cn,
+    double thresh, double maxValue, int thresholdType)
+{
+    if (!(width > 255 || height > 255)) // slower at small size
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    switch (depth) {
+    case CV_8U:
+        threshold_op<uchar, uint8x8_t, 8>((uchar*)src_data, src_step, (uchar*)dst_data, dst_step, width, height, cn, (uchar)thresh, (uchar)maxValue, thresholdType);
+        return CV_HAL_ERROR_OK;
+    case CV_16S:
+        threshold_op<short, int16x4_t, 4>((short*)src_data, src_step, (short*)dst_data, dst_step, width, height, cn, (short)thresh, (short)maxValue, thresholdType);
+        return CV_HAL_ERROR_OK;
+    case CV_16U:
+        threshold_op<ushort, uint16x4_t, 4>((ushort*)src_data, src_step, (ushort*)dst_data, dst_step, width, height, cn, (ushort)thresh, (ushort)maxValue, thresholdType);
+        return CV_HAL_ERROR_OK;
+    default: return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/warpAffine.cpp b/3rdparty/ndsrvp/src/warpAffine.cpp
new file mode 100644
index 0000000000..d54e4dc237
--- /dev/null
+++ b/3rdparty/ndsrvp/src/warpAffine.cpp
@@ -0,0 +1,153 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// Parallel body of warpAffine: builds fixed-point coordinate maps block by block and hands each block to remap().
+class WarpAffineInvoker : public ParallelLoopBody {
+public:
+    WarpAffineInvoker(const Mat& _src, Mat& _dst, int _interpolation, int _borderType,
+        const Scalar& _borderValue, int* _adelta, int* _bdelta, const double* _M)
+        : ParallelLoopBody()
+        , src(_src)
+        , dst(_dst)
+        , interpolation(_interpolation)
+        , borderType(_borderType)
+        , borderValue(_borderValue)
+        , adelta(_adelta)
+        , bdelta(_bdelta)
+        , M(_M)
+    { }
+
+    virtual void operator()(const Range& range) const CV_OVERRIDE
+    {
+        const int BLOCK_SZ = 64;
+        AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
+        short *XY = __XY.data(), *A = __A.data();
+        const int AB_BITS = MAX(10, (int)INTER_BITS);
+        const int AB_SCALE = 1 << AB_BITS;
+        int round_delta = interpolation == CV_HAL_INTER_NEAREST ? AB_SCALE / 2 : AB_SCALE / INTER_TAB_SIZE / 2, x, y, x1, y1;
+        // block geometry: bw0 * bh0 never exceeds BLOCK_SZ * BLOCK_SZ, the capacity of XY (pairs) and A
+        int bh0 = std::min(BLOCK_SZ / 2, dst.rows);
+        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, dst.cols);
+        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, dst.rows);
+
+        for (y = range.start; y < range.end; y += bh0) {
+            for (x = 0; x < dst.cols; x += bw0) {
+                int bw = std::min(bw0, dst.cols - x);
+                int bh = std::min(bh0, range.end - y);
+
+                Mat _XY(bh, bw, CV_16SC2, XY);
+                Mat dpart(dst, Rect(x, y, bw, bh));
+
+                for (y1 = 0; y1 < bh; y1++) {
+                    short* xy = XY + y1 * bw * 2;
+                    int X0 = saturate_cast<int>((M[1] * (y + y1) + M[2]) * AB_SCALE) + round_delta;
+                    int Y0 = saturate_cast<int>((M[4] * (y + y1) + M[5]) * AB_SCALE) + round_delta;
+
+                    if (interpolation == CV_HAL_INTER_NEAREST) {
+                        x1 = 0;
+                        // two pixels per step; the bound keeps x1 + 1 inside the block, so an odd bw ends in the scalar loop
+                        for (; x1 + 2 <= bw; x1 += 2) {
+                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
+                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
+
+                            vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
+                            vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
+
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
+                        }
+                        // scalar tail: same arithmetic for a leftover pixel
+                        for (; x1 < bw; x1++) {
+                            int X = (X0 + adelta[x + x1]) >> AB_BITS;
+                            int Y = (Y0 + bdelta[x + x1]) >> AB_BITS;
+                            xy[x1 * 2] = saturate_cast<short>(X);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+                        }
+                    } else {
+                        short* alpha = A + y1 * bw;
+                        x1 = 0;
+
+                        const int INTER_MASK = INTER_TAB_SIZE - 1;
+                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+                        for (; x1 + 2 <= bw; x1 += 2) { // same bound as above: never touch element x1 + 1 past the block
+                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
+                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
+                            vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
+                            vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
+
+                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
+                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
+                            // interpolation-table index: fractional Y bits above fractional X bits
+                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
+                        }
+                        // scalar tail: same arithmetic for a leftover pixel
+                        for (; x1 < bw; x1++) {
+                            int X = (X0 + adelta[x + x1]) >> (AB_BITS - INTER_BITS);
+                            int Y = (Y0 + bdelta[x + x1]) >> (AB_BITS - INTER_BITS);
+                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
+                        }
+                    }
+                }
+                // sample the source through the maps just built for this block
+                if (interpolation == CV_HAL_INTER_NEAREST)
+                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
+                else {
+                    Mat _matA(bh, bw, CV_16U, A);
+                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
+                }
+            }
+        }
+    }
+
+private:
+    Mat src;
+    Mat dst;
+    int interpolation, borderType;
+    Scalar borderValue;
+    int *adelta, *bdelta; // per-column fixed-point terms, indexed by destination x; owned by the caller
+    const double* M; // affine coefficients; M[1], M[2], M[4], M[5] are read here
+};
+
+// HAL warpAffine: precomputes the per-column fixed-point terms of the mapping, then runs
+// WarpAffineInvoker over all destination rows. Always returns CV_HAL_ERROR_OK.
+int warpAffine(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[6], int interpolation, int borderType, const double borderValue[4])
+{
+    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
+    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    const int AB_SCALE = 1 << AB_BITS;
+
+    // One buffer holds both tables: adelta in the first dst.cols entries, bdelta in the rest.
+    AutoBuffer<int> _abdelta(dst.cols * 2);
+    int* const adelta = &_abdelta[0];
+    int* const bdelta = adelta + dst.cols;
+    for (int col = 0; col < dst.cols; col++) {
+        adelta[col] = saturate_cast<int>(M[0] * col * AB_SCALE);
+        bdelta[col] = saturate_cast<int>(M[3] * col * AB_SCALE);
+    }
+
+    const Scalar border(borderValue[0], borderValue[1], borderValue[2], borderValue[3]);
+    WarpAffineInvoker invoker(src, dst, interpolation, borderType, border, adelta, bdelta, M);
+    parallel_for_(Range(0, dst.rows), invoker, dst.total() / (double)(1 << 16));
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/warpPerspective.cpp b/3rdparty/ndsrvp/src/warpPerspective.cpp
new file mode 100644
index 0000000000..b4fa423ed7
--- /dev/null
+++ b/3rdparty/ndsrvp/src/warpPerspective.cpp
@@ -0,0 +1,159 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// Parallel loop body for warpPerspective(): walks dst in blocks, fills a 16-bit coordinate
+// map (plus table offsets for interpolating modes) from M[0..8], then calls remap() per block.
+class WarpPerspectiveInvoker : public ParallelLoopBody {
+public:
+    WarpPerspectiveInvoker(const Mat& _src, Mat& _dst, const double* _M, int _interpolation,
+        int _borderType, const Scalar& _borderValue)
+        : ParallelLoopBody()
+        , src(_src)
+        , dst(_dst)
+        , M(_M)
+        , interpolation(_interpolation)
+        , borderType(_borderType)
+        , borderValue(_borderValue)
+    {
+    }
+
+    // Handles destination rows [range.start, range.end).
+    virtual void operator()(const Range& range) const CV_OVERRIDE
+    {
+        const int BLOCK_SZ = 32;
+        short XY[BLOCK_SZ * BLOCK_SZ * 2], A[BLOCK_SZ * BLOCK_SZ];
+        int x, y, y1, width = dst.cols, height = dst.rows;
+
+        // Block shape: bw0 * bh0 <= BLOCK_SZ * BLOCK_SZ, the capacity of XY and A.
+        int bh0 = std::min(BLOCK_SZ / 2, height);
+        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, width);
+        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, height);
+
+        for (y = range.start; y < range.end; y += bh0) {
+            for (x = 0; x < width; x += bw0) {
+                int bw = std::min(bw0, width - x);
+                int bh = std::min(bh0, range.end - y); // height
+
+                Mat _XY(bh, bw, CV_16SC2, XY);
+                Mat dpart(dst, Rect(x, y, bw, bh));
+
+                for (y1 = 0; y1 < bh; y1++) {
+                    short* xy = XY + y1 * bw * 2;
+                    double X0 = M[0] * x + M[1] * (y + y1) + M[2];
+                    double Y0 = M[3] * x + M[4] * (y + y1) + M[5];
+                    double W0 = M[6] * x + M[7] * (y + y1) + M[8];
+
+                    if (interpolation == CV_HAL_INTER_NEAREST) {
+                        int x1 = 0;
+                        // Two pixels per iteration. The bound must be x1 + 1 < bw: with
+                        // x1 < bw an odd bw stores one pixel past the row, and for
+                        // bw == 1, bh == 1024 that store lands past the end of XY.
+                        for (; x1 + 1 < bw; x1 += 2) {
+                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+                            W1 = W1 ? 1. / W1 : 0; // W == 0 yields 0 instead of dividing by zero
+                            W2 = W2 ? 1. / W2 : 0;
+                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+
+                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+                            vX = __nds__v_sclip32(vX, 15);
+                            vY = __nds__v_sclip32(vY, 15);
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
+                        }
+                        // Scalar tail: the last pixel of the row when bw is odd.
+                        for (; x1 < bw; x1++) {
+                            double W = W0 + M[6] * x1;
+                            W = W ? 1. / W : 0;
+                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+                            int X = saturate_cast<int>(fX);
+                            int Y = saturate_cast<int>(fY);
+                            xy[x1 * 2] = saturate_cast<short>(X);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+                        }
+                    } else {
+                        short* alpha = A + y1 * bw;
+                        int x1 = 0;
+                        const int INTER_MASK = INTER_TAB_SIZE - 1;
+                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+                        // Two pixels per iteration; x1 + 1 < bw keeps an odd bw inside xy and alpha.
+                        for (; x1 + 1 < bw; x1 += 2) {
+                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+                            W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
+                            W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
+                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+                            // xy gets X, Y >> INTER_BITS; alpha gets ((Y & mask) << INTER_BITS) + (X & mask).
+                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
+                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
+                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
+                        }
+                        // Scalar tail: the last pixel of the row when bw is odd.
+                        for (; x1 < bw; x1++) {
+                            double W = W0 + M[6] * x1;
+                            W = W ? INTER_TAB_SIZE / W : 0;
+                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+                            int X = saturate_cast<int>(fX);
+                            int Y = saturate_cast<int>(fY);
+                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
+                        }
+                    }
+                }
+
+                // NEAREST passes an empty second map; every other mode passes A as a CV_16U map.
+                if (interpolation == CV_HAL_INTER_NEAREST)
+                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
+                else {
+                    Mat _matA(bh, bw, CV_16U, A);
+                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
+                }
+            }
+        }
+    }
+
+private:
+    Mat src;
+    Mat dst;
+    const double* M;
+    int interpolation, borderType;
+    Scalar borderValue;
+};
+
+int warpPerspective(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[9], int interpolation, int borderType, const double borderValue[4])
+{
+    // Wrap the caller-provided buffers as Mat objects; dst is given the same type as src.
+    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
+    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+    const Scalar border(borderValue[0], borderValue[1], borderValue[2], borderValue[3]);
+    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, border);
+    parallel_for_(Range(0, dst.rows), invoker, dst.total() / (double)(1 << 16));
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 136de5c79f..f5e39b4c71 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -263,6 +263,8 @@ OCV_OPTION(WITH_CAROTENE "Use NVidia carotene acceleration library for ARM platf
   VISIBLE_IF (ARM OR AARCH64) AND NOT IOS AND NOT XROS)
 OCV_OPTION(WITH_KLEIDICV "Use KleidiCV library for ARM platforms" OFF
   VISIBLE_IF (AARCH64 AND (ANDROID OR UNIX AND NOT IOS AND NOT XROS)))
+OCV_OPTION(WITH_NDSRVP "Use Andes RVP extension" (NOT CV_DISABLE_OPTIMIZATION)
+  VISIBLE_IF RISCV)
 OCV_OPTION(WITH_CPUFEATURES "Use cpufeatures Android library" ON
   VISIBLE_IF ANDROID
   VERIFY HAVE_CPUFEATURES)
@@ -985,6 +987,13 @@ if(WITH_CAROTENE)
   endif()
 endif()
 
+if(WITH_NDSRVP)
+  ocv_debug_message(STATUS "Andes RVP 3rdparty NDSRVP enabled")
+  if(NOT ";${OpenCV_HAL};" MATCHES ";ndsrvp;")  # skip when ndsrvp is already in the list
+    set(OpenCV_HAL "ndsrvp;${OpenCV_HAL}")  # prepend ndsrvp to the existing entries
+  endif()
+endif()
+
 foreach(hal ${OpenCV_HAL})
   if(hal STREQUAL "carotene")
     if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
@@ -1006,6 +1015,14 @@ foreach(hal ${OpenCV_HAL})
     add_subdirectory(3rdparty/kleidicv)
     ocv_hal_register(KLEIDICV_HAL_LIBRARIES KLEIDICV_HAL_HEADERS KLEIDICV_HAL_INCLUDE_DIRS)
     list(APPEND OpenCV_USED_HAL "KleidiCV (ver ${KLEIDICV_HAL_VERSION})")
+  elseif(hal STREQUAL "ndsrvp")
+    if(CMAKE_C_FLAGS MATCHES "-mext-dsp" AND CMAKE_CXX_FLAGS MATCHES "-mext-dsp" AND NOT ";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")
+      add_subdirectory(3rdparty/ndsrvp)
+      ocv_hal_register(NDSRVP_HAL_LIBRARIES NDSRVP_HAL_HEADERS NDSRVP_HAL_INCLUDE_DIRS)
+      list(APPEND OpenCV_USED_HAL "ndsrvp (ver ${NDSRVP_HAL_VERSION})")
+    else()
+      message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not open, disabling ndsrvp...")
+    endif()
   elseif(hal STREQUAL "openvx")
     add_subdirectory(3rdparty/openvx)
     ocv_hal_register(OPENVX_HAL_LIBRARIES OPENVX_HAL_HEADERS OPENVX_HAL_INCLUDE_DIRS)
diff --git a/platforms/linux/riscv64-andes-gcc.toolchain.cmake b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
index ce733fc790..9b9c0b5246 100755
--- a/platforms/linux/riscv64-andes-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
@@ -1,10 +1,25 @@
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR riscv64)
 
+# Diagnostics: echo both environment variables; only $ENV{RISCV} feeds the compiler path below.
+message(STATUS "RISCV: $ENV{RISCV}")
+message(STATUS "RISCV_GCC_INSTALL_ROOT: $ENV{RISCV_GCC_INSTALL_ROOT}")
 set(RISCV_GCC_INSTALL_ROOT $ENV{RISCV} CACHE PATH "Path to GCC for RISC-V cross compiler installation directory")
 
 set(CMAKE_C_COMPILER  ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc)
 set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++)
 
+# Fix toolchain macro: define __ANDES=1 for both C and C++ compilation.
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ANDES=1")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ANDES=1")
+
+# Enable RVP: target rv64gc and pass -mext-dsp to both the C and C++ compilers.
+
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gc -mext-dsp")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc -mext-dsp")
+
+# Fix segment address: link executables and shared libraries with -Ttext-segment=0x50000.
+
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")