Merge pull request #22275 from zihaomu:fp16_support_conv

DNN: FP16 support on Convolution 2D #22275 

## FP16 support on ARM platform
This PR proposes to support FP16 backend in Convolution.
For now, we only support FP16 on ARM aarch64.

In addition to adding fp16, I also added `seperateIm2col` optimization in this patch.

## How to use FP16 to speed up convolution?
```
Net net = readNet(modelPath);
net.setPreferableTarget(DNN_TARGET_CPU_FP16);
net.setInput(blob);
Mat output = net.forward();
```

### TODO List
| Task | Status | Remarks |
|:-------:|:--------:|:------------:|
| Convolution 2D FP16 | ✔️ | Done |
| Winograd FP16 | ⏳ Postponed | Because the current modification has already reached 2k lines, Winograd FP16 will be completed in the next PR. |
| Accuracy Test | ✔️ | Done |
| Performance Test | ✔️ | Done |
| Compiler bug | ✔️ | Done |

### Speed Test for FP16

**Test on M1 chip, 4 threads.**

| Model Name | FP32 (Conv+Wino) | Conv(FP16) + Wino(FP 32) |
|:-------:|:--------:|:------------:|
| ResNet 50 | 26.0 ms | **18.05 ms** (25% speed up)|
| MobileNet V2 | 4.17 ms | **3.09 ms (29% speed up)** |

### Speed Test for `seperateIm2col` trick on X86.
**Test on AMD 5600x, 12 threads.**
| Model Name | 4.x | Patch |
|:-------:|:--------:|:------------:|
| MobileNet V2 | 5.6 ms | **3.0 ms (46% speed up)** |

### Performance Test

#### Performance Test of X86 platform: AMD 5600X, with `-perf_threads=1`
|Name of Test|4.x|patch|patch vs 4.x (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.00|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.03|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.001|0.001|0.92|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.002|0.003|0.95|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.006|0.006|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.045|0.033|1.39|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.011|0.009|1.17|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.109|0.078|1.39|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.040|0.042|0.94|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.326|0.342|0.95|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.580|0.589|0.99|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.293|1.382|0.94|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.590|3.710|0.97|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.120|1.191|0.94|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.576|2.872|0.90|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.599|4.670|0.98|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|9.230|9.582|0.96|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|65.946|69.381|0.95|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|18.915|19.289|0.98|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|1.404|1.457|0.96|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|2.060|1.501|1.37|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.409|1.464|0.96|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|1.793|1.838|0.98|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.207|1.199|1.01|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.277|1.275|1.00|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.319|2.370|0.98|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.351|1.346|1.00|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|3.520|3.612|0.97|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.876|1.880|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.981|1.995|0.99|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|2.620|2.627|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|4.202|4.123|1.02|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.429|2.445|0.99|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|2.591|2.576|1.01|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|3.005|2.998|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|3.515|3.532|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|3.115|3.134|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.937|3.899|1.01|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|5.533|5.471|1.01|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.472|3.464|1.00|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|4.302|4.322|1.00|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|6.100|6.035|1.01|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|6.580|6.484|1.01|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|9.741|9.634|1.01|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|10.131|10.156|1.00|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|12.391|12.350|1.00|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|91.074|87.893|1.04|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|5.903|5.903|1.00|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.890|6.794|1.01|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.160|5.131|1.01|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|4.970|5.036|0.99|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|5.045|5.015|1.01|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|11.583|11.343|1.02|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.348|5.320|1.01|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|5.357|5.396|0.99|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|6.050|6.006|1.01|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|5.952|5.953|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|8.014|8.014|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.472|12.577|0.99|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|10.803|10.655|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|18.429|13.405|1.37|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|6.659|6.647|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|14.192|13.819|1.03|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|6.045|6.068|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.742|12.828|0.99|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|8.046|7.773|1.04|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.440|17.192|1.01|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|15.418|14.972|1.03|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.430|0.430|1.00|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|6.692|6.663|1.00|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|6.350|6.347|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.267|0.265|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|7.755|7.558|1.03|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.203|0.202|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.663|10.576|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|10.827|10.614|1.02|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|7.049|6.947|1.01|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|6.900|6.901|1.00|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.165|0.165|1.00|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|17.953|17.251|1.04|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|7.430|7.320|1.01|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|22.187|21.705|1.02|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|8.349|8.126|1.03|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|8.273|8.297|1.00|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|8.169|8.094|1.01|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|13.602|13.359|1.02|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|8.633|8.584|1.01|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|29.339|28.897|1.02|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|13.000|12.920|1.01|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|14.262|13.319|1.07|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|27.453|27.253|1.01|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|32.052|27.269|1.18|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|15.363|15.208|1.01|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|18.543|18.434|1.01|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|39.114|37.954|1.03|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|36.271|36.972|0.98|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|19.262|19.427|0.99|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.298|19.349|1.00|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.261|19.847|1.02|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.867|21.525|1.02|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|51.756|49.979|1.04|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|28.133|27.060|1.04|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|25.035|24.980|1.00|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|25.858|25.821|1.00|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|27.313|27.149|1.01|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|28.219|28.111|1.00|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|46.025|46.674|0.99|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|30.220|29.446|1.03|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|49.410|48.708|1.01|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|38.203|38.001|1.01|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|39.961|39.021|1.02|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|48.685|47.075|1.03|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|75.114|72.586|1.03|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|41.222|41.144|1.00|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|46.220|46.353|1.00|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|98.201|98.771|0.99|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|100.106|96.971|1.03|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|146.977|140.445|1.05|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|198.618|194.665|1.02|


#### Performance Test of ARM platform: Apple M1, with `-perf_threads=1`

Min (ms)

|Name of Test|4.x|patch|4.x vs patch (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|1.07|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|1.10|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.002|0.002|0.97|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.003|0.003|0.84|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.009|0.009|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.027|0.030|0.90|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.008|0.007|1.07|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.066|0.072|0.91|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.090|0.054|1.68|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.328|0.409|0.80|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.659|0.697|0.95|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.266|1.403|0.90|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.550|4.145|0.86|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.188|1.375|0.86|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.683|3.236|0.83|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.491|5.501|0.82|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|8.916|10.181|0.88|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.995|72.296|0.97|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|22.531|23.139|0.97|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|2.239|1.933|1.16|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU_FP16)|-|1.010|-|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|3.134|2.068|1.52|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU_FP16)|-|1.062|-|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.918|1.920|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU_FP16)|-|1.014|-|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.340|2.352|0.99|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.247|-|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.116|1.111|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU_FP16)|-|1.114|-|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.116|1.112|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|1.113|-|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|3.067|3.085|0.99|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.622|-|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.153|1.187|0.97|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU_FP16)|-|1.150|-|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|4.804|4.849|0.99|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU_FP16)|-|2.922|-|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.463|1.469|1.00|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.459|-|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.577|1.580|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|1.580|-|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|1.826|1.818|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|1.817|-|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|6.541|5.081|1.29|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|2.809|-|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.912|1.919|1.00|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU_FP16)|-|1.919|-|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|1.961|1.971|0.99|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|1.961|-|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.317|2.329|0.99|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.322|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|2.920|2.947|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|2.924|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|2.467|2.466|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|2.496|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|3.028|2.997|1.01|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|2.986|-|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|4.353|4.355|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|4.355|-|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.762|2.793|0.99|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|2.797|-|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|3.428|3.226|1.06|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU_FP16)|-|3.223|-|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|3.967|3.957|1.00|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU_FP16)|-|3.960|-|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|4.806|4.387|1.10|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU_FP16)|-|4.366|-|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|14.509|11.756|1.23|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|6.510|-|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|13.718|13.287|1.03|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.190|-|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|15.133|14.853|1.02|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU_FP16)|-|8.671|-|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|41.928|43.328|0.97|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|38.072|-|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.409|4.428|1.00|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.427|-|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.144|5.363|1.15|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU_FP16)|-|5.368|-|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.926|3.932|1.00|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.938|-|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.920|3.915|1.00|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.950|-|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|3.767|3.764|1.00|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|3.762|-|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|19.959|13.875|1.44|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU_FP16)|-|7.781|-|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|3.951|3.955|1.00|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|3.969|-|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|4.050|4.034|1.00|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.093|-|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|4.923|4.506|1.09|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.509|-|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|4.759|4.476|1.06|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.447|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|6.079|5.628|1.08|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU_FP16)|-|5.625|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.843|17.523|1.13|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.917|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|8.334|8.247|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU_FP16)|-|8.246|-|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|23.164|18.199|1.27|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.305|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|5.184|5.178|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU_FP16)|-|5.149|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.990|18.103|0.99|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|9.777|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|4.831|4.522|1.07|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU_FP16)|-|4.523|-|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.328|17.319|1.00|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|8.948|-|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|5.944|5.961|1.00|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU_FP16)|-|5.936|-|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.811|20.064|0.99|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU_FP16)|-|11.705|-|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|22.398|17.686|1.27|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU_FP16)|-|9.859|-|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.416|0.416|1.00|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.417|-|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|5.356|5.110|1.05|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU_FP16)|-|5.114|-|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|5.092|4.748|1.07|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU_FP16)|-|4.754|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.260|0.229|1.13|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.229|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|5.872|5.460|1.08|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU_FP16)|-|5.460|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.161|0.161|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.161|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|7.176|7.175|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|7.162|-|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|7.174|7.185|1.00|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU_FP16)|-|7.157|-|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|5.400|5.180|1.04|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.201|-|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|5.330|5.188|1.03|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.177|-|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.115|0.115|1.00|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|0.115|-|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|26.156|20.222|1.29|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU_FP16)|-|11.203|-|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|5.627|5.543|1.02|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU_FP16)|-|5.506|-|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|27.925|27.741|1.01|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU_FP16)|-|17.217|-|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|6.359|6.062|1.05|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.048|-|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|6.559|6.322|1.04|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|6.280|-|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|6.412|6.200|1.03|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.197|-|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|9.167|8.624|1.06|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU_FP16)|-|8.626|-|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|6.755|6.491|1.04|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU_FP16)|-|6.520|-|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|35.664|34.752|1.03|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|20.260|-|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|9.514|9.414|1.01|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|9.462|-|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|10.631|9.963|1.07|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|9.935|-|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|37.465|36.798|1.02|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|19.569|-|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|38.157|36.157|1.06|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU_FP16)|-|18.902|-|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.356|10.401|1.00|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|10.360|-|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|12.641|12.150|1.04|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|12.162|-|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|50.545|50.505|1.00|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|27.950|-|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|54.233|49.603|1.09|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU_FP16)|-|26.515|-|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|13.779|12.968|1.06|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|12.984|-|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|15.809|15.329|1.03|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|15.433|-|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|14.563|14.527|1.00|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|14.480|-|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|16.714|16.484|1.01|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|16.362|-|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|77.832|65.729|1.18|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU_FP16)|-|32.065|-|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|21.903|20.386|1.07|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|20.416|-|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|20.405|18.148|1.12|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU_FP16)|-|18.128|-|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.334|18.521|1.10|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|18.495|-|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.527|19.584|1.10|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|19.630|-|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|22.715|20.057|1.13|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU_FP16)|-|20.068|-|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|26.228|24.992|1.05|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|24.957|-|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|21.524|21.581|1.00|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|21.782|-|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|34.094|31.964|1.07|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|31.925|-|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|28.677|27.813|1.03|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.808|-|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|31.274|27.892|1.12|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|27.910|-|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|30.533|30.007|1.02|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|30.089|-|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|39.837|38.312|1.04|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|38.477|-|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|32.480|29.237|1.11|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU_FP16)|-|29.452|-|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|33.544|32.832|1.02|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU_FP16)|-|32.784|-|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|134.481|130.678|1.03|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU_FP16)|-|70.134|-|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|127.930|126.530|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU_FP16)|-|65.261|-|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|201.346|187.007|1.08|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU_FP16)|-|91.525|-|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|252.038|245.587|1.03|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU_FP16)|-|125.477|-|

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
pull/23637/head
Zihao Mu 2 years ago committed by GitHub
parent 001a2c5195
commit 5229312ad2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      modules/dnn/include/opencv2/dnn/dnn.hpp
  2. 2
      modules/dnn/src/dnn_common.hpp
  3. 4
      modules/dnn/src/layers/convolution_layer.cpp
  4. 642
      modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
  5. 1927
      modules/dnn/src/layers/cpu_kernels/convolution.cpp
  6. 39
      modules/dnn/src/layers/cpu_kernels/convolution.hpp
  7. 2
      modules/dnn/src/layers/pooling_layer.cpp
  8. 12
      modules/dnn/src/net_impl.cpp
  9. 14
      modules/dnn/src/net_impl_backend.cpp
  10. 8
      modules/dnn/src/registry.cpp
  11. 27
      modules/dnn/test/test_backends.cpp
  12. 28
      modules/dnn/test/test_caffe_importer.cpp
  13. 3
      modules/dnn/test/test_common.hpp
  14. 3
      modules/dnn/test/test_common.impl.hpp
  15. 18
      modules/dnn/test/test_darknet_importer.cpp
  16. 6
      modules/dnn/test/test_googlenet.cpp
  17. 10
      modules/dnn/test/test_layers.cpp
  18. 20
      modules/dnn/test/test_model.cpp
  19. 2
      modules/dnn/test/test_onnx_conformance.cpp
  20. 2
      modules/dnn/test/test_onnx_importer.cpp
  21. 56
      modules/dnn/test/test_tf_importer.cpp
  22. 20
      modules/dnn/test/test_torch_importer.cpp

@ -106,6 +106,7 @@ CV__DNN_INLINE_NS_BEGIN
DNN_TARGET_CUDA_FP16, DNN_TARGET_CUDA_FP16,
DNN_TARGET_HDDL, DNN_TARGET_HDDL,
DNN_TARGET_NPU, DNN_TARGET_NPU,
DNN_TARGET_CPU_FP16, // Only the ARM platform is supported. Low precision computing, accelerate model inference.
}; };
/** /**

@ -13,7 +13,7 @@
namespace cv { namespace dnn { namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN CV__DNN_INLINE_NS_BEGIN
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16) #define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU) // TODO: add DNN_TARGET_CPU_FP16 #define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU || id == DNN_TARGET_CPU_FP16)
Mutex& getInitializationMutex(); Mutex& getInitializationMutex();
void initializeLayerFactory(); void initializeLayerFactory();

@ -428,7 +428,6 @@ public:
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{ {
BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr); BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr);
std::vector<Mat> inputs; std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs); inputs_arr.getMatVector(inputs);
// prepare weightsMat where each row is aligned and has enough zero padding on the right to // prepare weightsMat where each row is aligned and has enough zero padding on the right to
@ -1405,7 +1404,8 @@ public:
CV_Assert(outputs[0].size[1] % ngroups == 0); CV_Assert(outputs[0].size[1] % ngroups == 0);
fastConvImpl = initFastConv(weightsMat, &biasvec[0], ngroups, K, C, kernel_size, strides, fastConvImpl = initFastConv(weightsMat, &biasvec[0], ngroups, K, C, kernel_size, strides,
dilations, pads_begin, pads_end, conv_dim, canUseWinograd); dilations, pads_begin, pads_end, conv_dim,
preferableTarget == DNN_TARGET_CPU_FP16, canUseWinograd);
} }
runFastConv(inputs[0], outputs[0], fastConvImpl, nstripes, activ, reluslope, fusedAdd); runFastConv(inputs[0], outputs[0], fastConvImpl, nstripes, activ, reluslope, fusedAdd);

@ -8,7 +8,7 @@ namespace cv {
namespace dnn { namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR); void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR);
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
@ -17,7 +17,7 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif #endif
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
{ {
CV_Assert(convMR == 4 && convNR == 24); CV_Assert(convMR == 4 && convNR == 24);
__m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00; __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
@ -28,29 +28,72 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
__m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
__m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
for (int p = 0; p < np; p++, a += convMR, b += convNR) if (width > 16)
{ {
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); for (int p = 0; p < np; p++, a += convMR, b += convNR)
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); {
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
c00 = _mm256_fmadd_ps(b0, a0, c00);
c01 = _mm256_fmadd_ps(b1, a0, c01);
c02 = _mm256_fmadd_ps(b2, a0, c02);
c10 = _mm256_fmadd_ps(b0, a1, c10);
c11 = _mm256_fmadd_ps(b1, a1, c11);
c12 = _mm256_fmadd_ps(b2, a1, c12);
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
c20 = _mm256_fmadd_ps(b0, a0, c20);
c21 = _mm256_fmadd_ps(b1, a0, c21);
c22 = _mm256_fmadd_ps(b2, a0, c22);
c30 = _mm256_fmadd_ps(b0, a1, c30);
c31 = _mm256_fmadd_ps(b1, a1, c31);
c32 = _mm256_fmadd_ps(b2, a1, c32);
}
}
else if (width > 8)
{
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8);
c00 = _mm256_fmadd_ps(b0, a0, c00); c00 = _mm256_fmadd_ps(b0, a0, c00);
c01 = _mm256_fmadd_ps(b1, a0, c01); c01 = _mm256_fmadd_ps(b1, a0, c01);
c02 = _mm256_fmadd_ps(b2, a0, c02);
c10 = _mm256_fmadd_ps(b0, a1, c10); c10 = _mm256_fmadd_ps(b0, a1, c10);
c11 = _mm256_fmadd_ps(b1, a1, c11); c11 = _mm256_fmadd_ps(b1, a1, c11);
c12 = _mm256_fmadd_ps(b2, a1, c12);
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
c20 = _mm256_fmadd_ps(b0, a0, c20); c20 = _mm256_fmadd_ps(b0, a0, c20);
c21 = _mm256_fmadd_ps(b1, a0, c21); c21 = _mm256_fmadd_ps(b1, a0, c21);
c22 = _mm256_fmadd_ps(b2, a0, c22);
c30 = _mm256_fmadd_ps(b0, a1, c30); c30 = _mm256_fmadd_ps(b0, a1, c30);
c31 = _mm256_fmadd_ps(b1, a1, c31); c31 = _mm256_fmadd_ps(b1, a1, c31);
c32 = _mm256_fmadd_ps(b2, a1, c32); }
} }
else
{
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
b0 = _mm256_load_ps(b);
c00 = _mm256_fmadd_ps(b0, a0, c00);
c10 = _mm256_fmadd_ps(b0, a1, c10);
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
c20 = _mm256_fmadd_ps(b0, a0, c20);
c30 = _mm256_fmadd_ps(b0, a1, c30);
}
}
if (!init_c) if (!init_c)
{ {
@ -87,7 +130,7 @@ namespace opt_NEON
{ {
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
{ {
#if CV_NEON_AARCH64 #if CV_NEON_AARCH64
if (convMR == 4 && convNR == 28) // AARCH64 if (convMR == 4 && convNR == 28) // AARCH64
@ -97,44 +140,105 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20; float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20;
float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30; float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30;
for( int p = 0; p < np; p++, a += convMR, b += convNR ) if (width > 16)
{
for( int p = 0; p < np; p++, a += convMR, b += convNR )
{
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
c35 = vfmaq_laneq_f32(c35, b2, a0, 3);
b0 = vld1q_f32(b + 24);
c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
}
}
else if (width > 8)
{
for( int p = 0; p < np; p++, a += convMR, b += convNR )
{
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
b0 = vld1q_f32(b + 12);
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
}
}
else if (width > 4)
{
for( int p = 0; p < np; p++, a += convMR, b += convNR )
{
float32x4_t a0 = vld1q_f32(a), b0, b1;
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4);
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
}
}
else
{ {
float32x4_t a0 = vld1q_f32(a), b0, b1, b2; for( int p = 0; p < np; p++, a += convMR, b += convNR )
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); {
float32x4_t a0 = vld1q_f32(a), b0;
c00 = vfmaq_laneq_f32(c00, b0, a0, 0); b0 = vld1q_f32(b);
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
c02 = vfmaq_laneq_f32(c02, b2, a0, 0); c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
c10 = vfmaq_laneq_f32(c10, b0, a0, 1); c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
c11 = vfmaq_laneq_f32(c11, b1, a0, 1); c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
c12 = vfmaq_laneq_f32(c12, b2, a0, 1); c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
c20 = vfmaq_laneq_f32(c20, b0, a0, 2); }
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
c35 = vfmaq_laneq_f32(c35, b2, a0, 3);
b0 = vld1q_f32(b + 24);
c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
} }
if (!init_c) if (!init_c)
@ -204,26 +308,62 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0; float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
for (int p = 0; p < np; p++, a += convMR, b += convNR) if (width > 8)
{
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
c1 = vmlaq_lane_f32(c1, b1, a0, 0);
c2 = vmlaq_lane_f32(c2, b2, a0, 0);
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
c4 = vmlaq_lane_f32(c4, b1, a0, 1);
c5 = vmlaq_lane_f32(c5, b2, a0, 1);
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
c7 = vmlaq_lane_f32(c7, b1, a1, 0);
c8 = vmlaq_lane_f32(c8, b2, a1, 0);
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
c10 = vmlaq_lane_f32(c10, b1, a1, 1);
c11 = vmlaq_lane_f32(c11, b2, a1, 1);
}
}
else if (width > 4)
{ {
a0 = vld1_f32(a), a1 = vld1_f32(a+2); for (int p = 0; p < np; p++, a += convMR, b += convNR)
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); {
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4);
c0 = vmlaq_lane_f32(c0, b0, a0, 0); c0 = vmlaq_lane_f32(c0, b0, a0, 0);
c1 = vmlaq_lane_f32(c1, b1, a0, 0); c1 = vmlaq_lane_f32(c1, b1, a0, 0);
c2 = vmlaq_lane_f32(c2, b2, a0, 0);
c3 = vmlaq_lane_f32(c3, b0, a0, 1); c3 = vmlaq_lane_f32(c3, b0, a0, 1);
c4 = vmlaq_lane_f32(c4, b1, a0, 1); c4 = vmlaq_lane_f32(c4, b1, a0, 1);
c5 = vmlaq_lane_f32(c5, b2, a0, 1);
c6 = vmlaq_lane_f32(c6, b0, a1, 0); c6 = vmlaq_lane_f32(c6, b0, a1, 0);
c7 = vmlaq_lane_f32(c7, b1, a1, 0); c7 = vmlaq_lane_f32(c7, b1, a1, 0);
c8 = vmlaq_lane_f32(c8, b2, a1, 0);
c9 = vmlaq_lane_f32(c9 , b0, a1, 1); c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
c10 = vmlaq_lane_f32(c10, b1, a1, 1); c10 = vmlaq_lane_f32(c10, b1, a1, 1);
c11 = vmlaq_lane_f32(c11, b2, a1, 1); }
}
else
{
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
b0 = vld1q_f32(b);
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
}
} }
if (!init_c) if (!init_c)
@ -254,6 +394,366 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock"); CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock");
} }
// Single-output-channel (MR = 1) FP32 convolution micro-kernel for AArch64 NEON.
// Accumulates c[j] = bias + sum_{p=0..np-1} a[p] * b[p*convNR + j] for j in [0, convNR),
// where `a` is a row of np weights and `b` is the packed (im2col) input buffer.
// Only convNR == 28 (7 x float32x4 accumulator registers) is supported.
//
// Parameters:
//   np          - depth of the accumulation (number of packed input rows).
//   a           - weight row, np scalars.
//   b           - packed input, np * convNR floats.
//   c           - output buffer; NOTE: all 28 lanes are always stored below, so `c`
//                 must have room for convNR floats even when width < convNR.
//   bias        - value used to seed every accumulator lane.
//   init_c      - when true, the existing contents of `c` are ADDED into the result
//                 (NOTE(review): this is the opposite convention to convBlock's
//                 `!init_c` accumulation path — confirm against the caller).
//   minval/maxval/ifMinMaxAct - optional fused clamp activation (e.g. ReLU/ReLU6).
//   width       - number of valid output columns; used only to skip loads/FMAs for
//                 narrow tail blocks (the final stores still write all 28 lanes).
//   convNR      - register-block width, must be 28.
void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const float bias, bool init_c,
                      const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR)
{
    CV_Assert(convNR == 28);
    // Seed all 7 accumulator registers (7 * 4 = 28 lanes) with the bias.
    float32x4_t c0 = vdupq_n_f32(bias), c1 = c0, c2 = c0;
    float32x4_t c3 = c0, c4 = c0, c5 = c0, c6 = c0;

    // Dispatch on the valid output width so tail blocks avoid computing lanes
    // that carry no useful data. Each branch runs the same FMA pattern over a
    // progressively smaller set of accumulators.
    if (width > 16)
    {
        // Full block: all 28 columns.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
            float32x4_t b3 = vld1q_f32(b + 12), b4 = vld1q_f32(b + 16), b5 = vld1q_f32(b + 20);
            float32x4_t b6 = vld1q_f32(b + 24);

            c0 = vmlaq_n_f32(c0, b0, a[0]);
            c1 = vmlaq_n_f32(c1, b1, a[0]);
            c2 = vmlaq_n_f32(c2, b2, a[0]);
            c3 = vmlaq_n_f32(c3, b3, a[0]);
            c4 = vmlaq_n_f32(c4, b4, a[0]);
            c5 = vmlaq_n_f32(c5, b5, a[0]);
            c6 = vmlaq_n_f32(c6, b6, a[0]);
        }
    }
    else if (width > 8)
    {
        // 9..16 valid columns: only the first 4 accumulators are computed.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
            float32x4_t b3 = vld1q_f32(b + 12);

            c0 = vmlaq_n_f32(c0, b0, a[0]);
            c1 = vmlaq_n_f32(c1, b1, a[0]);
            c2 = vmlaq_n_f32(c2, b2, a[0]);
            c3 = vmlaq_n_f32(c3, b3, a[0]);
        }
    }
    else if (width > 4)
    {
        // 5..8 valid columns.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4);

            c0 = vmlaq_n_f32(c0, b0, a[0]);
            c1 = vmlaq_n_f32(c1, b1, a[0]);
        }
    }
    else
    {
        // 1..4 valid columns.
        for (int p = 0; p < np; p++, a++, b += convNR)
        {
            float32x4_t b0 = vld1q_f32(b);
            c0 = vmlaq_n_f32(c0, b0, a[0]);
        }
    }

    if (init_c)
    {
        // Accumulate on top of the previous contents of c.
        // Loads all 28 lanes regardless of width (matches the unconditional stores below).
        c0 += vld1q_f32(c);
        c1 += vld1q_f32(c + 4);
        c2 += vld1q_f32(c + 8);
        c3 += vld1q_f32(c + 12);
        c4 += vld1q_f32(c + 16);
        c5 += vld1q_f32(c + 20);
        c6 += vld1q_f32(c + 24);
    }

    if (ifMinMaxAct)
    {
        // Fused clamp activation: c = min(max(c, minval), maxval).
        float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval);
        c0 = vminq_f32(vmaxq_f32(c0, v_minval), v_maxval);
        c1 = vminq_f32(vmaxq_f32(c1, v_minval), v_maxval);
        c2 = vminq_f32(vmaxq_f32(c2, v_minval), v_maxval);
        c3 = vminq_f32(vmaxq_f32(c3, v_minval), v_maxval);
        c4 = vminq_f32(vmaxq_f32(c4, v_minval), v_maxval);
        c5 = vminq_f32(vmaxq_f32(c5, v_minval), v_maxval);
        c6 = vminq_f32(vmaxq_f32(c6, v_minval), v_maxval);
    }

    // Unconditionally store all 28 lanes; lanes beyond `width` hold bias-seeded
    // (or clamped) filler, so the caller must treat c as convNR-wide scratch.
    vst1q_f32(c, c0);
    vst1q_f32(c + 4, c1);
    vst1q_f32(c + 8, c2);
    vst1q_f32(c + 12, c3);
    vst1q_f32(c + 16, c4);
    vst1q_f32(c + 20, c5);
    vst1q_f32(c + 24, c6);
}
#if CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
// Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h.
typedef __fp16 float16_t;
#ifndef __ARM_FEATURE_FMA // Work around without FMA support.
#define vfmaq_f16(a, b, c) (a + b * c)
#endif
// FP16 GEMM-like convolution micro-kernel for AArch64 with FP16 vector arithmetic:
// computes an 8 x 24 (convMR_fp16 x convNR_fp16) tile
//     c[i*ldc + j] (+)= sum_{p=0..np-1} a[p*8 + i] * b[p*24 + j]
// entirely in half precision (both multiply-accumulate and storage are FP16, so
// accumulation precision is reduced relative to the FP32 kernels).
//
// The buffers are passed as char* so the public signature does not expose the
// FP16 type; they are reinterpreted as float16_t (the local __fp16 typedef that
// avoids the arm_neon.h / cvdef.h float16_t clash).
//
// Parameters:
//   np      - accumulation depth.
//   _a      - packed weights, np * convMR_fp16 half floats.
//   _b      - packed input, np * convNR_fp16 half floats.
//   _c      - output tile, 8 rows of ldc half floats; all 24 columns of each
//             row are always stored (width only prunes the FMA work).
//   ldc     - row stride of the output tile, in elements.
//   init_c  - when false, the previous contents of c are accumulated in
//             (same convention as the FP32 convBlock).
//   width   - number of valid output columns; selects the >16 / >8 / <=8 paths.
//   convMR_fp16 / convNR_fp16 - tile shape, must be 8 x 24.
void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width,
                    const int convMR_fp16, const int convNR_fp16)
{
#if 1
    // Fast path: NEON FP16 intrinsics.
    const float16_t* a = (const float16_t*)_a;
    const float16_t* b = (const float16_t*)_b;
    float16_t* c = (float16_t*)_c;

    CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24);
    // 8 rows x 3 vectors of 8 halves = the full 8 x 24 accumulator tile.
    float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00;
    float16x8_t c10 = c00, c11 = c00, c12 = c00;
    float16x8_t c20 = c00, c21 = c00, c22 = c00;
    float16x8_t c30 = c00, c31 = c00, c32 = c00;
    float16x8_t c40 = c00, c41 = c00, c42 = c00;
    float16x8_t c50 = c00, c51 = c00, c52 = c00;
    float16x8_t c60 = c00, c61 = c00, c62 = c00;
    float16x8_t c70 = c00, c71 = c00, c72 = c00;

    float16x8_t b0 = c00, b1 = c00, b2 = c00;

    // Width dispatch: skip FMAs on column groups past the valid width.
    if (width > 16)
    {
        // Full 24-column tile.
        for (int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16)
        {
            // a0/a1 hold the 8 per-row weights; vfmaq_lane_f16 broadcasts one lane.
            float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4);
            b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16);

            c00 = vfmaq_lane_f16(c00, b0, a0, 0);
            c01 = vfmaq_lane_f16(c01, b1, a0, 0);
            c02 = vfmaq_lane_f16(c02, b2, a0, 0);

            c10 = vfmaq_lane_f16(c10, b0, a0, 1);
            c11 = vfmaq_lane_f16(c11, b1, a0, 1);
            c12 = vfmaq_lane_f16(c12, b2, a0, 1);

            c20 = vfmaq_lane_f16(c20, b0, a0, 2);
            c21 = vfmaq_lane_f16(c21, b1, a0, 2);
            c22 = vfmaq_lane_f16(c22, b2, a0, 2);

            c30 = vfmaq_lane_f16(c30, b0, a0, 3);
            c31 = vfmaq_lane_f16(c31, b1, a0, 3);
            c32 = vfmaq_lane_f16(c32, b2, a0, 3);

            c40 = vfmaq_lane_f16(c40, b0, a1, 0);
            c41 = vfmaq_lane_f16(c41, b1, a1, 0);
            c42 = vfmaq_lane_f16(c42, b2, a1, 0);

            c50 = vfmaq_lane_f16(c50, b0, a1, 1);
            c51 = vfmaq_lane_f16(c51, b1, a1, 1);
            c52 = vfmaq_lane_f16(c52, b2, a1, 1);

            c60 = vfmaq_lane_f16(c60, b0, a1, 2);
            c61 = vfmaq_lane_f16(c61, b1, a1, 2);
            c62 = vfmaq_lane_f16(c62, b2, a1, 2);

            c70 = vfmaq_lane_f16(c70, b0, a1, 3);
            c71 = vfmaq_lane_f16(c71, b1, a1, 3);
            c72 = vfmaq_lane_f16(c72, b2, a1, 3);
        }
    }
    else if (width > 8)
    {
        // 9..16 valid columns: only the first two column groups per row.
        for( int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16)
        {
            float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4);
            float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8);

            c00 = vfmaq_lane_f16(c00, b0, a0, 0);
            c01 = vfmaq_lane_f16(c01, b1, a0, 0);

            c10 = vfmaq_lane_f16(c10, b0, a0, 1);
            c11 = vfmaq_lane_f16(c11, b1, a0, 1);

            c20 = vfmaq_lane_f16(c20, b0, a0, 2);
            c21 = vfmaq_lane_f16(c21, b1, a0, 2);

            c30 = vfmaq_lane_f16(c30, b0, a0, 3);
            c31 = vfmaq_lane_f16(c31, b1, a0, 3);

            c40 = vfmaq_lane_f16(c40, b0, a1, 0);
            c41 = vfmaq_lane_f16(c41, b1, a1, 0);

            c50 = vfmaq_lane_f16(c50, b0, a1, 1);
            c51 = vfmaq_lane_f16(c51, b1, a1, 1);

            c60 = vfmaq_lane_f16(c60, b0, a1, 2);
            c61 = vfmaq_lane_f16(c61, b1, a1, 2);

            c70 = vfmaq_lane_f16(c70, b0, a1, 3);
            c71 = vfmaq_lane_f16(c71, b1, a1, 3);
        }
    }
    else
    {
        // <= 8 valid columns: only the first column group.
        for( int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16)
        {
            float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4);
            float16x8_t b0 = vld1q_f16(b);

            c00 = vfmaq_lane_f16(c00, b0, a0, 0);
            c10 = vfmaq_lane_f16(c10, b0, a0, 1);
            c20 = vfmaq_lane_f16(c20, b0, a0, 2);
            c30 = vfmaq_lane_f16(c30, b0, a0, 3);
            c40 = vfmaq_lane_f16(c40, b0, a1, 0);
            c50 = vfmaq_lane_f16(c50, b0, a1, 1);
            c60 = vfmaq_lane_f16(c60, b0, a1, 2);
            c70 = vfmaq_lane_f16(c70, b0, a1, 3);
        }
    }

    if (!init_c)
    {
        // Accumulate the previously stored tile into the registers before writing back.
#undef _FX_UPDATE_CBUF_ROW
#define _FX_UPDATE_CBUF_ROW(row) \
    c##row##0 = c##row##0 + vld1q_f16(c + row*ldc); \
    c##row##1 = c##row##1 + vld1q_f16(c + row*ldc + 8); \
    c##row##2 = c##row##2 + vld1q_f16(c + row*ldc + 16)

        _FX_UPDATE_CBUF_ROW(0);
        _FX_UPDATE_CBUF_ROW(1);
        _FX_UPDATE_CBUF_ROW(2);
        _FX_UPDATE_CBUF_ROW(3);
        _FX_UPDATE_CBUF_ROW(4);
        _FX_UPDATE_CBUF_ROW(5);
        _FX_UPDATE_CBUF_ROW(6);
        _FX_UPDATE_CBUF_ROW(7);
    }

    // Store the full 8 x 24 tile (all columns, independent of width).
#undef _FX_STORE_CBUF_ROW
#define _FX_STORE_CBUF_ROW(row) \
    vst1q_f16(c + row*ldc, c##row##0); \
    vst1q_f16(c + row*ldc + 8, c##row##1); \
    vst1q_f16(c + row*ldc + 16, c##row##2)

    _FX_STORE_CBUF_ROW(0);
    _FX_STORE_CBUF_ROW(1);
    _FX_STORE_CBUF_ROW(2);
    _FX_STORE_CBUF_ROW(3);
    _FX_STORE_CBUF_ROW(4);
    _FX_STORE_CBUF_ROW(5);
    _FX_STORE_CBUF_ROW(6);
    _FX_STORE_CBUF_ROW(7);
#else
    // reference only.
    // Scalar fallback kept for debugging; note it accumulates in FP32 (cbuf) and
    // always processes the full convMR_fp16 x convNR_fp16 tile, ignoring width.
    const float16_t* a = (const float16_t*)_a;
    const float16_t* b = (const float16_t*)_b;
    float16_t* c = (float16_t*)_c;

    float cbuf[convMR_fp16*convNR_fp16];
    memset(cbuf, 0, sizeof(cbuf));

    for( int p = 0; p < np; p++ )
    {
        for( int i = 0; i < convMR_fp16; i++ )
        {
            float ai = float(a[convMR_fp16*p + i]);
            for( int j = 0; j < convNR_fp16; j++ )
                cbuf[i*convNR_fp16+j] += float(b[convNR_fp16*p + j]) * ai;
        }
    }

    if (!init_c)
    {
        // Accumulate into the existing output tile.
        for(int i = 0; i < convMR_fp16; i++)
        {
            for(int j = 0; j < convNR_fp16; j++)
                c[i*ldc + j] = float16_t(float(c[i*ldc + j]) + cbuf[i*convNR_fp16 + j]);
        }
    }
    else
    {
        // Overwrite the output tile.
        for(int i = 0; i < convMR_fp16; i++)
        {
            for(int j = 0; j < convNR_fp16; j++)
                c[i*ldc + j] = (float16_t)(cbuf[i*convNR_fp16 + j]);
        }
    }
#endif
}
// Single-output-channel (MR = 1) micro-kernel with FP16 inputs and FP32 output:
// accumulates c[j] = bias + sum_{p=0..np-1} a[p] * b[p*convNR_FP16 + j] in HALF
// precision, then widens the result to FP32 before the optional accumulate /
// clamp / store steps. Only convNR_FP16 == 24 is supported.
//
// Parameters:
//   _a / _b   - FP16 weight row (np halves) and packed input (np * 24 halves),
//               passed as char* to keep FP16 out of the public signature.
//   c         - FP32 output; all 24 lanes are always stored, so it must have
//               room for convNR_FP16 floats regardless of width.
//   _bias     - seed value; note it is narrowed to FP16 before accumulation,
//               which can lose precision for large biases.
//   init_c    - when true, the existing FP32 contents of c are added in
//               (same convention as convBlockMR1_F32 — see the note there).
//   minval/maxval/ifMinMaxAct - optional fused clamp activation, applied in FP32.
//   width     - number of valid output columns; prunes FMA work only.
//   convNR_FP16 - register-block width, must be 24.
void convBlockMR1_FP16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
                      const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16)
{
    CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24

    const float16_t* a = (const float16_t*)_a;
    const float16_t* b = (const float16_t*)_b;

    // Seed the three 8-lane FP16 accumulators (3 * 8 = 24) with the bias.
    const float16_t bias = (float16_t)_bias;
    float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0;

    // Width dispatch: narrow tail blocks skip loads/FMAs on unused column groups.
    if (width > 16)
    {
        for (int p = 0; p < np; p++, a++, b += convNR_FP16)
        {
            float16x8_t a0= vdupq_n_f16(a[0]);
            float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16);

            c0 = vfmaq_f16(c0, a0, b0);
            c1 = vfmaq_f16(c1, a0, b1);
            c2 = vfmaq_f16(c2, a0, b2);
        }
    }
    else if (width > 8)
    {
        for (int p = 0; p < np; p++, a++, b += convNR_FP16)
        {
            float16x8_t a0= vdupq_n_f16(a[0]);
            float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8);

            c0 = vfmaq_f16(c0, a0, b0);
            c1 = vfmaq_f16(c1, a0, b1);
        }
    }
    else
    {
        for (int p = 0; p < np; p++, a++, b += convNR_FP16)
        {
            float16x8_t a0= vdupq_n_f16(a[0]);
            float16x8_t b0 = vld1q_f16(b);
            c0 = vfmaq_f16(c0, a0, b0);
        }
    }

    // convert FP 16 to FP 32.
    // Widen the 24 half-precision lanes into six float32x4_t before output.
    float32x4_t c00 = vcvt_f32_f16(vget_low_f16(c0));
    float32x4_t c01 = vcvt_f32_f16(vget_high_f16(c0));
    float32x4_t c10 = vcvt_f32_f16(vget_low_f16(c1));
    float32x4_t c11 = vcvt_f32_f16(vget_high_f16(c1));
    float32x4_t c20 = vcvt_f32_f16(vget_low_f16(c2));
    float32x4_t c21 = vcvt_f32_f16(vget_high_f16(c2));

    if (init_c)
    {
        // Accumulate on top of the previous FP32 contents of c (all 24 lanes).
        c00 += vld1q_f32(c);
        c01 += vld1q_f32(c + 4);
        c10 += vld1q_f32(c + 8);
        c11 += vld1q_f32(c + 12);
        c20 += vld1q_f32(c + 16);
        c21 += vld1q_f32(c + 20);
    }

    if (ifMinMaxAct)
    {
        // Fused clamp activation in FP32: c = min(max(c, minval), maxval).
        float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval);
        c00 = vminq_f32(vmaxq_f32(c00, v_minval), v_maxval);
        c01 = vminq_f32(vmaxq_f32(c01, v_minval), v_maxval);
        c10 = vminq_f32(vmaxq_f32(c10, v_minval), v_maxval);
        c11 = vminq_f32(vmaxq_f32(c11, v_minval), v_maxval);
        c20 = vminq_f32(vmaxq_f32(c20, v_minval), v_maxval);
        c21 = vminq_f32(vmaxq_f32(c21, v_minval), v_maxval);
    }

    // Unconditionally store all 24 FP32 lanes; lanes beyond `width` hold
    // bias-seeded (or clamped) filler.
    vst1q_f32(c, c00);
    vst1q_f32(c + 4, c01);
    vst1q_f32(c + 8, c10);
    vst1q_f32(c + 12, c11);
    vst1q_f32(c + 16, c20);
    vst1q_f32(c + 20, c21);
}
#endif
#endif
}
}} // namespace cv::dnn

File diff suppressed because it is too large Load Diff

@ -10,14 +10,27 @@
#ifndef CONV_PRAM #ifndef CONV_PRAM
#define CONV_PRAM #define CONV_PRAM
#if CV_NEON && CV_NEON_AARCH64 // 32 registers. #if CV_NEON && CV_NEON_AARCH64 // 32 registers.
#define CONV_MR 4 #define CONV_MR_FP32 4
#define CONV_NR 28 #define CONV_NR_FP32 28
// The FP16 can only be supported by ARM64 and with FP16 FMA supported.
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA.
#define CONV_ARM_FP16 1
#endif
#ifdef CONV_ARM_FP16
// Currently, only ARM 64 support FP16.
#define CONV_MR_FP16 8
#define CONV_NR_FP16 24
typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h.
#endif
#elif CV_NEON // 16 registers. #elif CV_NEON // 16 registers.
#define CONV_MR 4 #define CONV_MR_FP32 4
#define CONV_NR 12 #define CONV_NR_FP32 12
#else // SIMD 128, AVX or AVX2 #else // SIMD 128, AVX or AVX2
#define CONV_MR 4 #define CONV_MR_FP32 4
#define CONV_NR 24 #define CONV_NR_FP32 24
#endif #endif
// Winograd Params // Winograd Params
@ -41,6 +54,10 @@ enum {
#endif #endif
CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
// FP 16
CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2,
CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16,
}; };
// NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN. // NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN.
@ -64,8 +81,17 @@ struct FastConv
std::vector<float> weightsWinoBuf; // For Winograd F(6x6, 3x3). std::vector<float> weightsWinoBuf; // For Winograd F(6x6, 3x3).
float* weightsWinoBufPtr; float* weightsWinoBufPtr;
std::vector<float> biasBuf; std::vector<float> biasBuf;
#if CV_NEON && CV_NEON_AARCH64 && CV_FP16
std::vector<float16_t> weightsBuf_FP16;
float16_t* weightsBufPtr_FP16;
std::vector<float16_t> weightsWinoBuf_FP16;
float16_t* weightsWinoBufPtr_FP16;
#endif
int conv_type; int conv_type;
int conv_dim; // Flag for conv1d, conv2d, or conv3d. int conv_dim; // Flag for conv1d, conv2d, or conv3d.
bool useFP16 = false; // Only ARMv8 is supported.
#if CV_SIMD128 #if CV_SIMD128
bool useSIMD128 = true; bool useSIMD128 = true;
#else #else
@ -95,6 +121,7 @@ Ptr<FastConv> initFastConv(
const std::vector<size_t>& pads_begin, const std::vector<size_t>& pads_begin,
const std::vector<size_t>& pads_end, const std::vector<size_t>& pads_end,
int conv_dim, int conv_dim,
const bool useFP16,
bool useWinograd); bool useWinograd);
// It contains different computing branches, like winograd, 1x1 conv. // It contains different computing branches, like winograd, 1x1 conv.

@ -215,7 +215,7 @@ public:
if (backendId == DNN_BACKEND_OPENCV) if (backendId == DNN_BACKEND_OPENCV)
{ {
if (kernel_size.size() == 3) if (kernel_size.size() == 3)
return preferableTarget == DNN_TARGET_CPU; return IS_DNN_CPU_TARGET(preferableTarget);
if (kernel_size.size() <= 2) if (kernel_size.size() <= 2)
return true; return true;
else else

@ -98,6 +98,7 @@ void Net::Impl::validateBackendAndTarget()
CV_Assert(preferableBackend != DNN_BACKEND_OPENCV || CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_CPU_FP16 ||
preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16); preferableTarget == DNN_TARGET_OPENCL_FP16);
CV_Assert(preferableBackend != DNN_BACKEND_HALIDE || CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
@ -972,7 +973,8 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
} }
else if (outputBlobs.isMatVector()) else if (outputBlobs.isMatVector())
{ {
if (preferableTarget != DNN_TARGET_CPU) // The DNN_TARGET_CPU and DNN_TARGET_CPU_FP16 both use the CPU memory, do not need the copyToHost.
if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16)
{ {
for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i) for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
{ {
@ -1336,7 +1338,7 @@ Mat Net::Impl::getBlob(const LayerPin& pin) const
"the #%d was requested", "the #%d was requested",
ld.name.c_str(), ld.outputBlobs.size(), pin.oid)); ld.name.c_str(), ld.outputBlobs.size(), pin.oid));
} }
if (preferableTarget != DNN_TARGET_CPU) if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16)
{ {
CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty()); CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
// Transfer data to CPU if it's require. // Transfer data to CPU if it's require.
@ -1552,7 +1554,7 @@ string Net::Impl::dump(bool forceAllocation) const
prevNode = itBackend->second; prevNode = itBackend->second;
} }
} }
std::vector<string> colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371"}; std::vector<string> colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371", "#ffcab3"};
string backend; string backend;
switch (prefBackend) switch (prefBackend)
{ {
@ -1755,6 +1757,10 @@ string Net::Impl::dump(bool forceAllocation) const
out << "NPU"; out << "NPU";
colorId = 9; colorId = 9;
break; break;
case DNN_TARGET_CPU_FP16:
out << "CPU_FP16";
colorId = 10;
break;
// don't use default: // don't use default:
} }
CV_Assert(colorId < colors.size()); CV_Assert(colorId < colors.size());

@ -17,7 +17,8 @@ CV__DNN_INLINE_NS_BEGIN
Ptr<BackendWrapper> Net::Impl::wrap(Mat& host) Ptr<BackendWrapper> Net::Impl::wrap(Mat& host)
{ {
if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU) if (preferableBackend == DNN_BACKEND_OPENCV &&
(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16))
return Ptr<BackendWrapper>(); return Ptr<BackendWrapper>();
MatShape shape(host.dims); MatShape shape(host.dims);
@ -104,7 +105,7 @@ void Net::Impl::initBackend(const std::vector<LayerPin>& blobsToKeep_)
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
if (preferableBackend == DNN_BACKEND_OPENCV) if (preferableBackend == DNN_BACKEND_OPENCV)
{ {
CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget)); CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16 || IS_DNN_OPENCL_TARGET(preferableTarget));
} }
else if (preferableBackend == DNN_BACKEND_HALIDE) else if (preferableBackend == DNN_BACKEND_HALIDE)
{ {
@ -232,6 +233,15 @@ void Net::Impl::setPreferableTarget(int targetId)
preferableTarget = DNN_TARGET_OPENCL; preferableTarget = DNN_TARGET_OPENCL;
#endif #endif
} }
#if !defined(__arm64__) || !__arm64__
if (targetId == DNN_TARGET_CPU_FP16)
{
CV_LOG_WARNING(NULL, "DNN: fall back to DNN_TARGET_CPU. Only ARM v8 CPU is supported by DNN_TARGET_CPU_FP16.");
targetId = DNN_TARGET_CPU;
}
#endif
clear(); clear();
} }
} }

@ -61,6 +61,11 @@ private:
} }
#endif #endif
bool haveBackendCPU_FP16 = false;
#if defined(__arm64__) && __arm64__
haveBackendCPU_FP16 = true;
#endif
if (haveBackendOpenVINO && openvino::checkTarget(DNN_TARGET_CPU)) if (haveBackendOpenVINO && openvino::checkTarget(DNN_TARGET_CPU))
{ {
backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU)); backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU));
@ -104,6 +109,9 @@ private:
backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)); backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
if (haveBackendCPU_FP16)
backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU_FP16));
#ifdef HAVE_VULKAN #ifdef HAVE_VULKAN
if (haveVulkan()) if (haveVulkan())
backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)); backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));

@ -175,6 +175,8 @@ TEST_P(DNNTestNetwork, ENet)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution", processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" : target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" :
"dnn/halide_scheduler_enet.yml", "dnn/halide_scheduler_enet.yml",
@ -189,7 +191,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);
Mat sample = imread(findDataFile("dnn/street.png")); Mat sample = imread(findDataFile("dnn/street.png"));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1.5e-2 : 0.0; float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0;
float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0; float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0;
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN; float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN;
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
@ -225,7 +227,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
Mat sample = imread(findDataFile("dnn/street.png")); Mat sample = imread(findDataFile("dnn/street.png"));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 560), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 560), Scalar(127.5, 127.5, 127.5), false);
float scoreDiff = 0.0, iouDiff = 0.0; float scoreDiff = 0.0, iouDiff = 0.0;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.029; scoreDiff = 0.029;
iouDiff = 0.09; iouDiff = 0.09;
@ -242,7 +244,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
{ {
applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB); applyTestTag((target == DNN_TARGET_CPU || target == DNN_TARGET_CPU_FP16) ? "" : CV_TEST_TAG_MEMORY_512MB);
if (backend == DNN_BACKEND_HALIDE) if (backend == DNN_BACKEND_HALIDE)
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);
@ -250,7 +252,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.216 : 0.2; float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.216 : 0.2;
float scoreDiff = 0.0, iouDiff = 0.0; float scoreDiff = 0.0, iouDiff = 0.0;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.095; scoreDiff = 0.095;
iouDiff = 0.09; iouDiff = 0.09;
@ -282,7 +284,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height)
Mat sample = imread(findDataFile("dnn/street.png")); Mat sample = imread(findDataFile("dnn/street.png"));
Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false); Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false);
float scoreDiff = 0.0, iouDiff = 0.0; float scoreDiff = 0.0, iouDiff = 0.0;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.013; scoreDiff = 0.013;
iouDiff = 0.06; iouDiff = 0.06;
@ -306,7 +308,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
Mat sample = imread(findDataFile("dnn/street.png")); Mat sample = imread(findDataFile("dnn/street.png"));
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
float scoreDiff = 2e-5, iouDiff = 0.0; float scoreDiff = 2e-5, iouDiff = 0.0;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.013; scoreDiff = 0.013;
iouDiff = 0.062; iouDiff = 0.062;
@ -332,7 +334,7 @@ TEST_P(DNNTestNetwork, SSD_VGG16)
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
float scoreDiff = 0.0, iouDiff = 0.0; float scoreDiff = 0.0, iouDiff = 0.0;
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.04; scoreDiff = 0.04;
} }
@ -387,7 +389,7 @@ TEST_P(DNNTestNetwork, OpenPose_pose_mpi)
// output range: [-0.001, 0.97] // output range: [-0.001, 0.97]
const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.02 : 0.0; const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.02 : 0.0;
const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.2 : 0.0; const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.2 : 0.0;
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt", processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt",
Size(46, 46), "", "", l1, lInf); Size(46, 46), "", "", l1, lInf);
expectNoFallbacksFromIE(net); expectNoFallbacksFromIE(net);
@ -461,7 +463,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
Mat sample = imread(findDataFile("dnn/street.png")); Mat sample = imread(findDataFile("dnn/street.png"));
Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
float scoreDiff = 0.0, iouDiff = 0.0; float scoreDiff = 0.0, iouDiff = 0.0;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.02; scoreDiff = 0.02;
iouDiff = 0.1; iouDiff = 0.1;
@ -483,7 +485,7 @@ TEST_P(DNNTestNetwork, DenseNet_121)
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);
// Reference output values are in range [-3.807, 4.605] // Reference output values are in range [-3.807, 4.605]
float l1 = 0.0, lInf = 0.0; float l1 = 0.0, lInf = 0.0;
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 2e-2; l1 = 2e-2;
lInf = 9e-2; lInf = 9e-2;
@ -538,6 +540,11 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
l1 = 0.3; l1 = 0.3;
lInf = 7.6; lInf = 7.6;
} }
else if (target == DNN_TARGET_CPU_FP16)
{
l1 = 0.4;
lInf = 19.;
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000) #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)

@ -153,7 +153,7 @@ TEST_P(Test_Caffe_nets, Axpy)
} }
} }
float l1 = 1e-5, lInf = 1e-4; float l1 = 1e-5, lInf = 1e-4;
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 2e-4; l1 = 2e-4;
lInf = 1e-3; lInf = 1e-3;
@ -180,7 +180,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
#else #else
applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
#endif #endif
ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU); ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16);
bool readFromMemory = get<0>(GetParam()); bool readFromMemory = get<0>(GetParam());
Net net; Net net;
@ -214,7 +214,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
ASSERT_EQ(inLayerShapes[0][3], 227); ASSERT_EQ(inLayerShapes[0][3], 227);
const float l1 = 1e-5; const float l1 = 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-3 : 1e-4; const float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 4e-3 : 1e-4;
net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(targetId); net.setPreferableTarget(targetId);
@ -308,7 +308,7 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
ASSERT_EQ(out.size[2], 100); ASSERT_EQ(out.size[2], 100);
float scores_diff = 1e-5, boxes_iou_diff = 1e-4; float scores_diff = 1e-5, boxes_iou_diff = 1e-4;
if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD || targetId == DNN_TARGET_CPU_FP16)
{ {
scores_diff = 1.5e-2; scores_diff = 1.5e-2;
boxes_iou_diff = 6.3e-2; boxes_iou_diff = 6.3e-2;
@ -375,7 +375,7 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
{ {
Target targetId = GetParam(); Target targetId = GetParam();
applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU); ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16);
Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"), Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"),
findDataFile("dnn/ResNet-50-model.caffemodel", false)); findDataFile("dnn/ResNet-50-model.caffemodel", false));
@ -383,8 +383,8 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(targetId); net.setPreferableTarget(targetId);
float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5; float l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 3e-5 : 1e-5;
float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4; float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 6e-3 : 1e-4;
Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false); Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false);
ASSERT_TRUE(!input.empty()); ASSERT_TRUE(!input.empty());
@ -415,6 +415,8 @@ TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)
int targetId = GetParam(); int targetId = GetParam();
if(targetId == DNN_TARGET_OPENCL_FP16) if(targetId == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if(targetId == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"), Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"),
findDataFile("dnn/squeezenet_v1.1.caffemodel", false)); findDataFile("dnn/squeezenet_v1.1.caffemodel", false));
net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableBackend(DNN_BACKEND_OPENCV);
@ -509,7 +511,7 @@ TEST_P(Test_Caffe_nets, Colorization)
// Reference output values are in range [-29.1, 69.5] // Reference output values are in range [-29.1, 69.5]
double l1 = 4e-4, lInf = 3e-3; double l1 = 4e-4, lInf = 3e-3;
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 0.25; l1 = 0.25;
lInf = 5.3; lInf = 5.3;
@ -566,7 +568,7 @@ TEST_P(Test_Caffe_nets, DenseNet_121)
{ {
l1 = 0.11; lInf = 0.5; l1 = 0.11; lInf = 0.5;
} }
else if (target == DNN_TARGET_CUDA_FP16) else if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 0.04; lInf = 0.2; l1 = 0.04; lInf = 0.2;
} }
@ -635,6 +637,8 @@ TEST_P(opencv_face_detector, Accuracy)
if (targetId == DNN_TARGET_OPENCL_FP16) if (targetId == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (targetId == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
Net net = readNetFromCaffe(proto, model); Net net = readNetFromCaffe(proto, model);
Mat img = imread(findDataFile("gpu/lbpcascade/er.png")); Mat img = imread(findDataFile("gpu/lbpcascade/er.png"));
@ -665,6 +669,8 @@ TEST_P(opencv_face_detector, issue_15106)
if (targetId == DNN_TARGET_OPENCL_FP16) if (targetId == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (targetId == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
Net net = readNetFromCaffe(proto, model); Net net = readNetFromCaffe(proto, model);
Mat img = imread(findDataFile("cv/shared/lena.png")); Mat img = imread(findDataFile("cv/shared/lena.png"));
@ -768,6 +774,8 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
if (target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_CUDA_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395, static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762, 0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176); 0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
@ -783,7 +791,7 @@ TEST_P(Test_Caffe_nets, RFCN)
); );
float scoreDiff = default_l1, iouDiff = default_lInf; float scoreDiff = default_l1, iouDiff = default_lInf;
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16))
{ {
scoreDiff = 4e-3; scoreDiff = 4e-3;
iouDiff = 8e-2; iouDiff = 8e-2;

@ -21,6 +21,7 @@
#define CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND "dnn_skip_opencv_backend" #define CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND "dnn_skip_opencv_backend"
#define CV_TEST_TAG_DNN_SKIP_HALIDE "dnn_skip_halide" #define CV_TEST_TAG_DNN_SKIP_HALIDE "dnn_skip_halide"
#define CV_TEST_TAG_DNN_SKIP_CPU "dnn_skip_cpu" #define CV_TEST_TAG_DNN_SKIP_CPU "dnn_skip_cpu"
#define CV_TEST_TAG_DNN_SKIP_CPU_FP16 "dnn_skip_cpu_fp16"
#define CV_TEST_TAG_DNN_SKIP_OPENCL "dnn_skip_ocl" #define CV_TEST_TAG_DNN_SKIP_OPENCL "dnn_skip_ocl"
#define CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 "dnn_skip_ocl_fp16" #define CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 "dnn_skip_ocl_fp16"
#define CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER "dnn_skip_ie_nn_builder" #define CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER "dnn_skip_ie_nn_builder"
@ -164,7 +165,7 @@ public:
static void getDefaultThresholds(int backend, int target, double* l1, double* lInf) static void getDefaultThresholds(int backend, int target, double* l1, double* lInf)
{ {
if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
{ {
*l1 = 4e-3; *l1 = 4e-3;
*lInf = 2e-2; *lInf = 2e-2;

@ -49,6 +49,7 @@ void PrintTo(const cv::dnn::Target& v, std::ostream* os)
case DNN_TARGET_CUDA: *os << "CUDA"; return; case DNN_TARGET_CUDA: *os << "CUDA"; return;
case DNN_TARGET_CUDA_FP16: *os << "CUDA_FP16"; return; case DNN_TARGET_CUDA_FP16: *os << "CUDA_FP16"; return;
case DNN_TARGET_NPU: *os << "NPU"; return; case DNN_TARGET_NPU: *os << "NPU"; return;
case DNN_TARGET_CPU_FP16: *os << "CPU_FP16"; return;
} // don't use "default:" to emit compiler warnings } // don't use "default:" to emit compiler warnings
*os << "DNN_TARGET_UNKNOWN(" << (int)v << ")"; *os << "DNN_TARGET_UNKNOWN(" << (int)v << ")";
} }
@ -439,7 +440,7 @@ void initDNNTests()
registerGlobalSkipTag( registerGlobalSkipTag(
CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND, CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND,
CV_TEST_TAG_DNN_SKIP_CPU, CV_TEST_TAG_DNN_SKIP_CPU, CV_TEST_TAG_DNN_SKIP_CPU_FP16,
CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCL_FP16
); );
#if defined(HAVE_HALIDE) #if defined(HAVE_HALIDE)

@ -360,9 +360,9 @@ TEST_P(Test_Darknet_nets, YoloVoc)
1, 6, 0.667770f, 0.446555f, 0.453578f, 0.499986f, 0.519167f, // a car 1, 6, 0.667770f, 0.446555f, 0.453578f, 0.499986f, 0.519167f, // a car
1, 6, 0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f); // a car 1, 6, 0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f); // a car
double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4; double nmsThreshold = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.397 : 0.4;
double scoreDiff = 8e-5, iouDiff = 3e-4; double scoreDiff = 8e-5, iouDiff = 3e-4;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 1e-2; scoreDiff = 1e-2;
iouDiff = 0.018; iouDiff = 0.018;
@ -451,7 +451,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc)
1, 6, 0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); // a car 1, 6, 0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); // a car
double scoreDiff = 8e-5, iouDiff = 3e-4; double scoreDiff = 8e-5, iouDiff = 3e-4;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 8e-3; scoreDiff = 8e-3;
iouDiff = 0.018; iouDiff = 0.018;
@ -636,7 +636,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
double scoreDiff = 8e-5, iouDiff = 3e-4; double scoreDiff = 8e-5, iouDiff = 3e-4;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000) #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
@ -725,8 +725,8 @@ TEST_P(Test_Darknet_nets, YOLOv4)
}; };
Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.006 : 8e-5; double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.006 : 8e-5;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.042 : 3e-4; double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.042 : 3e-4;
if (target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_CUDA_FP16)
{ {
scoreDiff = 0.008; scoreDiff = 0.008;
@ -847,7 +847,7 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
double scoreDiff = 0.012f; double scoreDiff = 0.012f;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.15 : 0.01f; double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.15 : 0.01f;
if (target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_CUDA_FP16)
iouDiff = 0.02; iouDiff = 0.02;
@ -930,7 +930,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
double scoreDiff = 8e-5; double scoreDiff = 8e-5;
double iouDiff = 3e-4; double iouDiff = 3e-4;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.006; scoreDiff = 0.006;
iouDiff = 0.042; iouDiff = 0.042;
@ -1093,6 +1093,8 @@ TEST_P(Test_Darknet_layers, connected)
{ {
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
testDarknetLayer("connected", true); testDarknetLayer("connected", true);
} }

@ -58,6 +58,8 @@ TEST_P(Reproducibility_GoogLeNet, Batching)
const int targetId = GetParam(); const int targetId = GetParam();
if (targetId == DNN_TARGET_OPENCL_FP16) if (targetId == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (targetId == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"), Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
findDataFile("dnn/bvlc_googlenet.caffemodel", false)); findDataFile("dnn/bvlc_googlenet.caffemodel", false));
net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableBackend(DNN_BACKEND_OPENCV);
@ -89,6 +91,8 @@ TEST_P(Reproducibility_GoogLeNet, IntermediateBlobs)
const int targetId = GetParam(); const int targetId = GetParam();
if (targetId == DNN_TARGET_OPENCL_FP16) if (targetId == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (targetId == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"), Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
findDataFile("dnn/bvlc_googlenet.caffemodel", false)); findDataFile("dnn/bvlc_googlenet.caffemodel", false));
net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableBackend(DNN_BACKEND_OPENCV);
@ -120,6 +124,8 @@ TEST_P(Reproducibility_GoogLeNet, SeveralCalls)
const int targetId = GetParam(); const int targetId = GetParam();
if (targetId == DNN_TARGET_OPENCL_FP16) if (targetId == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (targetId == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"), Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
findDataFile("dnn/bvlc_googlenet.caffemodel", false)); findDataFile("dnn/bvlc_googlenet.caffemodel", false));
net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableBackend(DNN_BACKEND_OPENCV);

@ -212,6 +212,8 @@ TEST_P(Test_Caffe_layers, InnerProduct)
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
testLayerUsingCaffeModels("layer_inner_product", true); testLayerUsingCaffeModels("layer_inner_product", true);
} }
@ -378,7 +380,7 @@ TEST_P(Test_Caffe_layers, Eltwise)
TEST_P(Test_Caffe_layers, PReLU) TEST_P(Test_Caffe_layers, PReLU)
{ {
double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.021 : 0.0; double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.021 : 0.0;
testLayerUsingCaffeModels("layer_prelu", true, true, 0.0, lInf); testLayerUsingCaffeModels("layer_prelu", true, true, 0.0, lInf);
} }
@ -2459,7 +2461,7 @@ TEST_P(ConvolutionActivationFusion, Accuracy)
std::vector<int> expectedFusedLayers; std::vector<int> expectedFusedLayers;
if (backendId == DNN_BACKEND_OPENCV) if (backendId == DNN_BACKEND_OPENCV)
{ {
if (targetId == DNN_TARGET_CPU) if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
expectedFusedLayers.push_back(activId); // all activations are fused expectedFusedLayers.push_back(activId); // all activations are fused
else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
{ {
@ -2594,7 +2596,7 @@ TEST_P(ConvolutionEltwiseActivationFusion, Accuracy)
std::vector<int> expectedFusedLayers; std::vector<int> expectedFusedLayers;
if (backendId == DNN_BACKEND_OPENCV) if (backendId == DNN_BACKEND_OPENCV)
{ {
if (targetId == DNN_TARGET_CPU) if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
expectedFusedLayers.push_back(activId); // activation is fused with eltwise layer expectedFusedLayers.push_back(activId); // activation is fused with eltwise layer
else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
{ {
@ -2683,7 +2685,7 @@ TEST_P(ConvolutionActivationEltwiseFusion, Accuracy)
std::vector<int> expectedFusedLayers; std::vector<int> expectedFusedLayers;
if (backendId == DNN_BACKEND_OPENCV) if (backendId == DNN_BACKEND_OPENCV)
{ {
if (targetId == DNN_TARGET_CPU) if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
expectedFusedLayers.push_back(activId); // activation fused with convolution expectedFusedLayers.push_back(activId); // activation fused with convolution
else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
{ {

@ -332,7 +332,7 @@ TEST_P(Test_Model, DetectRegion)
double confThreshold = 0.24; double confThreshold = 0.24;
double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4; double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4;
double scoreDiff = 8e-5, iouDiff = 1e-5; double scoreDiff = 8e-5, iouDiff = 1e-5;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 1e-2; scoreDiff = 1e-2;
iouDiff = 1.6e-2; iouDiff = 1.6e-2;
@ -392,7 +392,7 @@ TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses)
double confThreshold = 0.24; double confThreshold = 0.24;
double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.15: 0.15; double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.15: 0.15;
double scoreDiff = 8e-5, iouDiff = 1e-5; double scoreDiff = 8e-5, iouDiff = 1e-5;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 1e-2; scoreDiff = 1e-2;
iouDiff = 1.6e-2; iouDiff = 1.6e-2;
@ -443,7 +443,7 @@ TEST_P(Test_Model, DetectionOutput)
double scoreDiff = default_l1, iouDiff = 1e-5; double scoreDiff = default_l1, iouDiff = 1e-5;
float confThreshold = 0.8; float confThreshold = 0.8;
double nmsThreshold = 0.0; double nmsThreshold = 0.0;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
if (backend == DNN_BACKEND_OPENCV) if (backend == DNN_BACKEND_OPENCV)
scoreDiff = 4e-3; scoreDiff = 4e-3;
@ -495,7 +495,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
Size size{300, 300}; Size size{300, 300};
double scoreDiff = 1e-5, iouDiff = 1e-5; double scoreDiff = 1e-5, iouDiff = 1e-5;
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 1.7e-2; scoreDiff = 1.7e-2;
iouDiff = 6.91e-2; iouDiff = 6.91e-2;
@ -522,6 +522,8 @@ TEST_P(Test_Model, Keypoints_pose)
{ {
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
#ifdef HAVE_INF_ENGINE #ifdef HAVE_INF_ENGINE
if (target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@ -569,7 +571,7 @@ TEST_P(Test_Model, Keypoints_face)
// Ref. Range: [-1.1784188, 1.7758257] // Ref. Range: [-1.1784188, 1.7758257]
float norm = 1e-4; float norm = 1e-4;
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
norm = 5e-3; norm = 5e-3;
if (target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_MYRIAD)
{ {
@ -605,7 +607,7 @@ TEST_P(Test_Model, Detection_normalized)
scoreDiff = 3e-4; scoreDiff = 3e-4;
iouDiff = 0.018; iouDiff = 0.018;
} }
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 5e-3; scoreDiff = 5e-3;
iouDiff = 0.09; iouDiff = 0.09;
@ -654,7 +656,7 @@ TEST_P(Test_Model, Segmentation)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION); applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif #endif
if ((backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if ((backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16))
|| (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)) || (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16))
{ {
norm = 2.0f; // l1 = 0.01 lInf = 2 norm = 2.0f; // l1 = 0.01 lInf = 2
@ -741,6 +743,8 @@ TEST_P(Test_Model, TextDetectionByDB)
{ {
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
std::string imgPath = _tf("text_det_test1.png"); std::string imgPath = _tf("text_det_test1.png");
std::string weightPathDB = _tf("onnx/models/DB_TD500_resnet50.onnx", false); std::string weightPathDB = _tf("onnx/models/DB_TD500_resnet50.onnx", false);
@ -801,7 +805,7 @@ TEST_P(Test_Model, TextDetectionByEAST)
double eps_size = 5/*pixels*/; double eps_size = 5/*pixels*/;
double eps_angle = 1; double eps_angle = 1;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
eps_center = 10; eps_center = 10;
eps_size = 25; eps_size = 25;

@ -957,7 +957,7 @@ public:
backend = get<0>(get<1>(GetParam())); backend = get<0>(get<1>(GetParam()));
target = get<1>(get<1>(GetParam())); target = get<1>(get<1>(GetParam()));
if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
default_l1 = 7e-3; default_l1 = 7e-3;
default_lInf = 2e-2; default_lInf = 2e-2;

@ -2179,7 +2179,7 @@ TEST_P(Test_ONNX_nets, TinyYolov2)
// output range: [-11; 8] // output range: [-11; 8]
double l1 = default_l1, lInf = default_lInf; double l1 = default_l1, lInf = default_lInf;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 0.02; l1 = 0.02;
lInf = 0.2; lInf = 0.2;

@ -486,6 +486,11 @@ TEST_P(Test_TensorFlow_layers, slim_batch_norm)
l1 = 0.005; l1 = 0.005;
lInf = 0.33; lInf = 0.33;
} }
else if (target == DNN_TARGET_CPU_FP16)
{
l1 = 0.041;
lInf = 0.37;
}
runTensorFlowNet("slim_batch_norm", false, l1, lInf); runTensorFlowNet("slim_batch_norm", false, l1, lInf);
} }
@ -710,6 +715,9 @@ TEST_P(Test_TensorFlow_layers, matmul)
{ {
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
runTensorFlowNet("matmul"); runTensorFlowNet("matmul");
runTensorFlowNet("nhwc_transpose_reshape_matmul"); runTensorFlowNet("nhwc_transpose_reshape_matmul");
// Reference output values are in range [-5.688, 4.484] // Reference output values are in range [-5.688, 4.484]
@ -723,6 +731,8 @@ TEST_P(Test_TensorFlow_layers, batch_matmul)
{ {
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
runTensorFlowNet("batch_matmul"); runTensorFlowNet("batch_matmul");
} }
@ -730,6 +740,8 @@ TEST_P(Test_TensorFlow_layers, square)
{ {
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
runTensorFlowNet("square"); runTensorFlowNet("square");
} }
@ -924,7 +936,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
Mat out = net.forward(); Mat out = net.forward();
double scoreDiff = default_l1, iouDiff = default_lInf; double scoreDiff = default_l1, iouDiff = default_lInf;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.01; scoreDiff = 0.01;
iouDiff = 0.1; iouDiff = 0.1;
@ -971,7 +983,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384); 0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
double scoreDiff = default_l1, iouDiff = default_lInf; double scoreDiff = default_l1, iouDiff = default_lInf;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.0097; scoreDiff = 0.0097;
iouDiff = 0.09; iouDiff = 0.09;
@ -1004,7 +1016,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy")); Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
float scoreDiff = 1.5e-5, iouDiff = 1e-3; float scoreDiff = 1.5e-5, iouDiff = 1e-3;
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3; float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.011; scoreDiff = 0.011;
iouDiff = 0.012; iouDiff = 0.012;
@ -1053,6 +1065,8 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
checkBackend(); checkBackend();
@ -1085,6 +1099,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff); normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
} }
} }
@ -1164,6 +1181,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_resnet50_coco_2018_01_28)
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff); normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
} }
} }
@ -1191,7 +1211,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
Mat out = net.forward(); Mat out = net.forward();
double scoreDiff = 1.1e-5, iouDiff = default_lInf; double scoreDiff = 1.1e-5, iouDiff = default_lInf;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 0.048; scoreDiff = 0.048;
iouDiff = 0.058; iouDiff = 0.058;
@ -1230,7 +1250,7 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494, 0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801); 0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
double scoreDiff = 3.4e-3, iouDiff = 1e-2; double scoreDiff = 3.4e-3, iouDiff = 1e-2;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
scoreDiff = 4e-3; scoreDiff = 4e-3;
iouDiff = 0.024; iouDiff = 0.024;
@ -1317,6 +1337,11 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
lInf_scores = 0.1; lInf_scores = 0.1;
l1_geometry = 0.3; lInf_geometry = 7; l1_geometry = 0.3; lInf_geometry = 7;
} }
else if (target == DNN_TARGET_CPU_FP16)
{
lInf_scores = 0.1;
l1_geometry = 0.28; lInf_geometry = 5.94;
}
else else
{ {
l1_geometry = 1e-4, lInf_geometry = 4.3e-3; l1_geometry = 1e-4, lInf_geometry = 4.3e-3;
@ -1360,6 +1385,10 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_pad_and_concat)
TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_padding_valid) TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_padding_valid)
{ {
float l1 = 0.00078, lInf = 0.012; float l1 = 0.00078, lInf = 0.012;
if (target == DNN_TARGET_CPU_FP16)
l1 = 0.00083;
runTensorFlowNet("fp16_padding_valid", false, l1, lInf); runTensorFlowNet("fp16_padding_valid", false, l1, lInf);
} }
TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_even) TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_even)
@ -1407,8 +1436,13 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_odd_valid)
TEST_P(Test_TensorFlow_layers, fp16_padding_same) TEST_P(Test_TensorFlow_layers, fp16_padding_same)
{ {
float l1 = 7e-4, lInf = 4e-3;
if (target == DNN_TARGET_CPU_FP16)
lInf = 5e-3;
// Reference output values are in range [-3.504, -0.002] // Reference output values are in range [-3.504, -0.002]
runTensorFlowNet("fp16_padding_same", false, 7e-4, 4e-3); runTensorFlowNet("fp16_padding_same", false, l1, lInf);
} }
TEST_P(Test_TensorFlow_layers, defun) TEST_P(Test_TensorFlow_layers, defun)
@ -1450,6 +1484,9 @@ TEST_P(Test_TensorFlow_layers, lstm)
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
runTensorFlowNet("lstm", true); runTensorFlowNet("lstm", true);
runTensorFlowNet("lstm", true, 0.0, 0.0, true); runTensorFlowNet("lstm", true, 0.0, 0.0, true);
} }
@ -1771,8 +1808,8 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
Mat outDetections = outs[0]; Mat outDetections = outs[0];
Mat outMasks = outs[1]; Mat outMasks = outs[1];
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.2 : 2e-5; double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.2 : 2e-5;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.018 : default_lInf; double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.018 : default_lInf;
normAssertDetections(refDetections, outDetections, "", /*threshold for zero confidence*/1e-5, scoreDiff, iouDiff); normAssertDetections(refDetections, outDetections, "", /*threshold for zero confidence*/1e-5, scoreDiff, iouDiff);
// Output size of masks is NxCxHxW where // Output size of masks is NxCxHxW where
@ -1805,7 +1842,7 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
double inter = cv::countNonZero(masks & refMasks); double inter = cv::countNonZero(masks & refMasks);
double area = cv::countNonZero(masks | refMasks); double area = cv::countNonZero(masks | refMasks);
EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.98 : 0.99); EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.98 : 0.99);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
expectNoFallbacks(net); expectNoFallbacks(net);
@ -1815,6 +1852,7 @@ TEST_P(Test_TensorFlow_nets, EfficientDet)
{ {
if (target != DNN_TARGET_CPU) if (target != DNN_TARGET_CPU)
{ {
if (target == DNN_TARGET_CPU_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);

@ -113,7 +113,7 @@ TEST_P(Test_Torch_layers, run_convolution)
{ {
// Output reference values are in range [23.4018, 72.0181] // Output reference values are in range [23.4018, 72.0181]
double l1 = default_l1, lInf = default_lInf; double l1 = default_l1, lInf = default_lInf;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 0.08; l1 = 0.08;
lInf = 0.43; lInf = 0.43;
@ -132,6 +132,8 @@ TEST_P(Test_Torch_layers, run_pool_max)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_CUDA_FP16) if (target == DNN_TARGET_CUDA_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
double l1 = 0.0, lInf = 0.0; double l1 = 0.0, lInf = 0.0;
runTorchNet("net_pool_max", "", true, false, true, l1, lInf); runTorchNet("net_pool_max", "", true, false, true, l1, lInf);
} }
@ -158,7 +160,7 @@ TEST_P(Test_Torch_layers, run_reshape_single_sample)
{ {
// Reference output values in range [14.4586, 18.4492]. // Reference output values in range [14.4586, 18.4492].
double l1 = default_l1, lInf = default_lInf; double l1 = default_l1, lInf = default_lInf;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 0.033; l1 = 0.033;
lInf = 0.05; lInf = 0.05;
@ -175,6 +177,8 @@ TEST_P(Test_Torch_layers, run_linear)
{ {
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
runTorchNet("net_linear_2d"); runTorchNet("net_linear_2d");
} }
@ -186,7 +190,7 @@ TEST_P(Test_Torch_layers, run_concat)
TEST_P(Test_Torch_layers, run_depth_concat) TEST_P(Test_Torch_layers, run_depth_concat)
{ {
double lInf = 0.0; double lInf = 0.0;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
lInf = 0.032; lInf = 0.032;
} }
@ -252,7 +256,7 @@ TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif #endif
double l1 = 0.0, lInf = 0.0; double l1 = 0.0, lInf = 0.0;
if (target == DNN_TARGET_OPENCL_FP16) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 0.046; l1 = 0.046;
lInf = 0.023; lInf = 0.023;
@ -369,7 +373,7 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
// Reference output values are in range [-0.17212, 0.263492] // Reference output values are in range [-0.17212, 0.263492]
// on Myriad problem layer: l4_Pooling - does not use pads_begin // on Myriad problem layer: l4_Pooling - does not use pads_begin
float l1 = 1e-5, lInf = 1e-3; float l1 = 1e-5, lInf = 1e-3;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
{ {
l1 = 2e-3; l1 = 2e-3;
lInf = 5e-3; lInf = 5e-3;
@ -431,6 +435,8 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
throw SkipTestException(""); throw SkipTestException("");
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
if (target == DNN_TARGET_CPU_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020010000) #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@ -562,6 +568,10 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
{ {
normAssert(out, refBlob, "", 0.6, 25); normAssert(out, refBlob, "", 0.6, 25);
} }
else if (target == DNN_TARGET_CPU_FP16)
{
normAssert(out, refBlob, "", 0.62, 25);
}
else else
normAssert(out, refBlob, "", 0.5, 1.1); normAssert(out, refBlob, "", 0.5, 1.1);
} }

Loading…
Cancel
Save