Merge pull request #23192 from zihaomu:clean_up_SIMD_code

### Purpose of this PR:
- Move all dispatch and SIMD code of the convolution layer into `*.simd.hpp` files (the dispatch pattern is sketched below).
- Support Winograd on AVX-only machines.
- Rename the folder from `fast_conv` to `cpu_kernels`. In the future, CPU optimizations for other layers, such as `GEMM` or `MatMul`, can be put there as well.
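For context, the new `*.simd.hpp` kernels are wired up through OpenCV's standard CPU-dispatch mechanism (see the `ocv_add_dispatched_file_force_all` lines in the CMake diff below). A minimal sketch of the pattern, using an illustrative wrapper name (`convBlockDispatch`); the real call sites live in `cpu_kernels/convolution.cpp`:

```cpp
// Sketch only: how a dispatched kernel from conv_block.simd.hpp is typically
// invoked. convBlockDispatch is an illustrative name, not code from this PR.
#include "conv_block.simd.hpp"
// Generated by CMake; defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE
#include "layers/cpu_kernels/conv_block.simd_declarations.hpp"

namespace cv { namespace dnn {

void convBlockDispatch(int np, const float* a, const float* b, float* c,
                       int ldc, bool init_c, int convMR, int convNR)
{
    // Selects the best available compiled variant (AVX2, AVX, ..., baseline)
    // for the current CPU at runtime.
    CV_CPU_DISPATCH(convBlock, (np, a, b, c, ldc, init_c, convMR, convNR),
                    CV_CPU_DISPATCH_MODES_ALL);
}

}} // namespace cv::dnn
```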

## Performance Test
Since this patch focuses only on code structure, performance is expected to stay the same as before.
Tested with the following command:
`./bin/opencv_perf_dnn '--gtest_filter=*conv*' --gtest_output="xml:../1-0th.xml" --perf_threads=1`

### Test on X86 platform
Minimum time (ms):
|Name of Test|4.x | patch | 4.x vs patch (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|0.98|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|0.95|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.001|0.001|0.97|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.002|0.002|1.04|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.002|0.002|0.94|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.040|0.044|0.93|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.010|0.010|1.00|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.106|0.103|1.03|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.041|0.040|1.03|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.340|0.329|1.03|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.590|0.567|1.04|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.374|1.314|1.05|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.715|3.528|1.05|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.181|1.166|1.01|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.689|2.587|1.04|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.754|4.500|1.06|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|9.612|9.112|1.05|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.000|64.676|1.07|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|20.248|18.451|1.10|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|1.395|1.392|1.00|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|1.990|1.984|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.393|1.360|1.02|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|1.813|1.744|1.04|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.190|1.191|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.286|1.284|1.00|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.295|2.279|1.01|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.322|1.331|0.99|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|3.784|3.533|1.07|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.838|1.844|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.957|1.959|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|2.596|2.573|1.01|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|4.183|4.083|1.02|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.413|2.406|1.00|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|2.538|2.546|1.00|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.972|2.980|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|3.452|3.464|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|3.082|3.105|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.043|3.919|1.03|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|5.538|5.531|1.00|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.393|3.418|0.99|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|4.325|4.234|1.02|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|6.009|5.908|1.02|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|6.557|6.376|1.03|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|10.114|9.472|1.07|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|10.373|9.879|1.05|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|12.782|11.624|1.10|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|90.931|90.552|1.00|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|6.091|5.818|1.05|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|7.083|6.643|1.07|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.054|5.059|1.00|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|5.005|4.931|1.02|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|4.951|5.065|0.98|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|11.957|11.293|1.06|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.328|5.250|1.01|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|5.544|5.292|1.05|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|6.186|5.893|1.05|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|6.153|5.834|1.05|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|8.154|8.107|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.699|12.256|1.04|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|11.355|11.217|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.062|17.814|1.07|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|6.820|6.531|1.04|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|14.502|13.483|1.08|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|6.270|6.123|1.02|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|13.173|12.451|1.06|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|8.326|7.652|1.09|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.605|16.465|1.07|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|15.675|14.771|1.06|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.420|0.423|0.99|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|6.788|6.491|1.05|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|6.456|6.168|1.05|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.263|0.261|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|7.690|7.398|1.04|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.200|0.202|0.99|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.542|10.464|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|10.876|10.728|1.01|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|7.194|6.768|1.06|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|7.099|6.731|1.05|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.147|0.162|0.91|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|18.558|17.141|1.08|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|7.641|7.219|1.06|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|22.666|20.999|1.08|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|8.523|7.921|1.08|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|8.514|8.109|1.05|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|8.300|7.878|1.05|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|13.403|13.131|1.02|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|8.920|8.357|1.07|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|28.827|27.616|1.04|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|12.895|12.670|1.02|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|14.120|13.078|1.08|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|27.541|27.582|1.00|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|32.367|31.140|1.04|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|14.934|14.910|1.00|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|18.289|18.491|0.99|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|37.857|36.845|1.03|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|37.402|36.566|1.02|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|19.031|19.164|0.99|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.019|19.135|0.99|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.077|19.400|1.03|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.883|21.302|1.03|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|51.288|49.851|1.03|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|27.349|28.359|0.96|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|24.915|25.130|0.99|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|25.488|25.899|0.98|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|27.346|27.390|1.00|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|28.033|28.301|0.99|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|50.216|49.970|1.00|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|29.670|29.513|1.01|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|50.565|49.634|1.02|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|37.900|37.814|1.00|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|41.367|39.742|1.04|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|49.128|50.350|0.98|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|79.643|80.645|0.99|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|41.439|40.895|1.01|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|46.504|46.220|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|98.086|96.842|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|102.447|97.299|1.05|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|145.047|144.996|1.00|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|206.104|195.543|1.05|


### Test on M1 (ARM) platform
|Name of Test|4.x|patch|4.x vs patch (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|0.97|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|0.94|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.002|0.002|0.92|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.003|0.003|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.003|0.003|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.031|0.031|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.009|0.009|1.00|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.066|0.066|1.01|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.102|0.102|1.00|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.328|0.328|1.00|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.693|0.747|0.93|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.268|1.266|1.00|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.530|3.581|0.99|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.186|1.188|1.00|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.682|2.683|1.00|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.490|4.501|1.00|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|8.914|8.938|1.00|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.819|69.876|1.00|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|24.058|22.420|1.07|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|2.240|2.236|1.00|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|3.132|3.136|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.920|1.919|1.00|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.343|2.346|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.234|1.116|1.11|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.109|1.121|0.99|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|3.197|3.084|1.04|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.123|1.148|0.98|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|4.836|5.061|0.96|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.535|1.463|1.05|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.756|1.584|1.11|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|1.821|1.820|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|7.049|6.672|1.06|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.967|1.922|1.02|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|1.943|1.977|0.98|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.464|2.310|1.07|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|2.860|2.904|0.98|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|2.428|2.483|0.98|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|2.955|2.983|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|4.328|4.484|0.97|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.712|2.778|0.98|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|3.205|3.331|0.96|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|4.193|4.412|0.95|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|5.026|4.565|1.10|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|14.490|14.213|1.02|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|14.886|14.003|1.06|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|15.923|15.184|1.05|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|45.136|41.696|1.08|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.995|4.631|1.08|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.402|6.261|1.02|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|4.478|3.965|1.13|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.908|3.978|0.98|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|4.176|4.206|0.99|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|21.509|21.136|1.02|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|4.426|4.082|1.08|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|4.098|4.289|0.96|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|4.646|5.105|0.91|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|4.746|4.724|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|5.614|5.779|0.97|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|21.909|20.718|1.06|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|8.256|8.290|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|25.196|23.267|1.08|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|5.721|5.172|1.11|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|20.066|18.322|1.10|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|4.448|4.542|0.98|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.193|19.013|1.01|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|6.009|5.964|1.01|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|20.169|20.009|1.01|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|22.584|23.423|0.96|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.372|0.504|0.74|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|5.426|5.456|0.99|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|4.945|5.221|0.95|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.210|0.261|0.81|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|5.720|5.997|0.95|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.149|0.161|0.93|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|7.154|7.225|0.99|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|7.184|7.223|0.99|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|5.324|5.343|1.00|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|5.114|5.238|0.98|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.111|0.121|0.92|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|25.907|26.804|0.97|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|5.695|5.654|1.01|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|27.435|27.566|1.00|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|6.944|6.164|1.13|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|7.180|6.717|1.07|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|6.817|6.050|1.13|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|9.225|8.660|1.07|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|7.496|6.625|1.13|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|35.520|36.056|0.99|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|9.990|9.702|1.03|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|10.517|10.746|0.98|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|36.702|36.731|1.00|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|41.035|38.280|1.07|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.981|10.573|1.04|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|12.863|12.384|1.04|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|50.437|54.088|0.93|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|50.650|50.635|1.00|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|14.696|14.606|1.01|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|16.201|15.426|1.05|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|16.061|14.292|1.12|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|17.743|18.250|0.97|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|77.909|78.165|1.00|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|21.579|21.879|0.99|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|20.424|19.589|1.04|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.389|19.461|1.00|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.319|20.358|1.05|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|22.609|21.826|1.04|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|25.497|25.789|0.99|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|21.966|22.108|0.99|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|35.883|33.470|1.07|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|31.041|29.314|1.06|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|29.922|28.145|1.06|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|31.624|31.148|1.02|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|38.564|39.164|0.98|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|31.502|30.269|1.04|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|34.248|34.589|0.99|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|130.211|134.120|0.97|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|127.490|132.874|0.96|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|199.834|200.081|1.00|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|247.346|247.523|1.00|


### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake


```
force_builders=Linux AVX2,Custom Win
build_image:Custom Win=msvs2019
CPU_BASELINE:Custom Win=AVX512_SKX
```
Commit e03e2e7f94 (parent c6e5f60525), authored by Zihao Mu, committed by GitHub.
Changed files (number of changed lines in parentheses):

1. modules/dnn/CMakeLists.txt (3)
2. modules/dnn/src/layers/convolution_layer.cpp (2)
3. modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp (259)
4. modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp (258)
5. modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp (591)
6. modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp (764)
7. modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp (886)
8. modules/dnn/src/layers/cpu_kernels/convolution.cpp (560)
9. modules/dnn/src/layers/cpu_kernels/convolution.hpp (40)
10. modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp (499)
11. modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp (567)
12. modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp (1153)
13. modules/dnn/src/layers/layers_common.simd.hpp (561)

@@ -10,6 +10,9 @@ set(the_description "Deep neural network module. It allows to load models from d
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)

@@ -72,7 +72,7 @@ using namespace cv::dnn::ocl4dnn;
using namespace cv::dnn::cuda4dnn;
#endif
#include "fast_convolution/fast_convolution.hpp"
#include "cpu_kernels/convolution.hpp"
namespace cv
{

@@ -0,0 +1,259 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
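// Note (added explanation): on AVX-only targets without FMA3, the fused
// multiply-add is emulated with a separate multiply and add; results can
// differ from FMA hardware by at most one extra rounding step.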
#endif
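// AVX micro-kernel for a 4x24 output tile: 12 accumulators (c00..c32) cover
// convMR=4 rows x convNR=24 columns (three 8-lane AVX registers per row).
// Each iteration over np broadcasts one element of 'a' per row and multiplies
// it with three vectors loaded from 'b'.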
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
CV_Assert(convMR == 4 && convNR == 24);
__m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
__m256 c10 = c00, c11 = c00, c12 = c00;
__m256 c20 = c00, c21 = c00, c22 = c00;
__m256 c30 = c00, c31 = c00, c32 = c00;
__m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
__m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
c00 = _mm256_fmadd_ps(b0, a0, c00);
c01 = _mm256_fmadd_ps(b1, a0, c01);
c02 = _mm256_fmadd_ps(b2, a0, c02);
c10 = _mm256_fmadd_ps(b0, a1, c10);
c11 = _mm256_fmadd_ps(b1, a1, c11);
c12 = _mm256_fmadd_ps(b2, a1, c12);
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
c20 = _mm256_fmadd_ps(b0, a0, c20);
c21 = _mm256_fmadd_ps(b1, a0, c21);
c22 = _mm256_fmadd_ps(b2, a0, c22);
c30 = _mm256_fmadd_ps(b0, a1, c30);
c31 = _mm256_fmadd_ps(b1, a1, c31);
c32 = _mm256_fmadd_ps(b2, a1, c32);
}
if (!init_c)
{
c00 = _mm256_add_ps(c00, _mm256_load_ps(c));
c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8));
c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16));
c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc));
c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8));
c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16));
c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2));
c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8));
c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16));
c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3));
c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8));
c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16));
}
_mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02);
_mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12);
_mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22);
_mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32);
_mm256_zeroupper();
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
// NEON code workaround.
namespace opt_NEON
{
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
#if CV_NEON_AARCH64
if (convMR == 4 && convNR == 28) // AARCH64
{
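// AArch64 path: 4x28 output tile, 28 accumulators (seven float32x4_t per
// row); a single load of 'a' feeds all four rows through lane-indexed FMA
// (vfmaq_laneq_f32).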
float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00;
float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10;
float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20;
float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30;
for( int p = 0; p < np; p++, a += convMR, b += convNR )
{
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
c35 = vfmaq_laneq_f32(c35, b2, a0, 3);
b0 = vld1q_f32(b + 24);
c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
}
if (!init_c)
{
c00 = vaddq_f32(c00, vld1q_f32(c));
c01 = vaddq_f32(c01, vld1q_f32(c + 4));
c02 = vaddq_f32(c02, vld1q_f32(c + 8));
c03 = vaddq_f32(c03, vld1q_f32(c + 12));
c04 = vaddq_f32(c04, vld1q_f32(c + 16));
c05 = vaddq_f32(c05, vld1q_f32(c + 20));
c06 = vaddq_f32(c06, vld1q_f32(c + 24));
c10 = vaddq_f32(c10, vld1q_f32(c + ldc));
c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4));
c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8));
c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12));
c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16));
c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20));
c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24));
c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2));
c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4));
c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8));
c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12));
c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16));
c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20));
c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24));
c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3));
c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4));
c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8));
c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12));
c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16));
c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20));
c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24));
}
vst1q_f32(c, c00); vst1q_f32(c+4, c01);
vst1q_f32(c+8, c02); vst1q_f32(c+12, c03);
vst1q_f32(c+16, c04); vst1q_f32(c+20, c05);
vst1q_f32(c+24, c06);
vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11);
vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13);
vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15);
vst1q_f32(c+ldc+24, c16);
vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21);
vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23);
vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25);
vst1q_f32(c+ldc*2+24, c26);
vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31);
vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33);
vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35);
vst1q_f32(c+ldc*3+24, c36);
}
else
#endif
if (convMR == 4 && convNR == 12) // ARMv7
{
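// 32-bit ARM path: 4x12 output tile; 'a' is loaded as two float32x2_t halves
// and broadcast per lane with vmlaq_lane_f32, since the laneq FMA intrinsics
// are AArch64-only.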
float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0;
float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3;
float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6;
float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9;
float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
c1 = vmlaq_lane_f32(c1, b1, a0, 0);
c2 = vmlaq_lane_f32(c2, b2, a0, 0);
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
c4 = vmlaq_lane_f32(c4, b1, a0, 1);
c5 = vmlaq_lane_f32(c5, b2, a0, 1);
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
c7 = vmlaq_lane_f32(c7, b1, a1, 0);
c8 = vmlaq_lane_f32(c8, b2, a1, 0);
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
c10 = vmlaq_lane_f32(c10, b1, a1, 1);
c11 = vmlaq_lane_f32(c11, b2, a1, 1);
}
if (!init_c)
{
c0 = vaddq_f32(c0, vld1q_f32(c));
c1 = vaddq_f32(c1, vld1q_f32(c + 4));
c2 = vaddq_f32(c2, vld1q_f32(c + 8));
c3 = vaddq_f32(c3, vld1q_f32(c + ldc));
c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4));
c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8));
c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2));
c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4));
c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8));
c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3));
c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4));
c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8));
}
vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2);
vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5);
vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8);
vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11);
}
else
CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock");
}
#endif
}
}} // namespace cv::dnn
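For readability, here is a minimal, hypothetical sketch of how a packed-GEMM driver could tile the output with the `convBlock` micro-kernel above; the actual driver, including partial-tile, bias and activation handling, lives in `cpu_kernels/convolution.cpp` and is not part of this excerpt:

```cpp
// Hypothetical caller sketch (not code from this PR). Assumes convBlock is
// declared as in conv_block.simd.hpp, M and N are multiples of convMR/convNR,
// and A/B are packed into convMR x np and np x convNR panels respectively.
static void gemmWithConvBlock(int M, int N, int np,
                              const float* Apacked, const float* Bpacked,
                              float* C, int ldc, int convMR, int convNR)
{
    for (int i = 0; i < M; i += convMR)          // one tile of convMR output rows
        for (int j = 0; j < N; j += convNR)      // one tile of convNR output columns
            convBlock(np,
                      Apacked + (size_t)i * np,  // convMR x np panel of A
                      Bpacked + (size_t)j * np,  // np x convNR panel of B
                      C + (size_t)i * ldc + j,
                      ldc, /*init_c=*/true, convMR, convNR);
}
```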

@@ -2,20 +2,147 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
// Here is the original license:
/*
This file is a part of ficus language project.
See ficus/LICENSE for the licensing terms
*/
#include "../../precomp.hpp"
#include "fast_convolution.hpp"
#include "../layers_common.hpp"
#include "convolution.hpp"
#include "conv_depthwise.simd.hpp"
#include "layers/cpu_kernels/conv_depthwise.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace dnn {
static void depthWiseBlockConv2D(const float* wptr,
void depthWiseBlockConv2D(const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW, bool fusedAdd);
void depthWiseBlockConv1D(const float* wptr,
int kernel_w, int stride_w, int dilation_w, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_, int width,
float* outptr_,
int out_d, int outW, bool fusedAdd);
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ_,
const std::vector<float>& reluslope, bool fusedAdd)
{
Mat input = _input.getMat();
Mat output = _output.getMat();
MatShape inputShape = shape(input);
MatShape outputShape = shape(output);
CV_Assert(inputShape.size() == 3 || inputShape.size() == 4);
CV_Assert(inputShape.size() == outputShape.size());
int conv_dim = conv->conv_dim;
CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) &&
"DNN: Currently we do not support depth-wise for Convolution 3D!");
ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr;
int N = inputShape[0], C = inputShape[1];
int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
int Wi = inputShape[inputShape.size() - 1];
int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
int W0 = outputShape[outputShape.size() - 1];
int ngroups = conv->ngroups;
const size_t inp_planesize = (size_t) Hi * Wi;
const size_t out_planesize = (size_t) H0 * W0;
CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);
int stride_h = conv->stride_h, stride_w = conv->stride_w;
int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
int pad_left = conv->pad_left, pad_right = conv->pad_right;
int ksize = Hk * Wk;
const int VEC_NLANES = 32;
int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
const float *inp = input.ptr<float>();
float *out = output.ptr<float>();
#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV
// TODO: remove the following limitation; this requires changing the code in conv_depthwise.simd.hpp.
bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd;
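// Added explanation: the optimized kernels assume the input row is wide enough
// for whole-vector loads (see the VECSZ handling in conv_depthwise.simd.hpp)
// and do not implement the fusedAdd path, hence the checks above.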
#endif
std::vector<int> ofstab_(3 * ksize, 0);
int *ofstab = ofstab_.data();
int *yxtab = ofstab + ksize;
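// For every kernel tap k: yxtab[2k] and yxtab[2k+1] hold the dilated (dy, dx)
// offsets, and ofstab[k] holds the flattened input offset dy*Wi + dx.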
for (int k = 0; k < ksize; k++)
{
int y = k < ksize ? k / Wk : 0;
int x = k < ksize ? k % Wk : 0;
int dy = y * dilation_h, dx = x * dilation_w;
yxtab[k * 2] = dy;
yxtab[k * 2 + 1] = dx;
ofstab[k] = dy * Wi + dx;
}
const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
const float* relu = reluslope.data();
CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));
parallel_for_(Range(0, N * C), [&](const Range &r0) {
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr0 = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;
const float *weights = weights0 + c * padded_ksize;
if (conv_dim == CONV_2D)
{
#if CV_TRY_AVX2
if(canRunOpt && conv->useAVX2)
opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_AVX
if(canRunOpt && conv->useAVX)
opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_RVV
if(canRunOpt && conv->useRVV)
opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd);
}
else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D.
{
depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd);
}
if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}});
}
/****************************************************************************************\
SIMD and no-SIMD code for depthWiseBlockConv
\****************************************************************************************/
void depthWiseBlockConv2D(const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
@@ -199,7 +326,7 @@ static void depthWiseBlockConv2D(const float* wptr,
}
}
static void depthWiseBlockConv1D(const float* wptr,
void depthWiseBlockConv1D(const float* wptr,
int kernel_w, int stride_w, int dilation_w, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_, int width,
@@ -332,114 +459,5 @@ static void depthWiseBlockConv1D(const float* wptr,
}
}
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ_,
const std::vector<float>& reluslope, bool fusedAdd)
{
Mat input = _input.getMat();
Mat output = _output.getMat();
MatShape inputShape = shape(input);
MatShape outputShape = shape(output);
CV_Assert(inputShape.size() == 3 || inputShape.size() == 4);
CV_Assert(inputShape.size() == outputShape.size());
int conv_dim = conv->conv_dim;
CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) &&
"DNN: Currently we do not support depth-wise for Convolution 3D!");
ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr;
int N = inputShape[0], C = inputShape[1];
int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
int Wi = inputShape[inputShape.size() - 1];
int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
int W0 = outputShape[outputShape.size() - 1];
int ngroups = conv->ngroups;
const size_t inp_planesize = (size_t) Hi * Wi;
const size_t out_planesize = (size_t) H0 * W0;
CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);
int stride_h = conv->stride_h, stride_w = conv->stride_w;
int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
int pad_left = conv->pad_left, pad_right = conv->pad_right;
int ksize = Hk * Wk;
const int VEC_NLANES = 32;
int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
const float *inp = input.ptr<float>();
float *out = output.ptr<float>();
#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV
// TODO: remove the following limitation, need change code in layers_common.simd.hpp.
bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd;
#endif
std::vector<int> ofstab_(3 * ksize, 0);
int *ofstab = ofstab_.data();
int *yxtab = ofstab + ksize;
for (int k = 0; k < ksize; k++)
{
int y = k < ksize ? k / Wk : 0;
int x = k < ksize ? k % Wk : 0;
int dy = y * dilation_h, dx = x * dilation_w;
yxtab[k * 2] = dy;
yxtab[k * 2 + 1] = dx;
ofstab[k] = dy * Wi + dx;
}
const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
const float* relu = reluslope.data();
CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));
parallel_for_(Range(0, N * C), [&](const Range &r0) {
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr0 = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;
const float *weights = weights0 + c * padded_ksize;
if (conv_dim == CONV_2D)
{
#if CV_TRY_AVX2
if(canRunOpt && conv->useAVX2)
opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_AVX
if(canRunOpt && conv->useAVX)
opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_RVV
if(canRunOpt && conv->useRVV)
opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd);
}
else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D.
{
depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd);
}
if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}});
}
}} // namespace cv::dnn

@@ -0,0 +1,591 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void fastDepthwiseConv(const float* weights,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* bias, const float* relu,
const float* inptr,
int height, int width,
float* outptr,
int out_d, int outH, int outW);
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
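// De-interleaves 16 consecutive floats into even-indexed lanes (a) and
// odd-indexed lanes (b): ptr = {1,2,...,16} -> a = {1,3,...,15},
// b = {2,4,...,16}. Used by the stride_w == 2 path below.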
static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = _mm256_loadu_ps(ptr);
__m256 t1 = _mm256_loadu_ps(ptr + 8);
__m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
__m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
a = _mm256_shuffle_ps(lo, hi, 0x88);
b = _mm256_shuffle_ps(lo, hi, 0xdd);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
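// outW1 is the last output column that can be computed without reading past
// the right edge of the input row; columns at or beyond outW1 are handled by
// the scalar edge loop at the end of each row.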
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
__m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
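// If fewer than VECSZ outputs remain, step back so the final full-width
// vector overlaps the previous iteration instead of writing past outW1.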
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
v10 = _mm256_loadu_ps(imgptr1 + in_j),
v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
v20 = _mm256_loadu_ps(imgptr2 + in_j),
v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
_mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
_mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
_mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
_mm256_zeroupper();
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV
/*
Example for load_deinterleave:
input: ptr[16] = {1,2,3, ... ,14,15,16}
output: a = {1, 3, 5, 7, 9, 11, 13, 15}
output: b = {2, 4, 6, 8,10, 12, 14, 16}
*/
static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl)
{
vuint64m4_t mask = vmv_v_x_u64m4(1,vl*2);
vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask);
vbool8_t mask0 = vmseq_vx_u32m4_b8 (mask_re, 1, vl*2);
vbool8_t mask1 = vmseq_vx_u32m4_b8 (mask_re, 0, vl*2);
vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4();
vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2);
tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2);
tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2);
/* The following intrinsics are not yet supported by the GNU toolchain,
so we temporarily use store and load instead.
// a = vlmul_trunc_v_f32m4_f32m2(tempa);
// b = vlmul_trunc_v_f32m4_f32m2(tempb);
*/
cv::AutoBuffer<float> cvBuffer(sizeof(float)*vl*2);
float* buffer = (float*)cvBuffer.data();
vse32_v_f32m4(buffer, tempa, vl);
a = vle32_v_f32m2(buffer, vl);
vse32_v_f32m4(buffer, tempb, vl);
b = vle32_v_f32m2(buffer, vl);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
int vl;
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
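// The vectorized branch below covers stride_w == 1 directly and stride_w == 2
// (with dilation_w == 1) via de-interleaving loads; other stride/dilation
// combinations fall through to the scalar loops further down.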
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
int avl = outW1 - out_j;
if( stride_w == 1 )
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl),
v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl),
v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl),
v10 = vle32_v_f32m2(imgptr1 + in_j, vl),
v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl),
v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl),
v20 = vle32_v_f32m2(imgptr2 + in_j, vl),
v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl),
v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl);
vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
vout0 = vfadd_vf_f32m2(vout0, bias, vl);
vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);
vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);
vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
if (relu)
{
vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
}
vse32_v_f32m2(outptr + out_j, vout0, vl);
}
else //stride_w == 2 && dilation_w == 1
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl);
vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl);
vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl);
vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl);
vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl);
vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl);
vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
vout0 = vfadd_vf_f32m2(vout0, bias, vl);
vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);
vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);
vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
if (relu)
{
vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
}
vse32_v_f32m2(outptr + out_j, vout0, vl);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
#endif // CV_RVV
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
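// Splits 16 consecutive floats into even-indexed lanes (a) and odd-indexed lanes (b);
// LASX counterpart of the de-interleave helpers used by the AVX and RVV stride-2 paths.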
static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = (__m256)__lasx_xvld(ptr, 0);
__m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);
__m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
__m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);
a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
__m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
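// If the last full vector would run past outW1, step out_j back so the vector
// ends exactly at outW1; the few overlapping outputs are simply recomputed.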
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
#endif // CV_LASX
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@@ -0,0 +1,764 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv_Winograd.fx).
// Here is the original license:
/*
This file is a part of ficus language project.
See ficus/LICENSE for the licensing terms
*/
#include "../../precomp.hpp"
#include "convolution.hpp"
#include "conv_winograd_f63.simd.hpp"
#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace dnn {
#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
/*Input transform*/
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
/*Output transform*/
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
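// Pipeline overview: winofunc_BtXB_8x8_f32 applies the forward Winograd transform
// to 8x8 input tiles, winofunc_accum_f32 accumulates the element-wise products of
// the transformed inputs and weights over the input channels, and
// winofunc_AtXA_8x8_f32 applies the inverse transform, adds the bias and the
// optional by-pass tensor, and applies the optional min/max activation clamp.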
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
Mat input = _input.getMat();
Mat output = _output.getMat();
Mat fusedAddMat = _fusedAddMat.getMat();
MatShape inputShape = shape(input);
MatShape outputShape = shape(output);
CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);
int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
int K = conv->K;
int H0 = outputShape[2], W0 = outputShape[3];
int pad_top = conv->pad_top;
int pad_left = conv->pad_left;
int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;
int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
const size_t inp_planesize = (size_t)Hi*Wi;
const size_t out_planesize = (size_t)H0*W0;
int blocks_per_row = (W0+CONV_WINO_STEP-1)/CONV_WINO_STEP;
int blocks_per_plane = ((H0+CONV_WINO_STEP-1)/CONV_WINO_STEP)*blocks_per_row;
int blocks_per_plane_aligned = ((blocks_per_plane +
CONV_WINO_IBLOCK-1)/CONV_WINO_IBLOCK)*CONV_WINO_IBLOCK;
size_t totalbufsize = (size_t)N*C*blocks_per_plane_aligned*CONV_WINO_AREA;
AutoBuffer<float> _buf;
_buf.allocate(totalbufsize + VEC_ALIGN);
float* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN);
float* inp = input.ptr<float>();
float* out = output.ptr<float>();
float* fusedAddPtr = fusedAddMat.empty() ? nullptr : fusedAddMat.ptr<float>();
// Phase 1. compute forward Winograd transforms for all input blocks,
// all input planes, all samples in the batch.
// [TODO]: if there are too many input channels, it may make sense to
// transform only part of the input channels at once and then compute partial
// accumulated sums (i.e. update the output buffers several times
// rather than computing them in one pass).
parallel_for_(Range(0, ntasks), [&](const Range& r0) {
for (int task_id = r0.start; task_id < r0.end; task_id++)
{
int nc0 = (N*C)*task_id/ntasks;
int nc1 = (N*C)*(task_id+1)/ntasks;
for(; nc0 < nc1; nc0++)
{
int n = nc0 / C;
int c = nc0 - n*C;
int g = c / Cg;
c -= g*Cg;
for (int block_id = 0; block_id < blocks_per_plane; block_id += CONV_WINO_IBLOCK)
{
for (int db = 0; db < CONV_WINO_IBLOCK; db++)
{
size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned +
block_id)*Cg*CONV_WINO_AREA +
(c*CONV_WINO_IBLOCK + db)*CONV_WINO_ATOM_F32;
float* inwptr = (float*)wbuf_all + inwofs;
if (block_id + db < blocks_per_plane)
{
int y0 = (block_id + db) / blocks_per_row;
int x0 = (block_id + db) - y0 * blocks_per_row;
y0 = y0*CONV_WINO_STEP - pad_top;
x0 = x0*CONV_WINO_STEP - pad_left;
bool partial = y0 < 0 || y0 + CONV_WINO_SIZE > Hi ||
x0 < 0 || x0 + CONV_WINO_SIZE > Wi;
int dx1 = 0, dx2 = CONV_WINO_SIZE, dy1 = 0, dy2 = CONV_WINO_SIZE;
int inpstep = Wi;
float inpbuf[CONV_WINO_AREA];
float* inptr0 = (float*)inp + nc0*inp_planesize + y0*Wi + x0;
float* inptr = inptr0;
if (partial)
{
memset(inpbuf, 0, sizeof(inpbuf));
dy1 = -y0 > 0 ? -y0 : 0;
dy2 = Hi - y0 < CONV_WINO_SIZE ? Hi - y0 : CONV_WINO_SIZE;
if (dy2 < dy1) {dy2 = dy1 = 0;}
dx1 = -x0 > 0 ? -x0 : 0;
dx2 = Wi - x0 < CONV_WINO_SIZE ? Wi - x0 : CONV_WINO_SIZE;
if (dx2 < dx1) {dx2 = dx1 = 0;}
inptr0 -= y0*Wi + x0;
if (dx1 < dx2 && dy1 < dy2)
{
for(int dy = dy1; dy < dy2; dy++)
memcpy(&inpbuf[dy*CONV_WINO_SIZE + dx1],
inptr0 + (y0+dy)*Wi + (x0+dx1),
(dx2-dx1)*sizeof(inpbuf[0]));
}
inptr = inpbuf;
inpstep = CONV_WINO_SIZE;
}
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
else
#endif
#if CV_TRY_AVX
if (conv->useAVX)
opt_AVX::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
else
#endif
#if CV_NEON && CV_NEON_AARCH64
if (conv->useNEON)
opt_NEON::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
else
#endif
winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
}
else
{
for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, inwptr += CONV_WINO_IBLOCK*CONV_WINO_ATOM_F32)
memset(inwptr, 0, CONV_WINO_ATOM_F32*sizeof(inwptr[0]));
}
}
}
}
}});
// Phase 2. compute elemwise-weighted sums of transformed blocks,
// apply inverse Winograd transforms to the sums,
// add bias, apply activation function if any and store the results.
parallel_for_(Range(0, ntasks), [&](const Range& r0) {
for (int task_id = r0.start; task_id < r0.end; task_id++)
{
size_t out_wbuf_size = CONV_WINO_AREA*CONV_WINO_KBLOCK*CONV_WINO_IBLOCK;
size_t outbuf_size = CONV_WINO_AREA;
AutoBuffer<float> out_wbuf_, outbuf_;
out_wbuf_.allocate(out_wbuf_size + VEC_ALIGN);
float* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN);
outbuf_.allocate(outbuf_size + VEC_ALIGN);
float* outbuf = alignPtr(outbuf_.data(), VEC_ALIGN);
memset(out_wbuf, 0, out_wbuf_size * sizeof(float));
memset(outbuf, 0, outbuf_size * sizeof(float));
int ngk0 = (int)(((int64_t)N*Kg_nblocks*ngroups)*task_id/ntasks);
int ngk1 = (int)(((int64_t)N*Kg_nblocks*ngroups)*(task_id+1)/ntasks);
for(; ngk0 < ngk1; ngk0++)
{
int n = ngk0 / (Kg_nblocks*ngroups);
int gk0 = ngk0 % (Kg_nblocks*ngroups);
int g = gk0 / Kg_nblocks;
int k0 = (gk0 % Kg_nblocks)*CONV_WINO_KBLOCK;
int k1 = k0 + CONV_WINO_KBLOCK <= Kg ? k0 + CONV_WINO_KBLOCK : Kg;
for (int block_id0 = 0; block_id0 < blocks_per_plane; block_id0 += CONV_WINO_IBLOCK)
{
int block_id1 = block_id0 + CONV_WINO_IBLOCK;
block_id1 = block_id1 < blocks_per_plane ? block_id1 : blocks_per_plane;
size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + block_id0)*Cg*CONV_WINO_AREA;
size_t wofs = (g*Kg_nblocks*CONV_WINO_KBLOCK + k0)*Cg*CONV_WINO_AREA;
float* inwptr = wbuf_all + inwofs;
const float* wptr = conv->weightsWinoBufPtr + wofs;
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
else
#endif
#if CV_TRY_AVX
if (conv->useAVX)
opt_AVX::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
else
#endif
#if CV_NEON && CV_NEON_AARCH64
if (conv->useNEON)
opt_NEON::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
else
#endif
winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
for (int k = k0; k < k1; k++)
{
float biasv = conv->biasBuf[g*Kg + k];
for (int block_id = block_id0; block_id < block_id1; block_id++)
{
int y0 = block_id / blocks_per_row;
int x0 = block_id - y0 * blocks_per_row;
y0 = y0*CONV_WINO_STEP;
x0 = x0*CONV_WINO_STEP;
int dy1 = H0 - y0;
if (dy1 > CONV_WINO_STEP) dy1 = CONV_WINO_STEP;
int dx1 = W0 - x0;
if (dx1 > CONV_WINO_STEP) dx1 = CONV_WINO_STEP;
assert(dx1 > 0 && dy1 > 0);
bool partial = activ || dy1 < CONV_WINO_STEP || dx1 < CONV_WINO_STEP;
size_t outofs = (n*K + g*Kg + k)*out_planesize + y0*W0 + x0;
int outstep = W0;
float* outptr0 = (float*)out + outofs;
float* pbptr0 = fusedAddPtr ? fusedAddPtr + outofs : nullptr;
float *outptr = outptr0, *bpptr = pbptr0;
if (partial)
{
outptr = outbuf;
outstep = CONV_WINO_SIZE;
if (pbptr0)
{
bpptr = outbuf;
for (int y = 0; y < dy1; y++)
memcpy(outbuf + y*CONV_WINO_SIZE, pbptr0 + y*W0,
dx1*sizeof(pbptr0[0]));
}
}
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
else
#endif
#if CV_TRY_AVX
if (conv->useAVX)
opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
else
#endif
#if CV_NEON && CV_NEON_AARCH64
if (conv->useNEON)
// The NEON optimization is only for ARMv8 (AArch64) devices; on ARMv7 we use the universal intrinsics instead.
opt_NEON::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
else
#endif
winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
if (partial)
{
if (activ)
activ->forwardSlice(outptr, outptr, CONV_WINO_SIZE*CONV_WINO_STEP, 0, g*Kg + k, g*Kg + k + 1);
for (int y = 0; y < dy1; y++)
memcpy(outptr0 + y*W0, outptr + y*CONV_WINO_SIZE,dx1*sizeof(outptr0[0]));
}
}
}
}
}
}});
return 1;
}
/****************************************************************************************\
SIMD for winograd function
\****************************************************************************************/
#if CV_SIMD128
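// The universal-intrinsics variant below assumes 4-float atoms (winoAtomF32 == 4)
// and 3 input blocks per iteration (winoIblock == 3), as checked by the CV_Assert;
// the AVX/AVX2 kernels in the dispatched SIMD file use 8-float atoms instead.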
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
#if 1
CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4);
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00;
v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00;
v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00;
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32)
{
v_float32x4 x0, x1, x2;
x0 = v_load(inwptr);
x1 = v_load(inwptr + 4);
x2 = v_load(inwptr + 8);
v_float32x4 w0 = v_load(wptr);
s00 = v_fma(w0, x0, s00);
s01 = v_fma(w0, x1, s01);
s02 = v_fma(w0, x2, s02);
w0 = v_load(wptr + 4);
s10 = v_fma(w0, x0, s10);
s11 = v_fma(w0, x1, s11);
s12 = v_fma(w0, x2, s12);
w0 = v_load(wptr + 8);
s20 = v_fma(w0, x0, s20);
s21 = v_fma(w0, x1, s21);
s22 = v_fma(w0, x2, s22);
w0 = v_load(wptr + 12);
s30 = v_fma(w0, x0, s30);
s31 = v_fma(w0, x1, s31);
s32 = v_fma(w0, x2, s32);
}
v_store(outbuf, s00);
v_store(outbuf + 1*64, s01);
v_store(outbuf + 2*64, s02);
v_store(outbuf + 3*64, s10);
v_store(outbuf + 4*64, s11);
v_store(outbuf + 5*64, s12);
v_store(outbuf + 6*64, s20);
v_store(outbuf + 7*64, s21);
v_store(outbuf + 8*64, s22);
v_store(outbuf + 9*64, s30);
v_store(outbuf + 10*64, s31);
v_store(outbuf + 11*64, s32);
}
#else
// Naive C++ reference code; this branch is kept for reference only and should never be executed.
for (int atom_id = 0; atom_id < winoNatomF32;
atom_id++, outbuf += winoAtomF32)
{
float sumbuf[winoIblock*winoKblock*winoAtomF32];
memset(sumbuf, 0, sizeof(sumbuf));
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32)
{
for (int i = 0; i < winoKblock; i++)
{
for (int j = 0; j < winoIblock; j++)
{
int i_ = i*winoAtomF32;
int j_ = j*winoAtomF32;
int ij_ = i_*winoIblock + j_;
float s0 = inwptr[j_ + 0]*wptr[i_ + 0];
float s1 = inwptr[j_ + 1]*wptr[i_ + 1];
float s2 = inwptr[j_ + 2]*wptr[i_ + 2];
float s3 = inwptr[j_ + 3]*wptr[i_ + 3];
sumbuf[ij_ + 0] += s0;
sumbuf[ij_ + 1] += s1;
sumbuf[ij_ + 2] += s2;
sumbuf[ij_ + 3] += s3;
}
}
}
for (int ij = 0; ij < winoKblock*winoIblock; ij++)
{
int ij_ = ij*winoAtomF32;
int ij_out = ij*CONV_WINO_AREA;
outbuf[ij_out + 0] = sumbuf[ij_ + 0];
outbuf[ij_out + 1] = sumbuf[ij_ + 1];
outbuf[ij_out + 2] = sumbuf[ij_ + 2];
outbuf[ij_out + 3] = sumbuf[ij_ + 3];
}
}
#endif
}
/*Input transform*/
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
{
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
t00 = x40 - x20;
t01 = x41 - x21;
t10 = x30 - x50;
t11 = x31 - x51;
v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60);
v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61);
v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10);
v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11);
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
v_float32x4 qm4_25 = v_setall_f32(-4.25f);
t00 = v_fma(x30, qm4_25, x10 + x50);
t01 = v_fma(x31, qm4_25, x11 + x51);
t10 = v_fma(x40, qm4_25, x20 + x60);
t11 = v_fma(x41, qm4_25, x21 + x61);
v_float32x4 y10 = t00 + t10, y11 = t01 + t11;
v_float32x4 y20 = t10 - t00, y21 = t11 - t01;
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
t00 = v_fma(x10, q0_5, x50 + x50);
t01 = v_fma(x11, q0_5, x51 + x51);
t10 = v_fma(x20, q0_25, x60);
t11 = v_fma(x21, q0_25, x61);
t00 = v_fma(x30, qm2_5, t00);
t01 = v_fma(x31, qm2_5, t01);
t10 = v_fma(x40, qm1_25, t10);
t11 = v_fma(x41, qm1_25, t11);
v_float32x4 y30 = t00 + t10, y31 = t01 + t11;
v_float32x4 y40 = t10 - t00, y41 = t11 - t01;
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
t00 = v_fma(x50, q0_5, x10 + x10);
t01 = v_fma(x51, q0_5, x11 + x11);
t10 = v_fma(x20, q4 , x60);
t11 = v_fma(x21, q4 , x61);
t00 = v_fma(x30, qm2_5, t00);
t01 = v_fma(x31, qm2_5, t01);
t10 = v_fma(x40, qm5 , t10);
t11 = v_fma(x41, qm5 , t11);
v_float32x4 y50 = t00 + t10, y51 = t01 + t11;
v_float32x4 y60 = t10 - t00, y61 = t11 - t01;
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
/* Y: */
/* y00 y01 */
/* y10 y11 */
/* ... */
/* y70 y71 */
/* Y': */
/* y00 y40 */
/* y10 y50 */
/* y20 y60 */
/* y30 y70 */
/* y01 y41 */
/* y11 y51 */
/* y21 y61 */
/* y31 y71 */
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30);
v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31);
v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70);
v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71);
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
t00 = y01 - y20;
t01 = y41 - y60;
t10 = y30 - y11;
t11 = y70 - y51;
z00 = v_fma(t00, q5_25, y00 - y21);
z01 = v_fma(t01, q5_25, y40 - y61);
z70 = v_fma(t10, q5_25, y31 - y10);
z71 = v_fma(t11, q5_25, y71 - y50);
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
t00 = v_fma(y30, qm4_25, y10 + y11);
t01 = v_fma(y70, qm4_25, y50 + y51);
t10 = v_fma(y01, qm4_25, y20 + y21);
t11 = v_fma(y41, qm4_25, y60 + y61);
z10 = t00 + t10; z11 = t01 + t11;
z20 = t10 - t00; z21 = t11 - t01;
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
t00 = v_fma(y10, q0_5, y11 + y11);
t01 = v_fma(y50, q0_5, y51 + y51);
t10 = v_fma(y20, q0_25, y21);
t11 = v_fma(y60, q0_25, y61);
t00 = v_fma(y30, qm2_5, t00);
t01 = v_fma(y70, qm2_5, t01);
t10 = v_fma(y01, qm1_25, t10);
t11 = v_fma(y41, qm1_25, t11);
z30 = t00 + t10; z31 = t01 + t11;
z40 = t10 - t00; z41 = t11 - t01;
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
t00 = v_fma(y11, q0_5, y10 + y10);
t01 = v_fma(y51, q0_5, y50 + y50);
t10 = v_fma(y20, q4, y21);
t11 = v_fma(y60, q4, y61);
t00 = v_fma(y30, qm2_5, t00);
t01 = v_fma(y70, qm2_5, t01);
t10 = v_fma(y01, qm5, t10);
t11 = v_fma(y41, qm5, t11);
z50 = t00 + t10; z51 = t01 + t11;
z60 = t10 - t00; z61 = t11 - t01;
}
const int outstep = winoIblock*winoAtomF32*Cg;
v_store(outptr, z00);
v_store(outptr + outstep, z01);
v_store(outptr + outstep*2, z10);
v_store(outptr + outstep*3, z11);
v_store(outptr + outstep*4, z20);
v_store(outptr + outstep*5, z21);
v_store(outptr + outstep*6, z30);
v_store(outptr + outstep*7, z31);
v_store(outptr + outstep*8, z40);
v_store(outptr + outstep*9, z41);
v_store(outptr + outstep*10, z50);
v_store(outptr + outstep*11, z51);
v_store(outptr + outstep*12, z60);
v_store(outptr + outstep*13, z61);
v_store(outptr + outstep*14, z70);
v_store(outptr + outstep*15, z71);
}
/*Output transform*/
/* Inverse Winograd 8x8 transform:
out = (A'*inp*A)', where
inp is input 8x8 FP32 matrix,
A' is
[1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
inp is pre-loaded into xij registers,
out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1.
After the inverse transform is done, we add bias,
optionally add results from the earlier tensors (by-pass),
optionally apply activation function and then
store the final results.
That is, after both the forward and the inverse transformation,
we get a non-transposed result.
Of course, for the correct work of Winograd-based convolution,
the Winograd-transformed weights should also be transposed.
init_conv() (see OpConv.fx) takes care of that.
*/
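/* Note: A' above has 6 rows and 8 columns, so each transformed 8x8 block yields
   a 6x6 spatial output tile; this is the F(6x6, 3x3) Winograd variant that gives
   the conv_winograd_f63 files their name. */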
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
{
v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
s12_0 = x10 + x20; s12_1 = x11 + x21;
s34_0 = x30 + x40; s34_1 = x31 + x41;
s56_0 = x50 + x60; s56_1 = x51 + x61;
v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0;
v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1;
v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
s12_0 = x10 - x20; s12_1 = x11 - x21;
s34_0 = x30 - x40; s34_1 = x31 - x41;
s56_0 = x50 - x60; s56_1 = x51 - x61;
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0));
v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1));
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f);
v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
/* Y: */
/* y00 y01 */
/* y10 y11 */
/* ... */
/* y50 y51 */
/* 0 0 */
/* 0 0 */
/* Y': */
/* y00 y40 */
/* y10 y50 */
/* y20 y60 */
/* y30 y70 */
/* y01 y41 */
/* y11 y51 */
/* y21 y61 */
/* y31 y71 */
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30);
v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31);
v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70);
v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71);
s12_0 = y10 + y20; s12_1 = y50 + y60;
s34_0 = y30 + y01; s34_1 = y70 + y41;
s56_0 = y11 + y21; s56_1 = y51 + y61;
z00 = y00 + s12_0 + s34_0 + s56_0;
z01 = y40 + s12_1 + s34_1 + s56_1;
a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
s12_0 = y10 - y20; s12_1 = y50 - y60;
s34_0 = y30 - y01; s34_1 = y70 - y41;
s56_0 = y11 - y21; s56_1 = y51 - y61;
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y31 + s12_0));
z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y71 + s12_1));
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f);
z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
v_float32x4 vbias = v_setall_f32(bias);
z00 += vbias;
z01 += vbias;
z10 += vbias;
z11 += vbias;
z20 += vbias;
z21 += vbias;
z30 += vbias;
z31 += vbias;
z40 += vbias;
z41 += vbias;
z50 += vbias;
z51 += vbias;
}
if (bpptr)
{
z00 += v_load(bpptr);
z01 += v_load_low(bpptr + 4);
z10 += v_load(bpptr + bpstep);
z11 += v_load_low(bpptr + bpstep + 4);
z20 += v_load(bpptr + bpstep*2);
z21 += v_load_low(bpptr + bpstep*2 + 4);
z30 += v_load(bpptr + bpstep*3);
z31 += v_load_low(bpptr + bpstep*3 + 4);
z40 += v_load(bpptr + bpstep*4);
z41 += v_load_low(bpptr + bpstep*4 + 4);
z50 += v_load(bpptr + bpstep*5);
z51 += v_load_low(bpptr + bpstep*5 + 4);
}
if (ifMinMaxAct)
{
v_float32x4 vmax = v_setall_f32(maxval);
v_float32x4 vmin = v_setall_f32(minval);
z00 = v_min(v_max(z00, vmin), vmax);
z01 = v_min(v_max(z01, vmin), vmax);
z10 = v_min(v_max(z10, vmin), vmax);
z11 = v_min(v_max(z11, vmin), vmax);
z20 = v_min(v_max(z20, vmin), vmax);
z21 = v_min(v_max(z21, vmin), vmax);
z30 = v_min(v_max(z30, vmin), vmax);
z31 = v_min(v_max(z31, vmin), vmax);
z40 = v_min(v_max(z40, vmin), vmax);
z41 = v_min(v_max(z41, vmin), vmax);
z50 = v_min(v_max(z50, vmin), vmax);
z51 = v_min(v_max(z51, vmin), vmax);
}
v_store(outptr, z00);
v_store_low(outptr + 4, z01);
v_store(outptr + outstep, z10);
v_store_low(outptr + outstep + 4, z11);
v_store(outptr + outstep*2, z20);
v_store_low(outptr + outstep*2 + 4, z21);
v_store(outptr + outstep*3, z30);
v_store_low(outptr + outstep*3 + 4, z31);
v_store(outptr + outstep*4, z40);
v_store_low(outptr + outstep*4 + 4, z41);
v_store(outptr + outstep*5, z50);
v_store_low(outptr + outstep*5 + 4, z51);
}
#endif
#else
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
return 0;
}
#endif
}} // namespace cv::dnn

@@ -0,0 +1,886 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
/* Accumulate */
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
/*Input transform*/
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
/*Output transform*/
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
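// Note: without FMA3 the macro above expands the fused multiply-add into a separate
// multiply and add, so results can differ from the FMA path in the last bit because
// of the extra rounding step.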
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 8);
if (iblock > 3)
{
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
__m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
__m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
__m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
__m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32)
{
__m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
__m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
__m256 x0, x1;
x0 = _mm256_load_ps(inwptr);
x1 = _mm256_load_ps(inwptr + 8);
s00 = _mm256_fmadd_ps(w0, x0, s00);
s01 = _mm256_fmadd_ps(w0, x1, s01);
s10 = _mm256_fmadd_ps(w1, x0, s10);
s11 = _mm256_fmadd_ps(w1, x1, s11);
s20 = _mm256_fmadd_ps(w2, x0, s20);
s21 = _mm256_fmadd_ps(w2, x1, s21);
s30 = _mm256_fmadd_ps(w3, x0, s30);
s31 = _mm256_fmadd_ps(w3, x1, s31);
x0 = _mm256_load_ps(inwptr + 16);
x1 = _mm256_load_ps(inwptr + 24);
s02 = _mm256_fmadd_ps(w0, x0, s02);
s03 = _mm256_fmadd_ps(w0, x1, s03);
s12 = _mm256_fmadd_ps(w1, x0, s12);
s13 = _mm256_fmadd_ps(w1, x1, s13);
s22 = _mm256_fmadd_ps(w2, x0, s22);
s23 = _mm256_fmadd_ps(w2, x1, s23);
s32 = _mm256_fmadd_ps(w3, x0, s32);
s33 = _mm256_fmadd_ps(w3, x1, s33);
x0 = _mm256_load_ps(inwptr + 32);
x1 = _mm256_load_ps(inwptr + 40);
s04 = _mm256_fmadd_ps(w0, x0, s04);
s05 = _mm256_fmadd_ps(w0, x1, s05);
s14 = _mm256_fmadd_ps(w1, x0, s14);
s15 = _mm256_fmadd_ps(w1, x1, s15);
s24 = _mm256_fmadd_ps(w2, x0, s24);
s25 = _mm256_fmadd_ps(w2, x1, s25);
s34 = _mm256_fmadd_ps(w3, x0, s34);
s35 = _mm256_fmadd_ps(w3, x1, s35);
}
_mm256_store_ps(outbuf, s00);
_mm256_store_ps(outbuf + 1*64, s01);
_mm256_store_ps(outbuf + 2*64, s02);
_mm256_store_ps(outbuf + 3*64, s03);
_mm256_store_ps(outbuf + 4*64, s04);
_mm256_store_ps(outbuf + 5*64, s05);
_mm256_store_ps(outbuf + 6*64, s10);
_mm256_store_ps(outbuf + 7*64, s11);
_mm256_store_ps(outbuf + 8*64, s12);
_mm256_store_ps(outbuf + 9*64, s13);
_mm256_store_ps(outbuf + 10*64, s14);
_mm256_store_ps(outbuf + 11*64, s15);
_mm256_store_ps(outbuf + 12*64, s20);
_mm256_store_ps(outbuf + 13*64, s21);
_mm256_store_ps(outbuf + 14*64, s22);
_mm256_store_ps(outbuf + 15*64, s23);
_mm256_store_ps(outbuf + 16*64, s24);
_mm256_store_ps(outbuf + 17*64, s25);
_mm256_store_ps(outbuf + 18*64, s30);
_mm256_store_ps(outbuf + 19*64, s31);
_mm256_store_ps(outbuf + 20*64, s32);
_mm256_store_ps(outbuf + 21*64, s33);
_mm256_store_ps(outbuf + 22*64, s34);
_mm256_store_ps(outbuf + 23*64, s35);
}
}
else
{
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
__m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00;
__m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00;
__m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00;
__m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00;
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32) {
__m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
__m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
__m256 x0, x1, x2;
x0 = _mm256_load_ps(inwptr);
x1 = _mm256_load_ps(inwptr + 8);
x2 = _mm256_load_ps(inwptr + 16);
s00 = _mm256_fmadd_ps(w0, x0, s00);
s01 = _mm256_fmadd_ps(w0, x1, s01);
s02 = _mm256_fmadd_ps(w0, x2, s02);
s10 = _mm256_fmadd_ps(w1, x0, s10);
s11 = _mm256_fmadd_ps(w1, x1, s11);
s12 = _mm256_fmadd_ps(w1, x2, s12);
s20 = _mm256_fmadd_ps(w2, x0, s20);
s21 = _mm256_fmadd_ps(w2, x1, s21);
s22 = _mm256_fmadd_ps(w2, x2, s22);
s30 = _mm256_fmadd_ps(w3, x0, s30);
s31 = _mm256_fmadd_ps(w3, x1, s31);
s32 = _mm256_fmadd_ps(w3, x2, s32);
}
_mm256_store_ps(outbuf, s00);
_mm256_store_ps(outbuf + 1*64, s01);
_mm256_store_ps(outbuf + 2*64, s02);
_mm256_store_ps(outbuf + 6*64, s10);
_mm256_store_ps(outbuf + 7*64, s11);
_mm256_store_ps(outbuf + 8*64, s12);
_mm256_store_ps(outbuf + 12*64, s20);
_mm256_store_ps(outbuf + 13*64, s21);
_mm256_store_ps(outbuf + 14*64, s22);
_mm256_store_ps(outbuf + 18*64, s30);
_mm256_store_ps(outbuf + 19*64, s31);
_mm256_store_ps(outbuf + 20*64, s32);
}
}
_mm256_zeroupper();
}
static inline
void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7)
{
__m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
__m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
__t0 = _mm256_unpacklo_ps(row0, row1);
__t1 = _mm256_unpackhi_ps(row0, row1);
__t2 = _mm256_unpacklo_ps(row2, row3);
__t3 = _mm256_unpackhi_ps(row2, row3);
__t4 = _mm256_unpacklo_ps(row4, row5);
__t5 = _mm256_unpackhi_ps(row4, row5);
__t6 = _mm256_unpacklo_ps(row6, row7);
__t7 = _mm256_unpackhi_ps(row6, row7);
__tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
__tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
__tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
__tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
__tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
__tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
__tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
__tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}
/*Input transform*/
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
__m256 x00 = _mm256_loadu_ps(inptr);
__m256 x10 = _mm256_loadu_ps(inptr + inpstep);
__m256 x20 = _mm256_loadu_ps(inptr + inpstep*2);
__m256 x30 = _mm256_loadu_ps(inptr + inpstep*3);
__m256 x40 = _mm256_loadu_ps(inptr + inpstep*4);
__m256 x50 = _mm256_loadu_ps(inptr + inpstep*5);
__m256 x60 = _mm256_loadu_ps(inptr + inpstep*6);
__m256 x70 = _mm256_loadu_ps(inptr + inpstep*7);
__m256 z00, z10, z20, z30, z40, z50, z60, z70;
{
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
__m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10;
t00 = _mm256_sub_ps(x40, x20);
t10 = _mm256_sub_ps(x30, x50);
__m256 y00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60));
__m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10));
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
__m256 qm4_25 = _mm256_set1_ps(-4.25f);
t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50));
t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60));
__m256 y10 = _mm256_add_ps(t00, t10);
__m256 y20 = _mm256_sub_ps(t10, t00);
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
__m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f);
__m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f);
t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50));
t10 = _mm256_fmadd_ps(x20, q0_25, x60);
t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
t10 = _mm256_fmadd_ps(x40, qm1_25, t10);
__m256 y30 = _mm256_add_ps(t00, t10);
__m256 y40 = _mm256_sub_ps(t10, t00);
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
__m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f);
t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10));
t10 = _mm256_fmadd_ps(x20, q4 , x60);
t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
t10 = _mm256_fmadd_ps(x40, qm5 , t10);
__m256 y50 = _mm256_add_ps(t00, t10);
__m256 y60 = _mm256_sub_ps(t10, t00);
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
t00 = _mm256_sub_ps(y40, y20);
t10 = _mm256_sub_ps(y30, y50);
z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60));
z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10));
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50));
t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60));
z10 = _mm256_add_ps(t00, t10);
z20 = _mm256_sub_ps(t10, t00);
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50));
t10 = _mm256_fmadd_ps(y20, q0_25, y60);
t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
t10 = _mm256_fmadd_ps(y40, qm1_25, t10);
z30 = _mm256_add_ps(t00, t10);
z40 = _mm256_sub_ps(t10, t00);
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10));
t10 = _mm256_fmadd_ps(y20, q4, y60);
t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
t10 = _mm256_fmadd_ps(y40, qm5, t10);
z50 = _mm256_add_ps(t00, t10);
z60 = _mm256_sub_ps(t10, t00);
}
const int outstep = winoIblock*winoAtomF32*Cg;
_mm256_storeu_ps(outptr, z00);
_mm256_storeu_ps(outptr + outstep, z10);
_mm256_storeu_ps(outptr + outstep*2, z20);
_mm256_storeu_ps(outptr + outstep*3, z30);
_mm256_storeu_ps(outptr + outstep*4, z40);
_mm256_storeu_ps(outptr + outstep*5, z50);
_mm256_storeu_ps(outptr + outstep*6, z60);
_mm256_storeu_ps(outptr + outstep*7, z70);
_mm256_zeroupper();
}
#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM) \
lowM = _mm256_castps256_ps128(z00); \
highM = _mm256_extractf128_ps(z00, 1); \
_mm_storeu_ps(ptr, lowM); \
_mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM))
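// STORE6_ELE_FROM_16 writes the first 6 of the 8 packed floats in z00: the low
// 128-bit half is stored with _mm_storeu_ps (4 floats) and the two lowest floats of
// the high half with a 64-bit _mm_storel_epi64 store; lowM and highM are scratch.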
/* Inverse Winograd 8x8 transform:
out = (A'*inp*A)', where
inp is input 8x8 FP32 matrix,
A' is
[1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
*/
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
__m256 x00 = _mm256_load_ps(inptr);
__m256 x10 = _mm256_load_ps(inptr + inpstep);
__m256 x20 = _mm256_load_ps(inptr + inpstep*2);
__m256 x30 = _mm256_load_ps(inptr + inpstep*3);
__m256 x40 = _mm256_load_ps(inptr + inpstep*4);
__m256 x50 = _mm256_load_ps(inptr + inpstep*5);
__m256 x60 = _mm256_load_ps(inptr + inpstep*6);
__m256 x70 = _mm256_load_ps(inptr + inpstep*7);
__m256 z00, z10, z20, z30, z40, z50;
{
__m256 s12_0, s34_0, s56_0;
s12_0 = _mm256_add_ps(x10, x20);
s34_0 = _mm256_add_ps(x30, x40);
s56_0 = _mm256_add_ps(x50, x60);
__m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
__m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
__m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));
s12_0 = _mm256_sub_ps(x10, x20);
s34_0 = _mm256_sub_ps(x30, x40);
s56_0 = _mm256_sub_ps(x50, x60);
__m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0)));
__m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0));
__m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.f), s12_0));
__m256 y60 = _mm256_set1_ps(0.f), y70 = y60;
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);
s12_0 = _mm256_add_ps(y10, y20);
s34_0 = _mm256_add_ps(y30, y40);
s56_0 = _mm256_add_ps(y50, y60);
z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));
s12_0 = _mm256_sub_ps(y10, y20);
s34_0 = _mm256_sub_ps(y30, y40);
s56_0 = _mm256_sub_ps(y50, y60);
z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0)));
z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0));
z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0));
__m256 vbias = _mm256_set1_ps(bias);
z00 = _mm256_add_ps(vbias, z00);
z10 = _mm256_add_ps(vbias, z10);
z20 = _mm256_add_ps(vbias, z20);
z30 = _mm256_add_ps(vbias, z30);
z40 = _mm256_add_ps(vbias, z40);
z50 = _mm256_add_ps(vbias, z50);
}
if (bpptr)
{
z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep));
z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2));
z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3));
z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4));
z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5));
}
if (ifMinMaxAct)
{
__m256 vmax = _mm256_set1_ps(maxval);
__m256 vmin = _mm256_set1_ps(minval);
z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax);
z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax);
z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax);
z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax);
z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax);
z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax);
}
__m128 lowM, highM;
STORE6_ELE_FROM_16(outptr, z00, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM);
_mm256_zeroupper();
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
// NEON code workaround: the AArch64 kernels live in a dedicated opt_NEON namespace.
namespace opt_NEON
{
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && CV_NEON_AARCH64
/* Accumulate */
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
/*Input transform*/
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
/*Output transform*/
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
if (iblock > 3)
{
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32) {
float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
float32x4_t x0, x1;
x0 = vld1q_f32(inwptr);
x1 = vld1q_f32(inwptr + 4);
s00 = vfmaq_f32(s00, w0, x0);
s01 = vfmaq_f32(s01, w0, x1);
s10 = vfmaq_f32(s10, w1, x0);
s11 = vfmaq_f32(s11, w1, x1);
s20 = vfmaq_f32(s20, w2, x0);
s21 = vfmaq_f32(s21, w2, x1);
s30 = vfmaq_f32(s30, w3, x0);
s31 = vfmaq_f32(s31, w3, x1);
x0 = vld1q_f32(inwptr + 8);
x1 = vld1q_f32(inwptr + 12);
s02 = vfmaq_f32(s02, w0, x0);
s03 = vfmaq_f32(s03, w0, x1);
s12 = vfmaq_f32(s12, w1, x0);
s13 = vfmaq_f32(s13, w1, x1);
s22 = vfmaq_f32(s22, w2, x0);
s23 = vfmaq_f32(s23, w2, x1);
s32 = vfmaq_f32(s32, w3, x0);
s33 = vfmaq_f32(s33, w3, x1);
x0 = vld1q_f32(inwptr + 16);
x1 = vld1q_f32(inwptr + 20);
s04 = vfmaq_f32(s04, w0, x0);
s05 = vfmaq_f32(s05, w0, x1);
s14 = vfmaq_f32(s14, w1, x0);
s15 = vfmaq_f32(s15, w1, x1);
s24 = vfmaq_f32(s24, w2, x0);
s25 = vfmaq_f32(s25, w2, x1);
s34 = vfmaq_f32(s34, w3, x0);
s35 = vfmaq_f32(s35, w3, x1);
}
vst1q_f32(outbuf, s00);
vst1q_f32(outbuf + 1*64, s01);
vst1q_f32(outbuf + 2*64, s02);
vst1q_f32(outbuf + 3*64, s03);
vst1q_f32(outbuf + 4*64, s04);
vst1q_f32(outbuf + 5*64, s05);
vst1q_f32(outbuf + 6*64, s10);
vst1q_f32(outbuf + 7*64, s11);
vst1q_f32(outbuf + 8*64, s12);
vst1q_f32(outbuf + 9*64, s13);
vst1q_f32(outbuf + 10*64, s14);
vst1q_f32(outbuf + 11*64, s15);
vst1q_f32(outbuf + 12*64, s20);
vst1q_f32(outbuf + 13*64, s21);
vst1q_f32(outbuf + 14*64, s22);
vst1q_f32(outbuf + 15*64, s23);
vst1q_f32(outbuf + 16*64, s24);
vst1q_f32(outbuf + 17*64, s25);
vst1q_f32(outbuf + 18*64, s30);
vst1q_f32(outbuf + 19*64, s31);
vst1q_f32(outbuf + 20*64, s32);
vst1q_f32(outbuf + 21*64, s33);
vst1q_f32(outbuf + 22*64, s34);
vst1q_f32(outbuf + 23*64, s35);
}
}
else
{
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32) {
float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
float32x4_t x0, x1, x2;
x0 = vld1q_f32(inwptr);
x1 = vld1q_f32(inwptr + 4);
x2 = vld1q_f32(inwptr + 8);
s00 = vfmaq_f32(s00, w0, x0);
s01 = vfmaq_f32(s01, w0, x1);
s02 = vfmaq_f32(s02, w0, x2);
s10 = vfmaq_f32(s10, w1, x0);
s11 = vfmaq_f32(s11, w1, x1);
s12 = vfmaq_f32(s12, w1, x2);
s20 = vfmaq_f32(s20, w2, x0);
s21 = vfmaq_f32(s21, w2, x1);
s22 = vfmaq_f32(s22, w2, x2);
s30 = vfmaq_f32(s30, w3, x0);
s31 = vfmaq_f32(s31, w3, x1);
s32 = vfmaq_f32(s32, w3, x2);
}
vst1q_f32(outbuf, s00);
vst1q_f32(outbuf + 1*64, s01);
vst1q_f32(outbuf + 2*64, s02);
vst1q_f32(outbuf + 6*64, s10);
vst1q_f32(outbuf + 7*64, s11);
vst1q_f32(outbuf + 8*64, s12);
vst1q_f32(outbuf + 12*64, s20);
vst1q_f32(outbuf + 13*64, s21);
vst1q_f32(outbuf + 14*64, s22);
vst1q_f32(outbuf + 18*64, s30);
vst1q_f32(outbuf + 19*64, s31);
vst1q_f32(outbuf + 20*64, s32);
}
}
}
#define T4x4(a, b, c, d, tr0, tr1) \
tr0 = vtrnq_f32(a, b); \
tr1 = vtrnq_f32(c, d); \
a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
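// T4x4 transposes a 4x4 float block held in registers a..d in place using
// vtrnq_f32 and vcombine_f32; tr0 and tr1 are caller-provided float32x4x2_t scratch.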
/*Input transform*/
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
{
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
t00 = vsubq_f32(x40, x20);
t01 = vsubq_f32(x41, x21);
t10 = vsubq_f32(x30, x50);
t11 = vsubq_f32(x31, x51);
float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25);
float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25);
float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25);
t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25);
float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5);
t10 = vfmaq_f32(x60, x20, q0_25);
t11 = vfmaq_f32(x61, x21, q0_25);
t00 = vfmaq_f32(t00, x30, qm2_5);
t01 = vfmaq_f32(t01, x31, qm2_5);
t10 = vfmaq_f32(t10, x40, qm1_25);
t11 = vfmaq_f32(t11, x41, qm1_25);
float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
t10 = vfmaq_f32(x60, x20, q4);
t11 = vfmaq_f32(x61, x21, q4);
t00 = vfmaq_f32(t00, x30, qm2_5);
t01 = vfmaq_f32(t01, x31, qm2_5);
t10 = vfmaq_f32(t10, x40, qm5);
t11 = vfmaq_f32(t11, x41, qm5);
float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);
/* transpose 8x8 matrix in-place with some renumbering of the elements: */
/* Y: */
/* y00 y01 */
/* y10 y11 */
/* ... */
/* y70 y71 */
/* Y': */
/* y00 y40 */
/* y10 y50 */
/* y20 y60 */
/* y30 y70 */
/* y01 y41 */
/* y11 y51 */
/* y21 y61 */
/* y31 y71 */
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
float32x4x2_t tr0, tr1;
T4x4(y00, y10, y20, y30, tr0, tr1);
T4x4(y01, y11, y21, y31, tr0, tr1);
T4x4(y40, y50, y60, y70, tr0, tr1);
T4x4(y41, y51, y61, y71, tr0, tr1);
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
t00 = vsubq_f32(y01, y20);
t01 = vsubq_f32(y41, y60);
t10 = vsubq_f32(y30, y11);
t11 = vsubq_f32(y70, y51);
z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25);
z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25);
z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25);
z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
t10 = vfmaq_f32(y21, y20, q0_25);
t11 = vfmaq_f32(y61, y60, q0_25);
t00 = vfmaq_f32(t00, y30, qm2_5);
t01 = vfmaq_f32(t01, y70, qm2_5);
t10 = vfmaq_f32(t10, y01, qm1_25);
t11 = vfmaq_f32(t11, y41, qm1_25);
z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
t10 = vfmaq_f32(y21, y20, q4);
t11 = vfmaq_f32(y61, y60, q4);
t00 = vfmaq_f32(t00, y30, qm2_5);
t01 = vfmaq_f32(t01, y70, qm2_5);
t10 = vfmaq_f32(t10, y01, qm5);
t11 = vfmaq_f32(t11, y41, qm5);
z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11);
z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
}
const int outstep = winoIblock*winoAtomF32*Cg;
vst1q_f32(outptr, z00);
vst1q_f32(outptr + outstep, z01);
vst1q_f32(outptr + outstep*2, z10);
vst1q_f32(outptr + outstep*3, z11);
vst1q_f32(outptr + outstep*4, z20);
vst1q_f32(outptr + outstep*5, z21);
vst1q_f32(outptr + outstep*6, z30);
vst1q_f32(outptr + outstep*7, z31);
vst1q_f32(outptr + outstep*8, z40);
vst1q_f32(outptr + outstep*9, z41);
vst1q_f32(outptr + outstep*10, z50);
vst1q_f32(outptr + outstep*11, z51);
vst1q_f32(outptr + outstep*12, z60);
vst1q_f32(outptr + outstep*13, z61);
vst1q_f32(outptr + outstep*14, z70);
vst1q_f32(outptr + outstep*15, z71);
}
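Collecting the per-row coefficient vectors from the comments above into one matrix: the function applies the same row transform twice (once before and once after the 4x4 transposes), i.e. up to a transpose it computes $Z = B^\top X B$ with

$$
B^\top = \begin{bmatrix}
1 & 0 & -5.25 & 0 & 5.25 & 0 & -1 & 0\\
0 & 1 & 1 & -4.25 & -4.25 & 1 & 1 & 0\\
0 & -1 & 1 & 4.25 & -4.25 & -1 & 1 & 0\\
0 & 0.5 & 0.25 & -2.5 & -1.25 & 2 & 1 & 0\\
0 & -0.5 & 0.25 & 2.5 & -1.25 & -2 & 1 & 0\\
0 & 2 & 4 & -2.5 & -5 & 0.5 & 1 & 0\\
0 & -2 & 4 & 2.5 & -5 & -0.5 & 1 & 0\\
0 & -1 & 0 & 5.25 & 0 & -5.25 & 0 & 1
\end{bmatrix}
$$

(the rows are exactly Y[0]..Y[7] / Z[0]..Z[7] from the comments; the sixteen stores then scatter the two 4-lane halves of each transformed row with stride `outstep`).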
/*Output transform*/
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
{
float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21);
s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41);
s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61);
float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0);
float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1);
float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21);
s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41);
s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);
float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
s34_0, 32.f), s56_0, 1.f/32);
float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
s34_1, 32.f), s56_1, 1.f/32);
float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
/* transpose 8x8 matrix in-place with some renumbering of the elements: */
/* Y: */
/* y00 y01 */
/* y10 y11 */
/* ... */
/* y50 y51 */
/* 0 0 */
/* 0 0 */
/* Y': */
/* y00 y40 */
/* y10 y50 */
/* y20 y60 */
/* y30 y70 */
/* y01 y41 */
/* y11 y51 */
/* y21 y61 */
/* y31 y71 */
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
float32x4x2_t tr0, tr1;
T4x4(y00, y10, y20, y30, tr0, tr1);
T4x4(y01, y11, y21, y31, tr0, tr1);
T4x4(y40, y50, y60, y70, tr0, tr1);
T4x4(y41, y51, y61, y71, tr0, tr1);
s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60);
s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41);
s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61);
z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0);
z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1);
z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60);
s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41);
s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);
z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
s34_0, 32.f), s56_0, 1.f/32);
z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
s34_1, 32.f), s56_1, 1.f/32);
z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
float32x4_t vbias = vdupq_n_f32(bias);
z00 = vaddq_f32(z00, vbias);
z01 = vaddq_f32(z01, vbias);
z10 = vaddq_f32(z10, vbias);
z11 = vaddq_f32(z11, vbias);
z20 = vaddq_f32(z20, vbias);
z21 = vaddq_f32(z21, vbias);
z30 = vaddq_f32(z30, vbias);
z31 = vaddq_f32(z31, vbias);
z40 = vaddq_f32(z40, vbias);
z41 = vaddq_f32(z41, vbias);
z50 = vaddq_f32(z50, vbias);
z51 = vaddq_f32(z51, vbias);
}
if (bpptr)
{
float32x2_t zhalf = vdup_n_f32(0.f);
z00 = vaddq_f32(z00, vld1q_f32(bpptr));
z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf));
z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep));
z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf));
z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2));
z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf));
z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3));
z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf));
z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4));
z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf));
z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5));
z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf));
}
if (ifMinMaxAct)
{
float32x4_t vmax = vdupq_n_f32(maxval);
float32x4_t vmin = vdupq_n_f32(minval);
z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax);
z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax);
z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax);
z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax);
z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax);
z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax);
z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax);
z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax);
z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax);
z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax);
z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax);
z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax);
}
vst1q_f32(outptr, z00);
vst1_f32(outptr + 4, vget_low_f32(z01));
vst1q_f32(outptr + outstep, z10);
vst1_f32(outptr + outstep + 4, vget_low_f32(z11));
vst1q_f32(outptr + outstep*2, z20);
vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21));
vst1q_f32(outptr + outstep*3, z30);
vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31));
vst1q_f32(outptr + outstep*4, z40);
vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
vst1q_f32(outptr + outstep*5, z50);
vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
}
#endif
}
}} // namespace
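For reference, the row combinations in the output transform above correspond to the inverse-transform matrix of the F(6x6, 3x3) Winograd scheme, the same `A'` that is spelled out in the comment of the AVX2 variant further below; read directly off the `y00..y51` expressions, gathered into one matrix:

$$
A^\top = \begin{bmatrix}
1 & 1 & 1 & 1 & 1 & 1 & 1 & 0\\
0 & 1 & -1 & 2 & -2 & 0.5 & -0.5 & 0\\
0 & 1 & 1 & 4 & 4 & 0.25 & 0.25 & 0\\
0 & 1 & -1 & 8 & -8 & 0.125 & -0.125 & 0\\
0 & 1 & 1 & 16 & 16 & 1/16 & 1/16 & 0\\
0 & 1 & -1 & 32 & -32 & 1/32 & -1/32 & 1
\end{bmatrix}
$$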

@@ -10,11 +10,19 @@
*/
#include "../../precomp.hpp"
#include "fast_convolution.hpp"
#include "fast_convolution.simd.hpp"
#include "convolution.hpp"
#include "conv_block.simd.hpp"
#include "layers/cpu_kernels/conv_block.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace dnn {
enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
const int convMR, const int convNR);
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
Ptr<FastConv> initFastConv(
InputArray _weightsMat,
float* srcBias,
@@ -94,21 +102,15 @@ Ptr<FastConv> initFastConv(
}
}
conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? _FX_CONV_TYPE_DEPTHWISE :
conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? CONV_TYPE_DEPTHWISE :
useWinograd && (conv_dim == CONV_2D && (conv->useSIMD128 || conv->useAVX2 || conv->useNEON) &&
Hk == 3 && Wk == 3 && dilation_h == 1 && dilation_w == 1 && stride_h == 1 && stride_w == 1) ?
_FX_CONV_TYPE_WINOGRAD3X3 :
(ifRunDepthWiseRemain ? _FX_CONV_TYPE_DEPTHWISE_REMAIN : _FX_CONV_TYPE_GENERIC);
CONV_TYPE_WINOGRAD3X3 :
(ifRunDepthWiseRemain ? CONV_TYPE_DEPTHWISE_REMAIN : CONV_TYPE_GENERIC);
#if !(CV_NEON || CV_SIMD128 || CV_TRY_AVX2)
if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // Disable Winograd when CV_NEON, CV_SIMD128 and CV_TRY_AVX2 are not available.
conv->conv_type = _FX_CONV_TYPE_GENERIC;
#endif
#if CV_TRY_AVX2
// Disable Winograd when CV_TRY_AVX2 is true but conv->useAVX2 is false.
if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && !conv->useAVX2)
conv->conv_type = _FX_CONV_TYPE_GENERIC;
if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // Disable Winograd when CV_NEON, CV_SIMD128 and CV_TRY_AVX2 are not available.
conv->conv_type = CONV_TYPE_GENERIC;
#endif
Mat weightsMat = _weightsMat.getMat();
@@ -116,7 +118,7 @@ Ptr<FastConv> initFastConv(
const size_t wstep = weightsMat.step1();
float *srcWeights = (float *)weightsMat.data;
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE || conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN)
if (conv->conv_type == CONV_TYPE_DEPTHWISE || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
{
// Handle the Conv1D, Conv2D and Conv3D depth-wise.
// for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
@@ -138,7 +140,7 @@ Ptr<FastConv> initFastConv(
weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k];
}});
}
else if(conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
{
static const float ktm[8][3] = {
{1.0f, 0.0f, 0.0f},
@@ -156,24 +158,24 @@ Ptr<FastConv> initFastConv(
// where W is the size of the Winograd-transformed kernel (8x8),
// ATOM_SIZE is the number of lanes in a SIMD register (4 for NEON and FP32),
// KBLOCK is a platform-dependent constant determined by the number of SIMD registers.
int ksize = _FX_WINO_KSIZE * _FX_WINO_KSIZE;
int ksize = CONV_WINO_KSIZE * CONV_WINO_KSIZE;
int Cg = C/ngroups;
int Kg = K/ngroups;
int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK;
size_t nweights = ngroups*Kg_nblocks*Cg*_FX_WINO_KBLOCK*_FX_WINO_AREA;
int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA;
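// For a sense of scale, with hypothetical shapes ngroups = 1 and C = K = 64:
// Cg = Kg = 64, Kg_nblocks = (64 + 3)/4 = 16, so
// nweights = 1*16*64*4*64 = 262144 floats (1 MiB of Winograd-transformed weights).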
conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN);
conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN);
float* wptrWino = conv->weightsWinoBufPtr;
memset(wptrWino, 0, nweights * sizeof(wptrWino[0]));
parallel_for_(Range(0, K), [&](const Range& r0){
float kernelTm[_FX_WINO_AREA];
float kernelTm[CONV_WINO_AREA];
for (int k = r0.start; k < r0.end; k++)
{
int g = k / Kg;
int k_ = k - g*Kg;
int ki = k_ / _FX_WINO_KBLOCK;
int dk = k_ - ki*_FX_WINO_KBLOCK;
int ki = k_ / CONV_WINO_KBLOCK;
int dk = k_ - ki*CONV_WINO_KBLOCK;
for (int c = 0; c < Cg; c++)
{
@@ -204,18 +206,18 @@ Ptr<FastConv> initFastConv(
}
// repack the data.
float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *_FX_WINO_KBLOCK*_FX_WINO_AREA +
(c*_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32;
for (int i = 0; i < _FX_WINO_NATOMS_F32; i++,
wptr += Cg * _FX_WINO_KBLOCK * _FX_WINO_ATOM_F32)
float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
(c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32;
for (int i = 0; i < CONV_WINO_NATOMS_F32; i++,
wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32)
{
CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + _FX_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
memcpy(wptr, kernelTm + i * _FX_WINO_ATOM_F32, _FX_WINO_ATOM_F32*sizeof (wptr[0]));
CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0]));
}
}
}});
}
else if (conv->conv_type == _FX_CONV_TYPE_GENERIC)
else if (conv->conv_type == CONV_TYPE_GENERIC)
{
// The weights are packed as
// ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor
@@ -372,7 +374,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
fusedAddMat = _output.getMat();
}
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE)
if (conv->conv_type == CONV_TYPE_DEPTHWISE)
{
// Depthwise-Convolution layer should not be followed by Add layer.
CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D));
@@ -420,7 +422,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
else
activ = nullptr;
if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
{
CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D);
if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
@@ -454,8 +456,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
int ksize = Dk*Hk*Wk;
bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1 &&
pad_front == 0 && pad_top == 0 && pad_left == 0;
bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1;
int DkHkWkCg = Dk*Hk*Wk*Cg;
std::vector<int> ofstab_(Hk*Wk*Dk*4, 0);
@@ -504,14 +505,14 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;
// Friendly to L1 cache
const int K_BLOCK_SIZE = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
const int C_BLOCK_SIZE = 256;
int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR, Kg_aligned = Kg_nblocks * CONV_MR;
int stripes_per_sample = ((int)out_planesize + CONV_NR - 1) / CONV_NR;
if (stripes_per_sample < ntasks * 4 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN)
if (stripes_per_sample < ntasks * 4 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN)
{
MAX_STRIPES = 1;
stripes_per_sample = 1;
@@ -555,7 +556,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int k0, k1;
int zyx0, zyx_limit, zyx_block_limit = 0;
if (stripes_per_sample == 1 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN)
if (stripes_per_sample == 1 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN)
{
k0 = kzyx0 * CONV_MR;
k1 = kzyx1 * CONV_MR;
@@ -618,7 +619,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
}
}
}
else if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN)
else if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
{
CV_Assert(Cg == 1);
const int HW0 = H0 * W0;
@@ -928,7 +929,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
// Special branch for depth-wise convolution implemented via the generic convolution path.
// In this case, CONV_MR is 1 and CONV_NR keeps its usual value.
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN)
if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)
{
size_t outofs = (n * ngroups + g) * out_planesize + zyx0;
float *cptr0 = cbuf_task;
@@ -947,12 +948,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
memcpy(cptr0, cptr, outLen * sizeof(cptr[0]));
cptr = cptr0;
}
#if CV_TRY_AVX2
if (conv->useAVX2 && outLen > CONV_NR/3)
opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct);
else
#endif
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen);
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
if (ifBuffer)
{
@@ -980,7 +977,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
#if CV_TRY_AVX2 || CV_TRY_NEON
#if CV_TRY_AVX || CV_TRY_AVX2 || CV_NEON
// The possible CONV_NR values are 28, 24 and 12, so CONV_NR/3 is 9, 8 or 4.
bool runOpt = outLen > std::min(8, CONV_NR/3);
#endif
@@ -992,16 +989,21 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
#if CV_TRY_AVX2
if (conv->useAVX2 && runOpt)
opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
opt_AVX2::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR);
else
#endif
#if CV_TRY_NEON
#if CV_TRY_AVX
if (conv->useAVX && runOpt)
opt_AVX::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR);
else
#endif
#if CV_NEON
if (conv->useNEON && runOpt)
opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
opt_NEON::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR);
else
#endif
// The possible outLen values here are 24, or 1 to 8.
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen);
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
}
}
}
@@ -1087,4 +1089,466 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
}
});
}
/****************************************************************************************\
SIMD and no-SIMD code for convBlock
\****************************************************************************************/
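All kernels in this block compute the same micro-GEMM update; written out as a formula (a reference statement matching the scalar fallbacks below, where the `init_c`/fused-add flags decide whether the existing contents of `c` are accumulated):

$$
c_{i,j} \;{+}{=}\; \sum_{p=0}^{np-1} a_{p\cdot\mathrm{convMR}+i}\; b_{p\cdot\mathrm{convNR}+j},
\qquad 0 \le i < \mathrm{convMR},\quad 0 \le j < \mathrm{outLen}
$$

with `convMR = 1` (a single `a[p]` per step), a bias add and an optional `[minval, maxval]` clamp in the `convBlockMR1` family.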
static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
std::vector<float> cbuffer(outLen, 0);
float* cbuf = cbuffer.data();
for( int p = 0; p < np; p++ )
{
float ai = a[p];
for( int j = 0; j < outLen; j++ )
cbuf[j] += b[convNR*p + j] * ai;
}
if (init_c)
{
for(int j = 0; j < outLen; j++)
{
c[j] += cbuf[j] + bias;
if (ifMinMaxAct)
c[j] = std::min(std::max(c[j], minval), maxval);
}
}
else
{
for(int j = 0; j < outLen; j++)
{
c[j] = cbuf[j] + bias;
if (ifMinMaxAct)
c[j] = std::min(std::max(c[j], minval), maxval);
}
}
}
#if CV_SIMD128
static void convBlockMR1x28(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
CV_Assert(convNR == 28);
v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0;
v_float32x4 c3 = c0, c4 = c0, c5 = c0;
v_float32x4 c6 = c0;
for (int p = 0; p < np; p++, a++, b += convNR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
v_float32x4 b6 = v_load(b + 24);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
c3 = v_fma(b3, a0, c3);
c4 = v_fma(b4, a0, c4);
c5 = v_fma(b5, a0, c5);
c6 = v_fma(b6, a0, c6);
}
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c6 += v_load(c + 24);
}
if (ifMinMaxAct)
{
v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
c0 = v_min(v_max(c0, vmin), vmax);
c1 = v_min(v_max(c1, vmin), vmax);
c2 = v_min(v_max(c2, vmin), vmax);
c3 = v_min(v_max(c3, vmin), vmax);
c4 = v_min(v_max(c4, vmin), vmax);
c5 = v_min(v_max(c5, vmin), vmax);
c6 = v_min(v_max(c6, vmin), vmax);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
v_store(c + 12, c3);
v_store(c + 16, c4);
v_store(c + 20, c5);
v_store(c + 24, c6);
}
static void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
CV_Assert(convNR == 24);
v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0;
v_float32x4 c3 = c0, c4 = c0, c5 = c0;
for (int p = 0; p < np; p++, a++, b += convNR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
c3 = v_fma(b3, a0, c3);
c4 = v_fma(b4, a0, c4);
c5 = v_fma(b5, a0, c5);
}
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
}
if (ifMinMaxAct)
{
v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
c0 = v_min(v_max(c0, vmin), vmax);
c1 = v_min(v_max(c1, vmin), vmax);
c2 = v_min(v_max(c2, vmin), vmax);
c3 = v_min(v_max(c3, vmin), vmax);
c4 = v_min(v_max(c4, vmin), vmax);
c5 = v_min(v_max(c5, vmin), vmax);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
v_store(c + 12, c3);
v_store(c + 16, c4);
v_store(c + 20, c5);
}
static void convBlockMR1x12(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
CV_Assert(convNR == 12);
v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0;
for (int p = 0; p < np; p++, a++, b += convNR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
}
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
}
if (ifMinMaxAct)
{
v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
c0 = v_min(v_max(c0, vmin), vmax);
c1 = v_min(v_max(c1, vmin), vmax);
c2 = v_min(v_max(c2, vmin), vmax);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
}
#endif
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
{
#if CV_SIMD128
// outLen is the number of valid output elements within a convNR-wide block.
// When outLen is very small, we fall back to the no-SIMD branch.
const int convNRby3 = convNR/3;
if (outLen > convNRby3)
{
if (convNR == 28)
convBlockMR1x28(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
else if (convNR == 24)
convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
else if (convNR == 12)
convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
else
convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
}
else
convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
#else
convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR);
#endif
}
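A minimal standalone sketch, with assumed toy sizes, of the packed layout `convBlockMR1` consumes and of the value it produces when `init_c == false`; it simply mirrors the scalar fallback above:

```cpp
// Toy illustration (assumed sizes) of the packed layout convBlockMR1() consumes:
// 'a' holds np scalars, 'b' holds np consecutive rows of convNR floats, and only the
// first outLen entries of each row contribute to the output 'c'.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const int np = 3, convNR = 12, outLen = 12;          // hypothetical sizes
    const float bias = 0.5f, minval = 0.f, maxval = 6.f; // ReLU6-style clamp
    std::vector<float> a(np, 1.0f), b(np * convNR), c(outLen);
    for (int p = 0; p < np; p++)
        for (int j = 0; j < convNR; j++)
            b[p * convNR + j] = float(j);                // each packed row is 0,1,...,11
    // Scalar reference of what convBlockMR1(np, a.data(), b.data(), c.data(), bias,
    // /*init_c=*/false, minval, maxval, /*ifMinMaxAct=*/true, outLen, convNR) computes:
    for (int j = 0; j < outLen; j++)
    {
        float s = bias;
        for (int p = 0; p < np; p++)
            s += a[p] * b[p * convNR + j];
        c[j] = std::min(std::max(s, minval), maxval);
    }
    printf("c[0]=%.1f c[11]=%.1f\n", c[0], c[11]);       // 0.5 and 6.0 (clamped)
    return 0;
}
```

Each SIMD specialization above computes exactly this, four lanes at a time.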
#if CV_SIMD128
static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
c3 = v_fma(b3, a0, c3);
c4 = v_fma(b4, a0, c4);
c5 = v_fma(b5, a0, c5);
a0 = v_setall_f32(a[1]);
c6 = v_fma(b0, a0, c6);
c7 = v_fma(b1, a0, c7);
c8 = v_fma(b2, a0, c8);
c9 = v_fma(b3, a0, c9);
c10 = v_fma(b4, a0, c10);
c11 = v_fma(b5, a0, c11);
a0 = v_setall_f32(a[2]);
c12 = v_fma(b0, a0, c12);
c13 = v_fma(b1, a0, c13);
c14 = v_fma(b2, a0, c14);
c15 = v_fma(b3, a0, c15);
c16 = v_fma(b4, a0, c16);
c17 = v_fma(b5, a0, c17);
a0 = v_setall_f32(a[3]);
c18 = v_fma(b0, a0, c18);
c19 = v_fma(b1, a0, c19);
c20 = v_fma(b2, a0, c20);
c21 = v_fma(b3, a0, c21);
c22 = v_fma(b4, a0, c22);
c23 = v_fma(b5, a0, c23);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c6 += v_load(c + ldc);
c7 += v_load(c + ldc + 4);
c8 += v_load(c + ldc + 8);
c9 += v_load(c + ldc + 12);
c10 += v_load(c + ldc + 16);
c11 += v_load(c + ldc + 20);
c12 += v_load(c + ldc*2);
c13 += v_load(c + ldc*2 + 4);
c14 += v_load(c + ldc*2 + 8);
c15 += v_load(c + ldc*2 + 12);
c16 += v_load(c + ldc*2 + 16);
c17 += v_load(c + ldc*2 + 20);
c18 += v_load(c + ldc*3);
c19 += v_load(c + ldc*3 + 4);
c20 += v_load(c + ldc*3 + 8);
c21 += v_load(c + ldc*3 + 12);
c22 += v_load(c + ldc*3 + 16);
c23 += v_load(c + ldc*3 + 20);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
v_store(c + 12, c3);
v_store(c + 16, c4);
v_store(c + 20, c5);
v_store(c + ldc, c6);
v_store(c + ldc + 4, c7);
v_store(c + ldc + 8, c8);
v_store(c + ldc + 12, c9);
v_store(c + ldc + 16, c10);
v_store(c + ldc + 20, c11);
v_store(c + ldc * 2, c12);
v_store(c + ldc * 2 + 4, c13);
v_store(c + ldc * 2 + 8, c14);
v_store(c + ldc * 2 + 12, c15);
v_store(c + ldc * 2 + 16, c16);
v_store(c + ldc * 2 + 20, c17);
v_store(c + ldc * 3, c18);
v_store(c + ldc * 3 + 4, c19);
v_store(c + ldc * 3 + 8, c20);
v_store(c + ldc * 3 + 12, c21);
v_store(c + ldc * 3 + 16, c22);
v_store(c + ldc * 3 + 20, c23);
}
static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
CV_Assert(convNR >= 4);
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0;
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 a1 = v_setall_f32(a[1]);
v_float32x4 a2 = v_setall_f32(a[2]);
v_float32x4 a3 = v_setall_f32(a[3]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b0, a1, c2);
c3 = v_fma(b1, a1, c3);
c4 = v_fma(b0, a2, c4);
c5 = v_fma(b1, a2, c5);
c6 = v_fma(b0, a3, c6);
c7 = v_fma(b1, a3, c7);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + ldc);
c3 += v_load(c + ldc + 4);
c4 += v_load(c + ldc*2);
c5 += v_load(c + ldc*2 + 4);
c6 += v_load(c + ldc*3);
c7 += v_load(c + ldc*3 + 4);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + ldc, c2);
v_store(c + ldc + 4, c3);
v_store(c + ldc * 2, c4);
v_store(c + ldc * 2 + 4, c5);
v_store(c + ldc * 3, c6);
v_store(c + ldc * 3 + 4, c7);
}
static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
CV_Assert(convNR >= 4);
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 a1 = v_setall_f32(a[1]);
v_float32x4 a2 = v_setall_f32(a[2]);
v_float32x4 a3 = v_setall_f32(a[3]);
v_float32x4 b0 = v_load(b);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b0, a1, c1);
c2 = v_fma(b0, a2, c2);
c3 = v_fma(b0, a3, c3);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + ldc);
c2 += v_load(c + ldc*2);
c3 += v_load(c + ldc*3);
}
v_store(c, c0);
v_store(c + ldc, c1);
v_store(c + ldc * 2, c2);
v_store(c + ldc * 3, c3);
}
#endif
static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
const int convMR, const int convNR)
{
std::vector<float> cbuffer(convMR * outLen, 0);
float* cbuf = cbuffer.data();
for( int p = 0; p < np; p++ )
{
for( int i = 0; i < convMR; i++ )
{
float ai = a[convMR*p + i];
for( int j = 0; j < outLen; j++ )
cbuf[i * outLen+j] += b[convNR*p + j] * ai;
}
}
if (!init_c)
{
for(int i = 0; i < convMR; i++)
{
for(int j = 0; j < outLen; j++)
c[i*ldc + j] += cbuf[i*outLen + j];
}
}
else
{
for(int i = 0; i < convMR; i++)
{
for(int j = 0; j < outLen; j++)
c[i*ldc + j] = cbuf[i*outLen + j];
}
}
}
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
const int convMR, const int convNR)
{
// The expected outLen values are 24, or 1 to 8.
#if CV_SIMD128
CV_Assert(convMR == 4);
if (outLen > 8 && convNR == 24)
{
convBlock4x24(np, a, b, c, ldc, init_c, convMR, convNR);
return;
}
if (outLen <= 8 && outLen > 4)
{
convBlock4x8(np, a, b, c, ldc, init_c, convMR, convNR);
return;
}
if (outLen <= 4 && outLen > 1)
{
convBlock4x4(np, a, b, c, ldc, init_c, convMR, convNR);
return;
}
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#else
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR);
#endif
}
}} // namespace cv::dnn

@@ -22,27 +22,29 @@
// Winograd Params
enum {
_FX_WINO_STEP=6,
_FX_WINO_KSIZE=3,
_FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
_FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,
CONV_WINO_STEP=6,
CONV_WINO_KSIZE=3,
CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE-1, // 8
CONV_WINO_AREA=CONV_WINO_SIZE*CONV_WINO_SIZE,
_FX_WINO_KBLOCK = 4,
CONV_WINO_KBLOCK = 4,
#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2
_FX_WINO_IBLOCK = 6,
CONV_WINO_IBLOCK = 6,
#else
_FX_WINO_IBLOCK = 3,
CONV_WINO_IBLOCK = 3,
#endif
#if CV_TRY_AVX2
_FX_WINO_ATOM_F32 = 8,
CONV_WINO_ATOM_F32 = 8,
#else
_FX_WINO_ATOM_F32 = 4,
CONV_WINO_ATOM_F32 = 4,
#endif
_FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
};
enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2, _FX_CONV_TYPE_DEPTHWISE_REMAIN=3 };
// NOTE: CONV_TYPE_DEPTHWISE is only for 3x3 depth-wise convolution; all other depth-wise cases are set to CONV_TYPE_DEPTHWISE_REMAIN.
enum { CONV_TYPE_GENERIC=0, CONV_TYPE_DEPTHWISE=1, CONV_TYPE_WINOGRAD3X3=2, CONV_TYPE_DEPTHWISE_REMAIN=3 };
enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 };
#endif
@@ -105,22 +107,6 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& c
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
namespace opt_AVX2
{
#if CV_TRY_AVX2
void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c);
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, const float minval,
const float maxval, bool ifMinMaxAct);
void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock);
void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg);
void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
#endif
} // namespace opt_AVX2
} // namespace dnn
} // namespace cv

@@ -1,499 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../../precomp.hpp"
#include "fast_convolution.hpp"
namespace cv {
namespace dnn {
namespace opt_AVX2
{
#if CV_TRY_AVX2
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct)
{
#if CONV_NR == 24
__m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0;
for (int p = 0; p < np; p++, a++, b += CONV_NR)
{
__m256 a0 = _mm256_set1_ps(a[0]);
__m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16);
c0 = _mm256_fmadd_ps(b0, a0, c0);
c1 = _mm256_fmadd_ps(b1, a0, c1);
c2 = _mm256_fmadd_ps(b2, a0, c2);
}
if (init_c)
{
c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0);
c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1);
c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2);
}
if (ifMinMaxAct)
{
__m256 vmax = _mm256_set1_ps(maxval);
__m256 vmin = _mm256_set1_ps(minval);
c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax);
c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax);
c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax);
}
_mm256_storeu_ps(c, c0);
_mm256_storeu_ps(c + 8, c1);
_mm256_storeu_ps(c + 16, c2);
_mm256_zeroupper();
#else
#error "unsupported CONV_NR in convBlockMR1."
#endif
}
void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
#if CONV_MR == 4 && CONV_NR == 24
__m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
__m256 c10 = c00, c11 = c00, c12 = c00;
__m256 c20 = c00, c21 = c00, c22 = c00;
__m256 c30 = c00, c31 = c00, c32 = c00;
__m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
__m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
{
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
c00 = _mm256_fmadd_ps(b0, a0, c00);
c01 = _mm256_fmadd_ps(b1, a0, c01);
c02 = _mm256_fmadd_ps(b2, a0, c02);
c10 = _mm256_fmadd_ps(b0, a1, c10);
c11 = _mm256_fmadd_ps(b1, a1, c11);
c12 = _mm256_fmadd_ps(b2, a1, c12);
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
c20 = _mm256_fmadd_ps(b0, a0, c20);
c21 = _mm256_fmadd_ps(b1, a0, c21);
c22 = _mm256_fmadd_ps(b2, a0, c22);
c30 = _mm256_fmadd_ps(b0, a1, c30);
c31 = _mm256_fmadd_ps(b1, a1, c31);
c32 = _mm256_fmadd_ps(b2, a1, c32);
}
if (!init_c)
{
c00 = _mm256_add_ps(c00, _mm256_load_ps(c));
c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8));
c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16));
c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc));
c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8));
c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16));
c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2));
c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8));
c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16));
c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3));
c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8));
c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16));
}
_mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02);
_mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12);
_mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22);
_mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32);
_mm256_zeroupper();
#else
#error "unsupported CONV_MR and/or CONV_NR in convBlock_AVX2."
#endif
}
void _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
float* outbuf, int Cg, int iblock)
{
CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 8);
if (iblock > 3)
{
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
outbuf += _FX_WINO_ATOM_F32)
{
__m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
__m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
__m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
__m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32,
wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32)
{
__m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
__m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
__m256 x0, x1;
x0 = _mm256_load_ps(inwptr);
x1 = _mm256_load_ps(inwptr + 8);
s00 = _mm256_fmadd_ps(w0, x0, s00);
s01 = _mm256_fmadd_ps(w0, x1, s01);
s10 = _mm256_fmadd_ps(w1, x0, s10);
s11 = _mm256_fmadd_ps(w1, x1, s11);
s20 = _mm256_fmadd_ps(w2, x0, s20);
s21 = _mm256_fmadd_ps(w2, x1, s21);
s30 = _mm256_fmadd_ps(w3, x0, s30);
s31 = _mm256_fmadd_ps(w3, x1, s31);
x0 = _mm256_load_ps(inwptr + 16);
x1 = _mm256_load_ps(inwptr + 24);
s02 = _mm256_fmadd_ps(w0, x0, s02);
s03 = _mm256_fmadd_ps(w0, x1, s03);
s12 = _mm256_fmadd_ps(w1, x0, s12);
s13 = _mm256_fmadd_ps(w1, x1, s13);
s22 = _mm256_fmadd_ps(w2, x0, s22);
s23 = _mm256_fmadd_ps(w2, x1, s23);
s32 = _mm256_fmadd_ps(w3, x0, s32);
s33 = _mm256_fmadd_ps(w3, x1, s33);
x0 = _mm256_load_ps(inwptr + 32);
x1 = _mm256_load_ps(inwptr + 40);
s04 = _mm256_fmadd_ps(w0, x0, s04);
s05 = _mm256_fmadd_ps(w0, x1, s05);
s14 = _mm256_fmadd_ps(w1, x0, s14);
s15 = _mm256_fmadd_ps(w1, x1, s15);
s24 = _mm256_fmadd_ps(w2, x0, s24);
s25 = _mm256_fmadd_ps(w2, x1, s25);
s34 = _mm256_fmadd_ps(w3, x0, s34);
s35 = _mm256_fmadd_ps(w3, x1, s35);
}
_mm256_store_ps(outbuf, s00);
_mm256_store_ps(outbuf + 1*64, s01);
_mm256_store_ps(outbuf + 2*64, s02);
_mm256_store_ps(outbuf + 3*64, s03);
_mm256_store_ps(outbuf + 4*64, s04);
_mm256_store_ps(outbuf + 5*64, s05);
_mm256_store_ps(outbuf + 6*64, s10);
_mm256_store_ps(outbuf + 7*64, s11);
_mm256_store_ps(outbuf + 8*64, s12);
_mm256_store_ps(outbuf + 9*64, s13);
_mm256_store_ps(outbuf + 10*64, s14);
_mm256_store_ps(outbuf + 11*64, s15);
_mm256_store_ps(outbuf + 12*64, s20);
_mm256_store_ps(outbuf + 13*64, s21);
_mm256_store_ps(outbuf + 14*64, s22);
_mm256_store_ps(outbuf + 15*64, s23);
_mm256_store_ps(outbuf + 16*64, s24);
_mm256_store_ps(outbuf + 17*64, s25);
_mm256_store_ps(outbuf + 18*64, s30);
_mm256_store_ps(outbuf + 19*64, s31);
_mm256_store_ps(outbuf + 20*64, s32);
_mm256_store_ps(outbuf + 21*64, s33);
_mm256_store_ps(outbuf + 22*64, s34);
_mm256_store_ps(outbuf + 23*64, s35);
}
}
else
{
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
outbuf += _FX_WINO_ATOM_F32)
{
__m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00;
__m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00;
__m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00;
__m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00;
for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32,
wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) {
__m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
__m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
__m256 x0, x1, x2;
x0 = _mm256_load_ps(inwptr);
x1 = _mm256_load_ps(inwptr + 8);
x2 = _mm256_load_ps(inwptr + 16);
s00 = _mm256_fmadd_ps(w0, x0, s00);
s01 = _mm256_fmadd_ps(w0, x1, s01);
s02 = _mm256_fmadd_ps(w0, x2, s02);
s10 = _mm256_fmadd_ps(w1, x0, s10);
s11 = _mm256_fmadd_ps(w1, x1, s11);
s12 = _mm256_fmadd_ps(w1, x2, s12);
s20 = _mm256_fmadd_ps(w2, x0, s20);
s21 = _mm256_fmadd_ps(w2, x1, s21);
s22 = _mm256_fmadd_ps(w2, x2, s22);
s30 = _mm256_fmadd_ps(w3, x0, s30);
s31 = _mm256_fmadd_ps(w3, x1, s31);
s32 = _mm256_fmadd_ps(w3, x2, s32);
}
_mm256_store_ps(outbuf, s00);
_mm256_store_ps(outbuf + 1*64, s01);
_mm256_store_ps(outbuf + 2*64, s02);
_mm256_store_ps(outbuf + 6*64, s10);
_mm256_store_ps(outbuf + 7*64, s11);
_mm256_store_ps(outbuf + 8*64, s12);
_mm256_store_ps(outbuf + 12*64, s20);
_mm256_store_ps(outbuf + 13*64, s21);
_mm256_store_ps(outbuf + 14*64, s22);
_mm256_store_ps(outbuf + 18*64, s30);
_mm256_store_ps(outbuf + 19*64, s31);
_mm256_store_ps(outbuf + 20*64, s32);
}
}
_mm256_zeroupper();
}
static inline
void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7)
{
__m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
__m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
__t0 = _mm256_unpacklo_ps(row0, row1);
__t1 = _mm256_unpackhi_ps(row0, row1);
__t2 = _mm256_unpacklo_ps(row2, row3);
__t3 = _mm256_unpackhi_ps(row2, row3);
__t4 = _mm256_unpacklo_ps(row4, row5);
__t5 = _mm256_unpackhi_ps(row4, row5);
__t6 = _mm256_unpacklo_ps(row6, row7);
__t7 = _mm256_unpackhi_ps(row6, row7);
__tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
__tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
__tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
__tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
__tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
__tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
__tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
__tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}
/*Input transform*/
void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg)
{
__m256 x00 = _mm256_loadu_ps(inptr);
__m256 x10 = _mm256_loadu_ps(inptr + inpstep);
__m256 x20 = _mm256_loadu_ps(inptr + inpstep*2);
__m256 x30 = _mm256_loadu_ps(inptr + inpstep*3);
__m256 x40 = _mm256_loadu_ps(inptr + inpstep*4);
__m256 x50 = _mm256_loadu_ps(inptr + inpstep*5);
__m256 x60 = _mm256_loadu_ps(inptr + inpstep*6);
__m256 x70 = _mm256_loadu_ps(inptr + inpstep*7);
__m256 z00, z10, z20, z30, z40, z50, z60, z70;
{
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
__m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10;
t00 = _mm256_sub_ps(x40, x20);
t10 = _mm256_sub_ps(x30, x50);
__m256 y00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60));
__m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10));
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
__m256 qm4_25 = _mm256_set1_ps(-4.25f);
t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50));
t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60));
__m256 y10 = _mm256_add_ps(t00, t10);
__m256 y20 = _mm256_sub_ps(t10, t00);
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
__m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f);
__m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f);
t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50));
t10 = _mm256_fmadd_ps(x20, q0_25, x60);
t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
t10 = _mm256_fmadd_ps(x40, qm1_25, t10);
__m256 y30 = _mm256_add_ps(t00, t10);
__m256 y40 = _mm256_sub_ps(t10, t00);
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
__m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f);
t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10));
t10 = _mm256_fmadd_ps(x20, q4 , x60);
t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
t10 = _mm256_fmadd_ps(x40, qm5 , t10);
__m256 y50 = _mm256_add_ps(t00, t10);
__m256 y60 = _mm256_sub_ps(t10, t00);
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
t00 = _mm256_sub_ps(y40, y20);
t10 = _mm256_sub_ps(y30, y50);
z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60));
z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10));
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50));
t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60));
z10 = _mm256_add_ps(t00, t10);
z20 = _mm256_sub_ps(t10, t00);
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50));
t10 = _mm256_fmadd_ps(y20, q0_25, y60);
t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
t10 = _mm256_fmadd_ps(y40, qm1_25, t10);
z30 = _mm256_add_ps(t00, t10);
z40 = _mm256_sub_ps(t10, t00);
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10));
t10 = _mm256_fmadd_ps(y20, q4, y60);
t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
t10 = _mm256_fmadd_ps(y40, qm5, t10);
z50 = _mm256_add_ps(t00, t10);
z60 = _mm256_sub_ps(t10, t00);
}
const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg;
_mm256_storeu_ps(outptr, z00);
_mm256_storeu_ps(outptr + outstep, z10);
_mm256_storeu_ps(outptr + outstep*2, z20);
_mm256_storeu_ps(outptr + outstep*3, z30);
_mm256_storeu_ps(outptr + outstep*4, z40);
_mm256_storeu_ps(outptr + outstep*5, z50);
_mm256_storeu_ps(outptr + outstep*6, z60);
_mm256_storeu_ps(outptr + outstep*7, z70);
_mm256_zeroupper();
}
#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM) \
lowM = _mm256_castps256_ps128(z00); \
highM = _mm256_extractf128_ps(z00, 1); \
_mm_storeu_ps(ptr, lowM); \
_mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM))
/* Inverse Winograd 8x8 transform:
out = (A'*inp*A)', where
inp is input 8x8 FP32 matrix,
A' is
[1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
*/
void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
__m256 x00 = _mm256_load_ps(inptr);
__m256 x10 = _mm256_load_ps(inptr + inpstep);
__m256 x20 = _mm256_load_ps(inptr + inpstep*2);
__m256 x30 = _mm256_load_ps(inptr + inpstep*3);
__m256 x40 = _mm256_load_ps(inptr + inpstep*4);
__m256 x50 = _mm256_load_ps(inptr + inpstep*5);
__m256 x60 = _mm256_load_ps(inptr + inpstep*6);
__m256 x70 = _mm256_load_ps(inptr + inpstep*7);
__m256 z00, z10, z20, z30, z40, z50;
{
__m256 s12_0, s34_0, s56_0;
s12_0 = _mm256_add_ps(x10, x20);
s34_0 = _mm256_add_ps(x30, x40);
s56_0 = _mm256_add_ps(x50, x60);
__m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
__m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
__m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));
s12_0 = _mm256_sub_ps(x10, x20);
s34_0 = _mm256_sub_ps(x30, x40);
s56_0 = _mm256_sub_ps(x50, x60);
__m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0)));
__m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0));
__m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.f), s12_0));
__m256 y60 = _mm256_set1_ps(0.f), y70 = y60;
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);
s12_0 = _mm256_add_ps(y10, y20);
s34_0 = _mm256_add_ps(y30, y40);
s56_0 = _mm256_add_ps(y50, y60);
z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));
s12_0 = _mm256_sub_ps(y10, y20);
s34_0 = _mm256_sub_ps(y30, y40);
s56_0 = _mm256_sub_ps(y50, y60);
z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0)));
z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0));
z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0));
__m256 vbias = _mm256_set1_ps(bias);
z00 = _mm256_add_ps(vbias, z00);
z10 = _mm256_add_ps(vbias, z10);
z20 = _mm256_add_ps(vbias, z20);
z30 = _mm256_add_ps(vbias, z30);
z40 = _mm256_add_ps(vbias, z40);
z50 = _mm256_add_ps(vbias, z50);
}
if (bpptr)
{
z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep));
z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2));
z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3));
z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4));
z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5));
}
if (ifMinMaxAct)
{
__m256 vmax = _mm256_set1_ps(maxval);
__m256 vmin = _mm256_set1_ps(minval);
z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax);
z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax);
z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax);
z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax);
z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax);
z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax);
}
__m128 lowM, highM;
STORE6_ELE_FROM_16(outptr, z00, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM);
STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM);
_mm256_zeroupper();
}
#endif
} // namespace opt_AVX2
} // namespace dnn
} // namespace cv

@@ -1,567 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP
#define OPENCV_FAST_CONVOLUTION_SIMD_HPP
#include "opencv2/core/hal/intrin.hpp"
#include <opencv2/core/utils/logger.hpp>
namespace cv {
namespace dnn {
static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen)
{
std::vector<float> cbuffer(outLen, 0);
float* cbuf = cbuffer.data();
for( int p = 0; p < np; p++ )
{
float ai = a[p];
for( int j = 0; j < outLen; j++ )
cbuf[j] += b[CONV_NR*p + j] * ai;
}
if (init_c)
{
for(int j = 0; j < outLen; j++)
{
c[j] += cbuf[j] + bias;
if (ifMinMaxAct)
c[j] = std::min(std::max(c[j], minval), maxval);
}
}
else
{
for(int j = 0; j < outLen; j++)
{
c[j] = cbuf[j] + bias;
if (ifMinMaxAct)
c[j] = std::min(std::max(c[j], minval), maxval);
}
}
}
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen)
{
#if CV_SIMD128
// The outLen represents the valid output value in CONV_NR length.
// When outLen is very small, we use the no-SIMD branch.
const int CONV_NRby3 = CONV_NR/3;
if (outLen > CONV_NRby3)
{
v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; // CONV_NR == 12
#if CONV_NR == 28 || CONV_NR == 24
v_float32x4 c3 = c0, c4 = c0, c5 = c0;
#endif
#if CONV_NR == 28
v_float32x4 c6 = c0;
#endif
for (int p = 0; p < np; p++, a++, b += CONV_NR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
#if CONV_NR == 28 || CONV_NR == 24
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
#endif
#if CONV_NR == 28
v_float32x4 b6 = v_load(b + 24);
#endif
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
#if CONV_NR == 28 || CONV_NR == 24
c3 = v_fma(b3, a0, c3);
c4 = v_fma(b4, a0, c4);
c5 = v_fma(b5, a0, c5);
#endif
#if CONV_NR == 28
c6 = v_fma(b6, a0, c6);
#endif
}
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
#if CONV_NR == 28 || CONV_NR == 24
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
#endif
#if CONV_NR == 28
c6 += v_load(c + 24);
#endif
}
if (ifMinMaxAct)
{
v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
c0 = v_min(v_max(c0, vmin), vmax);
c1 = v_min(v_max(c1, vmin), vmax);
c2 = v_min(v_max(c2, vmin), vmax);
#if CONV_NR == 28 || CONV_NR == 24
c3 = v_min(v_max(c3, vmin), vmax);
c4 = v_min(v_max(c4, vmin), vmax);
c5 = v_min(v_max(c5, vmin), vmax);
#endif
#if CONV_NR == 28
c6 = v_min(v_max(c6, vmin), vmax);
#endif
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
#if CONV_NR == 28 || CONV_NR == 24
v_store(c + 12, c3);
v_store(c + 16, c4);
v_store(c + 20, c5);
#endif
#if CONV_NR == 28
v_store(c + 24, c6);
#endif
}
else
convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen);
#else
convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen);
#endif
}
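In formula form, both the SIMD branch and the scalar fallback above compute one fused output row of the convolution GEMM:

$$ c_j \leftarrow \operatorname{clip}\Big(\text{bias} + \sum_{p=0}^{np-1} a_p\, b[p \cdot CONV\_NR + j],\ \text{minval},\ \text{maxval}\Big), \qquad j = 0, \dots, \text{outLen}-1, $$

where the clip is applied only when `ifMinMaxAct` is set, and with `init_c == true` the previous contents of `c_j` are added before the activation.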
#if CV_SIMD128
#if CONV_MR == 4 && CONV_NR == 24
static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
c3 = v_fma(b3, a0, c3);
c4 = v_fma(b4, a0, c4);
c5 = v_fma(b5, a0, c5);
a0 = v_setall_f32(a[1]);
c6 = v_fma(b0, a0, c6);
c7 = v_fma(b1, a0, c7);
c8 = v_fma(b2, a0, c8);
c9 = v_fma(b3, a0, c9);
c10 = v_fma(b4, a0, c10);
c11 = v_fma(b5, a0, c11);
a0 = v_setall_f32(a[2]);
c12 = v_fma(b0, a0, c12);
c13 = v_fma(b1, a0, c13);
c14 = v_fma(b2, a0, c14);
c15 = v_fma(b3, a0, c15);
c16 = v_fma(b4, a0, c16);
c17 = v_fma(b5, a0, c17);
a0 = v_setall_f32(a[3]);
c18 = v_fma(b0, a0, c18);
c19 = v_fma(b1, a0, c19);
c20 = v_fma(b2, a0, c20);
c21 = v_fma(b3, a0, c21);
c22 = v_fma(b4, a0, c22);
c23 = v_fma(b5, a0, c23);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c6 += v_load(c + ldc);
c7 += v_load(c + ldc + 4);
c8 += v_load(c + ldc + 8);
c9 += v_load(c + ldc + 12);
c10 += v_load(c + ldc + 16);
c11 += v_load(c + ldc + 20);
c12 += v_load(c + ldc*2);
c13 += v_load(c + ldc*2 + 4);
c14 += v_load(c + ldc*2 + 8);
c15 += v_load(c + ldc*2 + 12);
c16 += v_load(c + ldc*2 + 16);
c17 += v_load(c + ldc*2 + 20);
c18 += v_load(c + ldc*3);
c19 += v_load(c + ldc*3 + 4);
c20 += v_load(c + ldc*3 + 8);
c21 += v_load(c + ldc*3 + 12);
c22 += v_load(c + ldc*3 + 16);
c23 += v_load(c + ldc*3 + 20);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
v_store(c + 12, c3);
v_store(c + 16, c4);
v_store(c + 20, c5);
v_store(c + ldc, c6);
v_store(c + ldc + 4, c7);
v_store(c + ldc + 8, c8);
v_store(c + ldc + 12, c9);
v_store(c + ldc + 16, c10);
v_store(c + ldc + 20, c11);
v_store(c + ldc * 2, c12);
v_store(c + ldc * 2 + 4, c13);
v_store(c + ldc * 2 + 8, c14);
v_store(c + ldc * 2 + 12, c15);
v_store(c + ldc * 2 + 16, c16);
v_store(c + ldc * 2 + 20, c17);
v_store(c + ldc * 3, c18);
v_store(c + ldc * 3 + 4, c19);
v_store(c + ldc * 3 + 8, c20);
v_store(c + ldc * 3 + 12, c21);
v_store(c + ldc * 3 + 16, c22);
v_store(c + ldc * 3 + 20, c23);
}
#endif
static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
CV_Assert(CONV_NR >= 4);
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0;
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 a1 = v_setall_f32(a[1]);
v_float32x4 a2 = v_setall_f32(a[2]);
v_float32x4 a3 = v_setall_f32(a[3]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b0, a1, c2);
c3 = v_fma(b1, a1, c3);
c4 = v_fma(b0, a2, c4);
c5 = v_fma(b1, a2, c5);
c6 = v_fma(b0, a3, c6);
c7 = v_fma(b1, a3, c7);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + ldc);
c3 += v_load(c + ldc + 4);
c4 += v_load(c + ldc*2);
c5 += v_load(c + ldc*2 + 4);
c6 += v_load(c + ldc*3);
c7 += v_load(c + ldc*3 + 4);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + ldc, c2);
v_store(c + ldc + 4, c3);
v_store(c + ldc * 2, c4);
v_store(c + ldc * 2 + 4, c5);
v_store(c + ldc * 3, c6);
v_store(c + ldc * 3 + 4, c7);
}
static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
CV_Assert(CONV_NR >= 4);
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 a1 = v_setall_f32(a[1]);
v_float32x4 a2 = v_setall_f32(a[2]);
v_float32x4 a3 = v_setall_f32(a[3]);
v_float32x4 b0 = v_load(b);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b0, a1, c1);
c2 = v_fma(b0, a2, c2);
c3 = v_fma(b0, a3, c3);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + ldc);
c2 += v_load(c + ldc*2);
c3 += v_load(c + ldc*3);
}
v_store(c, c0);
v_store(c + ldc, c1);
v_store(c + ldc * 2, c2);
v_store(c + ldc * 3, c3);
}
#endif
static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen)
{
std::vector<float> cbuffer(CONV_MR * outLen, 0);
float* cbuf = cbuffer.data();
for( int p = 0; p < np; p++ )
{
for( int i = 0; i < CONV_MR; i++ )
{
float ai = a[CONV_MR*p + i];
for( int j = 0; j < outLen; j++ )
cbuf[i * outLen+j] += b[CONV_NR*p + j] * ai;
}
}
if (!init_c)
{
for(int i = 0; i < CONV_MR; i++)
{
for(int j = 0; j < outLen; j++)
c[i*ldc + j] += cbuf[i*outLen + j];
}
}
else
{
for(int i = 0; i < CONV_MR; i++)
{
for(int j = 0; j < outLen; j++)
c[i*ldc + j] = cbuf[i*outLen + j];
}
}
}
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen)
{
// The possible outLen values are 24 (a full CONV_NR block) or 1~8 (a tail).
#if CV_SIMD128
#if CONV_MR == 4 && CONV_NR == 24
const int CONV_NRby3 = CONV_NR/3;
if (outLen > CONV_NRby3)
{
convBlock4x24(np, a, b, c, ldc, init_c);
return;
}
#endif
if (outLen <= 8 && outLen > 4)
{
convBlock4x8(np, a, b, c, ldc, init_c);
return;
}
if (outLen <= 4 && outLen > 1)
{
convBlock4x4(np, a, b, c, ldc, init_c);
return;
}
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen);
#else
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen);
#endif
}
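All of the kernels above, including the scalar `convBlockNoSIMD` fallback, perform the same rank-`np` update of a packed `CONV_MR x CONV_NR` output tile:

$$ C_{i,j} \mathrel{+}= \sum_{p=0}^{np-1} A_{p,i}\, B_{p,j}, \qquad A_{p,i} = a[p \cdot CONV\_MR + i],\quad B_{p,j} = b[p \cdot CONV\_NR + j],\quad C_{i,j} = c[i \cdot ldc + j], $$

for `i < CONV_MR` and `j < outLen`; the accumulation into the previous contents of `C` happens only when `init_c` is false, otherwise the tile is overwritten.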
} // namespace dnn
namespace opt_NEON
{
#if CV_TRY_NEON
void convBlock_NEON(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
#if CONV_MR == 4 && CONV_NR == 28 // AARCH64
{
float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00;
float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10;
float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20;
float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30;
for( int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR )
{
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
c32 = vfmaq_laneq_f32(c32, b2, a0, 3);
b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);
c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
c35 = vfmaq_laneq_f32(c35, b2, a0, 3);
b0 = vld1q_f32(b + 24);
c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
}
if (!init_c)
{
c00 = vaddq_f32(c00, vld1q_f32(c));
c01 = vaddq_f32(c01, vld1q_f32(c + 4));
c02 = vaddq_f32(c02, vld1q_f32(c + 8));
c03 = vaddq_f32(c03, vld1q_f32(c + 12));
c04 = vaddq_f32(c04, vld1q_f32(c + 16));
c05 = vaddq_f32(c05, vld1q_f32(c + 20));
c06 = vaddq_f32(c06, vld1q_f32(c + 24));
c10 = vaddq_f32(c10, vld1q_f32(c + ldc));
c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4));
c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8));
c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12));
c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16));
c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20));
c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24));
c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2));
c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4));
c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8));
c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12));
c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16));
c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20));
c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24));
c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3));
c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4));
c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8));
c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12));
c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16));
c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20));
c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24));
}
vst1q_f32(c, c00); vst1q_f32(c+4, c01);
vst1q_f32(c+8, c02); vst1q_f32(c+12, c03);
vst1q_f32(c+16, c04); vst1q_f32(c+20, c05);
vst1q_f32(c+24, c06);
vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11);
vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13);
vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15);
vst1q_f32(c+ldc+24, c16);
vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21);
vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23);
vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25);
vst1q_f32(c+ldc*2+24, c26);
vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31);
vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33);
vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35);
vst1q_f32(c+ldc*3+24, c36);
}
#elif CONV_MR == 4 && CONV_NR == 12 // ARMv7
{
float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0;
float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3;
float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6;
float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9;
float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
{
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
c1 = vmlaq_lane_f32(c1, b1, a0, 0);
c2 = vmlaq_lane_f32(c2, b2, a0, 0);
c3 = vmlaq_lane_f32(c3, b0, a0, 1);
c4 = vmlaq_lane_f32(c4, b1, a0, 1);
c5 = vmlaq_lane_f32(c5, b2, a0, 1);
c6 = vmlaq_lane_f32(c6, b0, a1, 0);
c7 = vmlaq_lane_f32(c7, b1, a1, 0);
c8 = vmlaq_lane_f32(c8, b2, a1, 0);
c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
c10 = vmlaq_lane_f32(c10, b1, a1, 1);
c11 = vmlaq_lane_f32(c11, b2, a1, 1);
}
if (!init_c)
{
c0 = vaddq_f32(c0, vld1q_f32(c));
c1 = vaddq_f32(c1, vld1q_f32(c + 4));
c2 = vaddq_f32(c2, vld1q_f32(c + 8));
c3 = vaddq_f32(c3, vld1q_f32(c + ldc));
c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4));
c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8));
c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2));
c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4));
c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8));
c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3));
c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4));
c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8));
}
vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2);
vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5);
vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8);
vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11);
}
//#else
//#error "unsupported CONV_MR and/or CONV_NR in convBlock_NEON."
#endif
}
#endif
} // namespace opt_NEON
} // namespace cv
#endif //OPENCV_FAST_CONVOLUTION_SIMD_HPP

@ -46,16 +46,6 @@ namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void fastDepthwiseConv( const float* weights,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* bias, const float* relu,
const float* inptr,
int height, int width,
float* outptr,
int out_d, int outH, int outW );
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
@ -70,185 +60,6 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = _mm256_loadu_ps(ptr);
__m256 t1 = _mm256_loadu_ps(ptr + 8);
__m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
__m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
a = _mm256_shuffle_ps(lo, hi, 0x88);
b = _mm256_shuffle_ps(lo, hi, 0xdd);
}
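`_mm256_load_deinterleave` splits 16 consecutive floats into their even- and odd-indexed halves, so that in the `stride_w == 2` path each vector lane sees every second input pixel. For reference, a scalar sketch of the same operation (the helper name is illustrative):

```cpp
// Scalar equivalent of the 8-lane deinterleave used by the stride-2 path:
// a[k] = ptr[2*k], b[k] = ptr[2*k + 1] for k = 0..7.
static inline void load_deinterleave_scalar(const float* ptr, float a[8], float b[8])
{
    for (int k = 0; k < 8; ++k)
    {
        a[k] = ptr[2 * k];
        b[k] = ptr[2 * k + 1];
    }
}
```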
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
__m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
v10 = _mm256_loadu_ps(imgptr1 + in_j),
v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
v20 = _mm256_loadu_ps(imgptr2 + in_j),
v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
_mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
_mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
_mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
_mm256_zeroupper();
}
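For clarity, here is a minimal scalar reference for a single output pixel of the 3x3 depthwise convolution implemented above, under the assumption that out-of-range input samples are treated as zero padding (which is what the weight zeroing and the `s0/s1/s2` factors implement at the borders):

```cpp
// Naive reference for one output pixel of the 3x3 depthwise convolution.
// Parameter names follow fastDepthwiseConv; w holds the 3x3 kernel row-major.
static inline float dwconv3x3_ref(const float* in, int height, int width,
                                  const float w[9], float bias,
                                  int out_y, int out_x,
                                  int stride_h, int stride_w,
                                  int dilation_h, int dilation_w,
                                  int pad_t, int pad_l,
                                  const float* relu, float relu_coeff)
{
    float out = bias;
    for (int i = 0; i < 3; ++i)
        for (int j = 0; j < 3; ++j)
        {
            int yi = out_y * stride_h - pad_t + i * dilation_h;
            int xj = out_x * stride_w - pad_l + j * dilation_w;
            if (yi >= 0 && yi < height && xj >= 0 && xj < width)
                out += w[i * 3 + j] * in[yi * width + xj];
        }
    if (relu)
        out = out > 0.f ? out : out * relu_coeff;  // leaky activation with per-channel slope
    return out;
}
```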
// Table used to generate the mask applied when processing the tail elements
static const uint32_t tailMaskArray[15] = {
0, 0, 0, 0, 0, 0, 0, 0,
@ -654,382 +465,10 @@ void fastGEMM1T( const float* vec, const float* weights,
}
}
/*
Example for load_deinterleave:
input: ptr[16] = {1,2,3, ... ,14,15,16}
output: a = {1, 3, 5, 7, 9, 11, 13, 15}
output: b = {2, 4, 6, 8,10, 12, 14, 16}
*/
static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl)
{
vuint64m4_t mask = vmv_v_x_u64m4(1,vl*2);
vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask);
vbool8_t mask0 = vmseq_vx_u32m4_b8 (mask_re, 1, vl*2);
vbool8_t mask1 = vmseq_vx_u32m4_b8 (mask_re, 0, vl*2);
vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4();
vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2);
tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2);
tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2);
/* The following intrinsics are not yet supported by the GNU toolchain,
so we temporarily use a store followed by a load instead.
// a = vlmul_trunc_v_f32m4_f32m2(tempa);
// b = vlmul_trunc_v_f32m4_f32m2(tempb);
*/
cv::AutoBuffer<float> cvBuffer(sizeof(float)*vl*2);
float* buffer = (float*)cvBuffer.data();
vse32_v_f32m4(buffer, tempa, vl);
a = vle32_v_f32m2(buffer, vl);
vse32_v_f32m4(buffer, tempb, vl);
b = vle32_v_f32m2(buffer, vl);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
int vl;
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
int avl = outW1 - out_j;
if( stride_w == 1 )
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl),
v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl),
v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl),
v10 = vle32_v_f32m2(imgptr1 + in_j, vl),
v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl),
v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl),
v20 = vle32_v_f32m2(imgptr2 + in_j, vl),
v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl),
v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl);
vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
vout0 = vfadd_vf_f32m2(vout0, bias, vl);
vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);
vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);
vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
if (relu)
{
vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
}
vse32_v_f32m2(outptr + out_j, vout0, vl);
}
else //stride_w == 2 && dilation_w == 1
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl);
vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl);
vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl);
vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl);
vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl);
vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl);
vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
vout0 = vfadd_vf_f32m2(vout0, bias, vl);
vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);
vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);
vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
if (relu)
{
vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
}
vse32_v_f32m2(outptr + out_j, vout0, vl);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
#endif // CV_RVV
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = (__m256)__lasx_xvld(ptr, 0);
__m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);
__m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
__m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);
a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
__m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
// dst = vec * weights^t + bias
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
