Merge pull request #23192 from zihaomu:clean_up_SIMD_code
### Purpose of this PR

- Move all dispatch and SIMD code of the convolution layer into `simd.hpp` files.
- Support Winograd on AVX-only machines.
- Rename the folder from `fast_conv` to `cpu_kernels`, so that CPU optimizations of other layers, such as `GEMM` or `MatMul`, can be placed there in the future.

## Performance Test

Since this patch just focuses on code organization, performance is expected to be the same as before (for reference, scalar sketches of the two kernels being moved follow the results tables below). Tested with the following script:

`./bin/opencv_perf_dnn '--gtest_filter=*conv*' --gtest_output="xml:../1-0th.xml" --perf_threads=1`

### Test on X86 platform

Min (ms)

|Name of Test|4.x|patch|4.x vs patch (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|0.98|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|0.95|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.001|0.001|0.97|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.002|0.002|1.04|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.002|0.002|0.94|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.040|0.044|0.93|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.010|0.010|1.00|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.106|0.103|1.03|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.041|0.040|1.03|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.340|0.329|1.03|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.590|0.567|1.04|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.374|1.314|1.05|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.715|3.528|1.05|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.181|1.166|1.01|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.689|2.587|1.04|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.754|4.500|1.06|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|9.612|9.112|1.05|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.000|64.676|1.07|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|20.248|18.451|1.10|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|1.395|1.392|1.00|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|1.990|1.984|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.393|1.360|1.02|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|1.813|1.744|1.04|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.190|1.191|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.286|1.284|1.00|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.295|2.279|1.01|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.322|1.331|0.99|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|3.784|3.533|1.07|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.838|1.844|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.957|1.959|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|2.596|2.573|1.01|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|4.183|4.083|1.02|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.413|2.406|1.00|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|2.538|2.546|1.00|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.972|2.980|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|3.452|3.464|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|3.082|3.105|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.043|3.919|1.03|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|5.538|5.531|1.00|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.393|3.418|0.99|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|4.325|4.234|1.02|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|6.009|5.908|1.02|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|6.557|6.376|1.03|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|10.114|9.472|1.07|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|10.373|9.879|1.05|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|12.782|11.624|1.10|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|90.931|90.552|1.00|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|6.091|5.818|1.05|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|7.083|6.643|1.07|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.054|5.059|1.00|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|5.005|4.931|1.02|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|4.951|5.065|0.98|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|11.957|11.293|1.06|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.328|5.250|1.01|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|5.544|5.292|1.05|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|6.186|5.893|1.05|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|6.153|5.834|1.05|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|8.154|8.107|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.699|12.256|1.04|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|11.355|11.217|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.062|17.814|1.07|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|6.820|6.531|1.04|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|14.502|13.483|1.08|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|6.270|6.123|1.02|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|13.173|12.451|1.06|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|8.326|7.652|1.09|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.605|16.465|1.07|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|15.675|14.771|1.06|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.420|0.423|0.99|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|6.788|6.491|1.05|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|6.456|6.168|1.05|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.263|0.261|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|7.690|7.398|1.04|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.200|0.202|0.99|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.542|10.464|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|10.876|10.728|1.01|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|7.194|6.768|1.06|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|7.099|6.731|1.05|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.147|0.162|0.91|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|18.558|17.141|1.08|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|7.641|7.219|1.06|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|22.666|20.999|1.08|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|8.523|7.921|1.08|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|8.514|8.109|1.05|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|8.300|7.878|1.05|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|13.403|13.131|1.02|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|8.920|8.357|1.07|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|28.827|27.616|1.04|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|12.895|12.670|1.02|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|14.120|13.078|1.08|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|27.541|27.582|1.00|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|32.367|31.140|1.04|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|14.934|14.910|1.00|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|18.289|18.491|0.99|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|37.857|36.845|1.03|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|37.402|36.566|1.02|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|19.031|19.164|0.99|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.019|19.135|0.99|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.077|19.400|1.03|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.883|21.302|1.03|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|51.288|49.851|1.03|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|27.349|28.359|0.96|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|24.915|25.130|0.99|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|25.488|25.899|0.98|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|27.346|27.390|1.00|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|28.033|28.301|0.99|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|50.216|49.970|1.00|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|29.670|29.513|1.01|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|50.565|49.634|1.02|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|37.900|37.814|1.00|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|41.367|39.742|1.04|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|49.128|50.350|0.98|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|79.643|80.645|0.99|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|41.439|40.895|1.01|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|46.504|46.220|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|98.086|96.842|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|102.447|97.299|1.05|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|145.047|144.996|1.00|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|206.104|195.543|1.05|
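For readers who don't know the kernels being moved: every `convBlock` variant in the patch (AVX, NEON, and the universal-intrinsics path) computes the same micro-tile product. Below is a minimal scalar sketch of that contract, written for this description only; the name `convBlockRef` is illustrative and not part of the patch.

```cpp
// Scalar reference for the convBlock micro-kernel: C (convMR x convNR, row
// stride ldc) accumulates the product of np packed steps, where step p uses
// convMR values of a and convNR values of b. init_c == true overwrites C,
// init_c == false adds to whatever is already stored there.
void convBlockRef(int np, const float* a, const float* b, float* c,
                  int ldc, bool init_c, int convMR, int convNR)
{
    for (int i = 0; i < convMR; i++)
        for (int j = 0; j < convNR; j++)
        {
            float s = init_c ? 0.f : c[i*ldc + j];
            for (int p = 0; p < np; p++)
                s += a[p*convMR + i] * b[p*convNR + j];
            c[i*ldc + j] = s;
        }
}
```

The SIMD variants in the diff unroll exactly this loop nest for convMR = 4 and convNR = 24 (AVX), 28 (AArch64), or 12 (ARMv7).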
### Test on M1 (ARM) platform

|Name of Test|4.x|patch|4.x vs patch (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|0.97|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|0.94|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.002|0.002|0.92|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.003|0.003|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.003|0.003|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.031|0.031|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.009|0.009|1.00|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.066|0.066|1.01|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.102|0.102|1.00|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.328|0.328|1.00|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.693|0.747|0.93|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.268|1.266|1.00|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.530|3.581|0.99|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.186|1.188|1.00|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.682|2.683|1.00|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.490|4.501|1.00|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|8.914|8.938|1.00|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.819|69.876|1.00|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|24.058|22.420|1.07|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|2.240|2.236|1.00|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|3.132|3.136|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.920|1.919|1.00|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.343|2.346|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.234|1.116|1.11|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.109|1.121|0.99|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|3.197|3.084|1.04|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.123|1.148|0.98|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|4.836|5.061|0.96|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.535|1.463|1.05|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.756|1.584|1.11|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|1.821|1.820|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|7.049|6.672|1.06|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.967|1.922|1.02|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|1.943|1.977|0.98|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.464|2.310|1.07|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|2.860|2.904|0.98|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|2.428|2.483|0.98|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|2.955|2.983|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|4.328|4.484|0.97|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.712|2.778|0.98|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|3.205|3.331|0.96|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|4.193|4.412|0.95|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|5.026|4.565|1.10|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|14.490|14.213|1.02|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|14.886|14.003|1.06|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|15.923|15.184|1.05|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|45.136|41.696|1.08|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.995|4.631|1.08|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.402|6.261|1.02|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|4.478|3.965|1.13|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.908|3.978|0.98|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|4.176|4.206|0.99|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|21.509|21.136|1.02|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|4.426|4.082|1.08|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|4.098|4.289|0.96|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|4.646|5.105|0.91|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|4.746|4.724|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|5.614|5.779|0.97|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|21.909|20.718|1.06|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|8.256|8.290|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|25.196|23.267|1.08|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|5.721|5.172|1.11|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|20.066|18.322|1.10|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|4.448|4.542|0.98|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.193|19.013|1.01|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|6.009|5.964|1.01|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|20.169|20.009|1.01|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|22.584|23.423|0.96|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.372|0.504|0.74|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|5.426|5.456|0.99|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|4.945|5.221|0.95|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.210|0.261|0.81|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|5.720|5.997|0.95|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.149|0.161|0.93|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|7.154|7.225|0.99|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|7.184|7.223|0.99|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|5.324|5.343|1.00|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|5.114|5.238|0.98|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.111|0.121|0.92|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|25.907|26.804|0.97|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|5.695|5.654|1.01|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|27.435|27.566|1.00|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|6.944|6.164|1.13|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|7.180|6.717|1.07|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|6.817|6.050|1.13|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|9.225|8.660|1.07|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|7.496|6.625|1.13|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|35.520|36.056|0.99|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|9.990|9.702|1.03|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|10.517|10.746|0.98|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|36.702|36.731|1.00|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|41.035|38.280|1.07|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.981|10.573|1.04|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|12.863|12.384|1.04|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|50.437|54.088|0.93|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|50.650|50.635|1.00|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|14.696|14.606|1.01|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|16.201|15.426|1.05|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|16.061|14.292|1.12|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|17.743|18.250|0.97|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|77.909|78.165|1.00|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|21.579|21.879|0.99|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|20.424|19.589|1.04|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.389|19.461|1.00|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.319|20.358|1.05|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|22.609|21.826|1.04|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|25.497|25.789|0.99|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|21.966|22.108|0.99|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|35.883|33.470|1.07|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|31.041|29.314|1.06|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|29.922|28.145|1.06|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|31.624|31.148|1.02|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|38.564|39.164|0.98|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|31.502|30.269|1.04|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|34.248|34.589|0.99|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|130.211|134.120|0.97|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|127.490|132.874|0.96|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|199.834|200.081|1.00|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|247.346|247.523|1.00|
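Likewise, here is a hedged scalar sketch of the per-pixel computation that each `fastDepthwiseConv` variant (AVX, RVV, LASX) vectorizes. Border handling aside, `depthwisePixelRef` (an illustrative name, not in the patch) is what the kernels compute for every output element:

```cpp
// One output pixel of the 3x3 depthwise convolution: a dot product of the 9
// weights with a dilated 3x3 input window, plus bias, followed by leaky ReLU
// with slope relu_coeff when a relu pointer is supplied.
float depthwisePixelRef(const float* w,    // 9 kernel weights, row-major
                        const float* img,  // input plane, row stride = width
                        int width, int in_i, int in_j,
                        int dilation_h, int dilation_w,
                        float bias, const float* relu, float relu_coeff)
{
    float out = bias;
    for (int r = 0; r < 3; r++)
        for (int s = 0; s < 3; s++)
            out += w[r*3 + s] * img[(in_i + r*dilation_h)*width + (in_j + s*dilation_w)];
    if (relu)
        out = out > 0.f ? out : out*relu_coeff;
    return out;
}
```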
### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake

```
force_builders=Linux AVX2,Custom Win
build_image:Custom Win=msvs2019
CPU_BASELINE:Custom Win=AVX512_SKX
```
parent c6e5f60525
commit e03e2e7f94

13 changed files with 3167 additions and 2976 deletions
@@ -0,0 +1,259 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "opencv2/core/hal/intrin.hpp"

namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX

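// Note: without FMA3, _mm256_fmadd_ps is emulated below with a separate
// multiply and add; the intermediate product is rounded, so results can
// differ from a true fused multiply-add in the last bit.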
#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif

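// AVX kernel for a 4x24 output micro-tile: each of the 4 rows of C is held in
// three 8-float accumulators (12 vector registers in total), while elements of
// A are broadcast one scalar at a time against 24-wide rows of B.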
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
    CV_Assert(convMR == 4 && convNR == 24);
    __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
    __m256 c10 = c00, c11 = c00, c12 = c00;
    __m256 c20 = c00, c21 = c00, c22 = c00;
    __m256 c30 = c00, c31 = c00, c32 = c00;

    __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
    __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();

    for (int p = 0; p < np; p++, a += convMR, b += convNR)
    {
        a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
        b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);

        c00 = _mm256_fmadd_ps(b0, a0, c00);
        c01 = _mm256_fmadd_ps(b1, a0, c01);
        c02 = _mm256_fmadd_ps(b2, a0, c02);

        c10 = _mm256_fmadd_ps(b0, a1, c10);
        c11 = _mm256_fmadd_ps(b1, a1, c11);
        c12 = _mm256_fmadd_ps(b2, a1, c12);

        a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);

        c20 = _mm256_fmadd_ps(b0, a0, c20);
        c21 = _mm256_fmadd_ps(b1, a0, c21);
        c22 = _mm256_fmadd_ps(b2, a0, c22);

        c30 = _mm256_fmadd_ps(b0, a1, c30);
        c31 = _mm256_fmadd_ps(b1, a1, c31);
        c32 = _mm256_fmadd_ps(b2, a1, c32);
    }

    if (!init_c)
    {
        c00 = _mm256_add_ps(c00, _mm256_load_ps(c));
        c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8));
        c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16));

        c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc));
        c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8));
        c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16));

        c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2));
        c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8));
        c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16));

        c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3));
        c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8));
        c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16));
    }

    _mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02);
    _mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12);
    _mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22);
    _mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32);
    _mm256_zeroupper();
}

#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END

// NEON workaround.
namespace opt_NEON
{
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON

void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
#if CV_NEON_AARCH64
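    // AArch64 kernel for a 4x28 micro-tile: 7 float32x4 accumulators per row
    // of C (28 of the 32 available NEON registers), with the 4 values of A
    // broadcast per step via the lane-indexed fused multiply-add
    // vfmaq_laneq_f32.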
    if (convMR == 4 && convNR == 28) // AARCH64
    {
        float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00;
        float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10;
        float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20;
        float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30;

        for( int p = 0; p < np; p++, a += convMR, b += convNR )
        {
            float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
            b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);

            c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
            c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
            c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
            c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
            c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
            c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
            c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
            c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
            c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
            c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
            c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
            c32 = vfmaq_laneq_f32(c32, b2, a0, 3);

            b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);

            c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
            c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
            c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
            c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
            c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
            c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
            c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
            c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
            c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
            c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
            c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
            c35 = vfmaq_laneq_f32(c35, b2, a0, 3);

            b0 = vld1q_f32(b + 24);
            c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
            c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
            c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
            c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
        }

        if (!init_c)
        {
            c00 = vaddq_f32(c00, vld1q_f32(c));
            c01 = vaddq_f32(c01, vld1q_f32(c + 4));
            c02 = vaddq_f32(c02, vld1q_f32(c + 8));
            c03 = vaddq_f32(c03, vld1q_f32(c + 12));
            c04 = vaddq_f32(c04, vld1q_f32(c + 16));
            c05 = vaddq_f32(c05, vld1q_f32(c + 20));
            c06 = vaddq_f32(c06, vld1q_f32(c + 24));

            c10 = vaddq_f32(c10, vld1q_f32(c + ldc));
            c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4));
            c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8));
            c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12));
            c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16));
            c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20));
            c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24));

            c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2));
            c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4));
            c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8));
            c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12));
            c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16));
            c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20));
            c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24));

            c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3));
            c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4));
            c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8));
            c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12));
            c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16));
            c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20));
            c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24));
        }

        vst1q_f32(c, c00); vst1q_f32(c+4, c01);
        vst1q_f32(c+8, c02); vst1q_f32(c+12, c03);
        vst1q_f32(c+16, c04); vst1q_f32(c+20, c05);
        vst1q_f32(c+24, c06);

        vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11);
        vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13);
        vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15);
        vst1q_f32(c+ldc+24, c16);

        vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21);
        vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23);
        vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25);
        vst1q_f32(c+ldc*2+24, c26);

        vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31);
        vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33);
        vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35);
        vst1q_f32(c+ldc*3+24, c36);
    }
    else
#endif
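    // ARMv7 kernel for a 4x12 micro-tile: 32-bit NEON has only 16 q-registers,
    // so 3 accumulators per row are used and A is broadcast from d-registers
    // with vmlaq_lane_f32 (multiply-accumulate; the AArch64-only
    // vfmaq_laneq_f32 is not available here).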
    if (convMR == 4 && convNR == 12) // ARMv7
    {
        float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0;
        float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3;
        float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6;
        float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9;

        float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);

        for (int p = 0; p < np; p++, a += convMR, b += convNR)
        {
            a0 = vld1_f32(a), a1 = vld1_f32(a+2);
            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);

            c0 = vmlaq_lane_f32(c0, b0, a0, 0);
            c1 = vmlaq_lane_f32(c1, b1, a0, 0);
            c2 = vmlaq_lane_f32(c2, b2, a0, 0);

            c3 = vmlaq_lane_f32(c3, b0, a0, 1);
            c4 = vmlaq_lane_f32(c4, b1, a0, 1);
            c5 = vmlaq_lane_f32(c5, b2, a0, 1);

            c6 = vmlaq_lane_f32(c6, b0, a1, 0);
            c7 = vmlaq_lane_f32(c7, b1, a1, 0);
            c8 = vmlaq_lane_f32(c8, b2, a1, 0);

            c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
            c10 = vmlaq_lane_f32(c10, b1, a1, 1);
            c11 = vmlaq_lane_f32(c11, b2, a1, 1);
        }

        if (!init_c)
        {
            c0 = vaddq_f32(c0, vld1q_f32(c));
            c1 = vaddq_f32(c1, vld1q_f32(c + 4));
            c2 = vaddq_f32(c2, vld1q_f32(c + 8));

            c3 = vaddq_f32(c3, vld1q_f32(c + ldc));
            c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4));
            c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8));

            c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2));
            c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4));
            c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8));

            c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3));
            c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4));
            c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8));
        }

        vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2);
        vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5);
        vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8);
        vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11);
    }
    else
        CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock");
}

#endif
}
}} // namespace cv::dnn
@@ -0,0 +1,591 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "opencv2/core/hal/intrin.hpp"

namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

void fastDepthwiseConv(const float* weights,
                       int kernel_h, int kernel_w,
                       int stride_h, int stride_w,
                       int dilation_h, int dilation_w,
                       int pad_t, int pad_l,
                       const float* bias, const float* relu,
                       const float* inptr,
                       int height, int width,
                       float* outptr,
                       int out_d, int outH, int outW);

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX

#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif

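// Splits 16 consecutive floats into their even-indexed (a) and odd-indexed (b)
// elements; the stride_w == 2 path below uses this to gather every second
// pixel with plain vector loads.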
static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
    __m256 t0 = _mm256_loadu_ps(ptr);
    __m256 t1 = _mm256_loadu_ps(ptr + 8);

    __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
    __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
    a = _mm256_shuffle_ps(lo, hi, 0x88);
    b = _mm256_shuffle_ps(lo, hi, 0xdd);
}

void fastDepthwiseConv( const float* wptr,
                        int kernel_h, int kernel_w,
                        int stride_h, int stride_w,
                        int dilation_h, int dilation_w,
                        int pad_t, int pad_l,
                        const float* biasptr, const float* relu,
                        const float* inptr_,
                        int height, int width,
                        float* outptr_,
                        int out_d, int outH, int outW )
{
    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const float* imgptr0 = inptr_ + in_i*width;
        const float* imgptr1 = imgptr0 + dilation_h*width;
        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
        float out, w00 = w00_, w01 = w01_, w02 = w02_;
        float w20 = w20_, w21 = w21_, w22 = w22_;
        if (in_i < 0)
        {
            w00 = w01 = w02 = 0.f;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h-1) >= height)
        {
            w20 = w21 = w22 = 0.f;
            imgptr2 = imgptr1;
        }
        float* outptr = outptr_ + out_i*outW;
        if (pad_l > 0)
        {
            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[0] = out;
            out_j = 1;
        }

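        // Vectorized main loop (stride 1, or stride 2 with dilation 1). When
        // fewer than VECSZ outputs remain, out_j steps back so the last vector
        // overlaps columns that were already computed rather than reading past
        // the end of the row.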
        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
        {
            const int VECSZ = 8;
            __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
                   vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
                   vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
            __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);

            if( stride_w == 1 )
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
                           v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
                           v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
                           v10 = _mm256_loadu_ps(imgptr1 + in_j),
                           v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
                           v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
                           v20 = _mm256_loadu_ps(imgptr2 + in_j),
                           v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
                           v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);

                    __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
                    __m256 vout1 = _mm256_mul_ps(v01, vw01);
                    __m256 vout2 = _mm256_mul_ps(v02, vw02);

                    vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
                    vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
                    vout2 = _mm256_fmadd_ps(v12, vw12, vout2);

                    vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
                    vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
                    vout2 = _mm256_fmadd_ps(v22, vw22, vout2);

                    vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
                        vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
                    }
                    _mm256_storeu_ps(outptr + out_j, vout0);
                }
            else
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
                    _mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
                    _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
                    _mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
                    _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
                    _mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
                    _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);

                    __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
                    __m256 vout1 = _mm256_mul_ps(v01, vw01);
                    __m256 vout2 = _mm256_mul_ps(v02, vw02);

                    vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
                    vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
                    vout2 = _mm256_fmadd_ps(v12, vw12, vout2);

                    vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
                    vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
                    vout2 = _mm256_fmadd_ps(v22, vw22, vout2);

                    vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
                        vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
                    }
                    _mm256_storeu_ps(outptr + out_j, vout0);
                }
        }

        for (; out_j < outW1; out_j++)
        {
            int in_j = out_j * stride_w - pad_l;
            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }

        for (; out_j < outW; out_j++ )
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0.f;
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0.f;
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0.f;
            }
            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }
    }
    _mm256_zeroupper();
}

#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV

/*
Example for load_deinterleave:
    input:  ptr[16] = {1, 2, 3, ..., 14, 15, 16}
    output: a = {1, 3, 5, 7,  9, 11, 13, 15}
    output: b = {2, 4, 6, 8, 10, 12, 14, 16}
*/
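// Writing 64-bit ones and reinterpreting them as 32-bit lanes yields the
// alternating {1, 0, 1, 0, ...} pattern used below as even/odd masks for
// vcompress.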
static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl)
{
    vuint64m4_t mask = vmv_v_x_u64m4(1, vl*2);
    vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask);
    vbool8_t mask0 = vmseq_vx_u32m4_b8(mask_re, 1, vl*2);
    vbool8_t mask1 = vmseq_vx_u32m4_b8(mask_re, 0, vl*2);
    vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4();
    vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2);
    tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2);
    tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2);
    /* The following instructions are not yet supported by the GNU toolchain,
       so we temporarily use a store and a load instead.
    // a = vlmul_trunc_v_f32m4_f32m2(tempa);
    // b = vlmul_trunc_v_f32m4_f32m2(tempb);
    */
    cv::AutoBuffer<float> cvBuffer(sizeof(float)*vl*2);
    float* buffer = (float*)cvBuffer.data();
    vse32_v_f32m4(buffer, tempa, vl);
    a = vle32_v_f32m2(buffer, vl);
    vse32_v_f32m4(buffer, tempb, vl);
    b = vle32_v_f32m2(buffer, vl);
}

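// RVV variant: the loops over output columns are strip-mined with
// vsetvl_e32m2, so each iteration processes vl elements and the code stays
// independent of the hardware vector length.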
void fastDepthwiseConv( const float* wptr, |
||||||
|
int kernel_h, int kernel_w, |
||||||
|
int stride_h, int stride_w, |
||||||
|
int dilation_h, int dilation_w, |
||||||
|
int pad_t, int pad_l, |
||||||
|
const float* biasptr, const float* relu, |
||||||
|
const float* inptr_, |
||||||
|
int height, int width, |
||||||
|
float* outptr_, |
||||||
|
int out_d, int outH, int outW ) |
||||||
|
{ |
||||||
|
int vl; |
||||||
|
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], |
||||||
|
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], |
||||||
|
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; |
||||||
|
int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); |
||||||
|
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; |
||||||
|
|
||||||
|
for (int out_i = 0; out_i < outH; out_i++) |
||||||
|
{ |
||||||
|
int in_i = out_i * stride_h - pad_t, out_j = 0; |
||||||
|
const float* imgptr0 = inptr_ + in_i*width; |
||||||
|
const float* imgptr1 = imgptr0 + dilation_h*width; |
||||||
|
const float* imgptr2 = imgptr0 + (dilation_h*2)*width; |
||||||
|
float out, w00 = w00_, w01 = w01_, w02 = w02_; |
||||||
|
float w20 = w20_, w21 = w21_, w22 = w22_; |
||||||
|
if (in_i < 0) |
||||||
|
{ |
||||||
|
w00 = w01 = w02 = 0.f; |
||||||
|
imgptr0 = imgptr1; |
||||||
|
} |
||||||
|
else if (in_i + dilation_h*(kernel_h-1) >= height) |
||||||
|
{ |
||||||
|
w20 = w21 = w22 = 0.f; |
||||||
|
imgptr2 = imgptr1; |
||||||
|
} |
||||||
|
float* outptr = outptr_ + out_i*outW; |
||||||
|
if (pad_l > 0) |
||||||
|
{ |
||||||
|
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + |
||||||
|
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + |
||||||
|
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; |
||||||
|
if (relu) |
||||||
|
out = out > 0.f ? out : out*relu_coeff; |
||||||
|
outptr[0] = out; |
||||||
|
out_j = 1; |
||||||
|
} |
||||||
|
|
||||||
|
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) |
||||||
|
{ |
||||||
|
int avl = outW1 - out_j; |
||||||
|
if( stride_w == 1 ) |
||||||
|
for( ; out_j < outW1; out_j += vl, avl -= vl) |
||||||
|
{ |
||||||
|
vl = vsetvl_e32m2(avl); |
||||||
|
int in_j = out_j * stride_w - pad_l; |
||||||
|
vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl), |
||||||
|
v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl), |
||||||
|
v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl), |
||||||
|
v10 = vle32_v_f32m2(imgptr1 + in_j, vl), |
||||||
|
                        v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl),
                        v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl),
                        v20 = vle32_v_f32m2(imgptr2 + in_j, vl),
                        v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl),
                        v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl);

                    vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
                    vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
                    vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
                    vout0 = vfadd_vf_f32m2(vout0, bias, vl);

                    vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
                    vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
                    vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);

                    vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
                    vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
                    vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);

                    vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
                    if (relu)
                    {
                        vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
                        vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
                    }
                    vse32_v_f32m2(outptr + out_j, vout0, vl);
                }
            else // stride_w == 2 && dilation_w == 1
                for( ; out_j < outW1; out_j += vl, avl -= vl)
                {
                    vl = vsetvl_e32m2(avl);
                    int in_j = out_j * stride_w - pad_l;
                    vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
                    vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl);
                    vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl);
                    vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl);
                    vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl);
                    vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl);
                    vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl);

                    vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
                    vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
                    vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
                    vout0 = vfadd_vf_f32m2(vout0, bias, vl);

                    vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
                    vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
                    vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);

                    vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
                    vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
                    vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);

                    vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
                    if (relu)
                    {
                        vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
                        vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
                    }
                    vse32_v_f32m2(outptr + out_j, vout0, vl);
                }
        }

        for (; out_j < outW1; out_j++)
        {
            int in_j = out_j * stride_w - pad_l;
            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }

        for (; out_j < outW; out_j++)
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0.f;
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0.f;
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0.f;
            }
            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }
    }
}

#endif // CV_RVV
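// Note that every vector path in this file computes the same quantity as the
// scalar tails above: a 3x3 weighted sum per output pixel,
// out = sum_ij w_ij * imgptr_i[in_j + j*dilation_w] + bias, with rows clipped
// by zeroing w00..w02 / w20..w22 at the top/bottom borders, columns clipped by
// the s0/s1/s2 scale factors at the right border, and an optional leaky-ReLU
// (out > 0 ? out : out*relu_coeff) applied at the end.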

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX

static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
    __m256 t0 = (__m256)__lasx_xvld(ptr, 0);
    __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);

    __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
    __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);

    a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
    b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
}

void fastDepthwiseConv( const float* wptr,
                        int kernel_h, int kernel_w,
                        int stride_h, int stride_w,
                        int dilation_h, int dilation_w,
                        int pad_t, int pad_l,
                        const float* biasptr, const float* relu,
                        const float* inptr_,
                        int height, int width,
                        float* outptr_,
                        int out_d, int outH, int outW )
{
    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const float* imgptr0 = inptr_ + in_i*width;
        const float* imgptr1 = imgptr0 + dilation_h*width;
        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
        float out, w00 = w00_, w01 = w01_, w02 = w02_;
        float w20 = w20_, w21 = w21_, w22 = w22_;
        if (in_i < 0)
        {
            w00 = w01 = w02 = 0.f;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h-1) >= height)
        {
            w20 = w21 = w22 = 0.f;
            imgptr2 = imgptr1;
        }
        float* outptr = outptr_ + out_i*outW;
        if (pad_l > 0)
        {
            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[0] = out;
            out_j = 1;
        }

        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
        {
            const int VECSZ = 8;
            __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
                   vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
                   vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
            __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
                   vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);

            if( stride_w == 1 )
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
                           v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
                           v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
                           v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
                           v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
                           v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
                           v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
                           v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
                           v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);

                    __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
                    __m256 vout1 = __lasx_xvfmul_s(v01, vw01);
                    __m256 vout2 = __lasx_xvfmul_s(v02, vw02);

                    vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
                    vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
                    vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);

                    vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
                    vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
                    vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);

                    vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256i m = __lasx_xvfcmp_clt_s(z, vout0);
                        vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
                    }
                    __lasx_xvst(vout0, outptr + out_j, 0);
                }
            else
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
                    _v256_load_deinterleave(imgptr0 + in_j, v00, v01);
                    _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
                    _v256_load_deinterleave(imgptr1 + in_j, v10, v11);
                    _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
                    _v256_load_deinterleave(imgptr2 + in_j, v20, v21);
                    _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);

                    __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
                    __m256 vout1 = __lasx_xvfmul_s(v01, vw01);
                    __m256 vout2 = __lasx_xvfmul_s(v02, vw02);

                    vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
                    vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
                    vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);

                    vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
                    vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
                    vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);

                    vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256i m = __lasx_xvfcmp_clt_s(z, vout0);
                        vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
                    }
                    __lasx_xvst(vout0, outptr + out_j, 0);
                }
        }

        for (; out_j < outW1; out_j++)
        {
            int in_j = out_j * stride_w - pad_l;
            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }

        for (; out_j < outW; out_j++)
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0.f;
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0.f;
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0.f;
            }
            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }
    }
}

#endif // CV_LASX

CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace
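For scale, here is a small standalone sketch (illustrative only, not part of the patch) that reproduces the tile-count arithmetic used by the Winograd driver in the next file: in the F(6x6, 3x3) configuration each 8x8 input tile produces a 6x6 block of outputs, so an output plane is covered by ceil(W0/6) x ceil(H0/6) tiles, padded up to a multiple of the input-block batch size.

#include <cstdio>

int main()
{
    const int CONV_WINO_STEP = 6;    // output pixels produced per tile edge
    const int CONV_WINO_IBLOCK = 3;  // input tiles processed together (baseline value)
    int H0 = 20, W0 = 20;            // example output plane size
    int blocks_per_row = (W0 + CONV_WINO_STEP - 1)/CONV_WINO_STEP;
    int blocks_per_plane = ((H0 + CONV_WINO_STEP - 1)/CONV_WINO_STEP)*blocks_per_row;
    int blocks_per_plane_aligned =
        ((blocks_per_plane + CONV_WINO_IBLOCK - 1)/CONV_WINO_IBLOCK)*CONV_WINO_IBLOCK;
    // prints: 20x20 output plane -> 16 tiles (18 after alignment)
    printf("%dx%d output plane -> %d tiles (%d after alignment)\n",
           W0, H0, blocks_per_plane, blocks_per_plane_aligned);
    return 0;
}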
@ -0,0 +1,764 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// This file is adapted from ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv_Winograd.fx).
// Here is the original license:
/*
    This file is a part of ficus language project.
    See ficus/LICENSE for the licensing terms
*/

#include "../../precomp.hpp"
#include "convolution.hpp"

#include "conv_winograd_f63.simd.hpp"
#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content

namespace cv { namespace dnn {

#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.

void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);

/* Input transform */
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32);

/* Output transform */
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct);


int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
    Mat input = _input.getMat();
    Mat output = _output.getMat();
    Mat fusedAddMat = _fusedAddMat.getMat();

    MatShape inputShape = shape(input);
    MatShape outputShape = shape(output);
    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);

    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
    int K = conv->K;
    int H0 = outputShape[2], W0 = outputShape[3];

    int pad_top = conv->pad_top;
    int pad_left = conv->pad_left;

    int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;
    int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
    const size_t inp_planesize = (size_t)Hi*Wi;
    const size_t out_planesize = (size_t)H0*W0;

    int blocks_per_row = (W0+CONV_WINO_STEP-1)/CONV_WINO_STEP;
    int blocks_per_plane = ((H0+CONV_WINO_STEP-1)/CONV_WINO_STEP)*blocks_per_row;
    int blocks_per_plane_aligned = ((blocks_per_plane +
                                     CONV_WINO_IBLOCK-1)/CONV_WINO_IBLOCK)*CONV_WINO_IBLOCK;

    size_t totalbufsize = (size_t)N*C*blocks_per_plane_aligned*CONV_WINO_AREA;

    AutoBuffer<float> _buf;
    _buf.allocate(totalbufsize + VEC_ALIGN);
    float* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN);

    float* inp = input.ptr<float>();
    float* out = output.ptr<float>();

    float* fusedAddPtr = fusedAddMat.empty() ? nullptr : fusedAddMat.ptr<float>();

    // Phase 1. compute forward Winograd transforms for all input blocks,
    // all input planes, all samples in the batch.
    // [TODO]: maybe, if there are too many input channels, it makes sense to
    // transform only part of the input channels at once and then compute the partial
    // accumulated sums (i.e. update the output buffers several times,
    // rather than compute them in one pass).
    parallel_for_(Range(0, ntasks), [&](const Range& r0) {
    for (int task_id = r0.start; task_id < r0.end; task_id++)
    {
        int nc0 = (N*C)*task_id/ntasks;
        int nc1 = (N*C)*(task_id+1)/ntasks;
        for(; nc0 < nc1; nc0++)
        {
            int n = nc0 / C;
            int c = nc0 - n*C;
            int g = c / Cg;
            c -= g*Cg;
            for (int block_id = 0; block_id < blocks_per_plane; block_id += CONV_WINO_IBLOCK)
            {
                for (int db = 0; db < CONV_WINO_IBLOCK; db++)
                {
                    size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned +
                                     block_id)*Cg*CONV_WINO_AREA +
                                    (c*CONV_WINO_IBLOCK + db)*CONV_WINO_ATOM_F32;
                    float* inwptr = (float*)wbuf_all + inwofs;

                    if (block_id + db < blocks_per_plane)
                    {
                        int y0 = (block_id + db) / blocks_per_row;
                        int x0 = (block_id + db) - y0 * blocks_per_row;
                        y0 = y0*CONV_WINO_STEP - pad_top;
                        x0 = x0*CONV_WINO_STEP - pad_left;
                        bool partial = y0 < 0 || y0 + CONV_WINO_SIZE > Hi ||
                                       x0 < 0 || x0 + CONV_WINO_SIZE > Wi;
                        int dx1 = 0, dx2 = CONV_WINO_SIZE, dy1 = 0, dy2 = CONV_WINO_SIZE;
                        int inpstep = Wi;

                        float inpbuf[CONV_WINO_AREA];
                        float* inptr0 = (float*)inp + nc0*inp_planesize + y0*Wi + x0;
                        float* inptr = inptr0;

                        if (partial)
                        {
                            memset(inpbuf, 0, sizeof(inpbuf));
                            dy1 = -y0 > 0 ? -y0 : 0;
                            dy2 = Hi - y0 < CONV_WINO_SIZE ? Hi - y0 : CONV_WINO_SIZE;

                            if (dy2 < dy1) {dy2 = dy1 = 0;}
                            dx1 = -x0 > 0 ? -x0 : 0;
                            dx2 = Wi - x0 < CONV_WINO_SIZE ? Wi - x0 : CONV_WINO_SIZE;

                            if (dx2 < dx1) {dx2 = dx1 = 0;}
                            inptr0 -= y0*Wi + x0;

                            if (dx1 < dx2 && dy1 < dy2)
                            {
                                for(int dy = dy1; dy < dy2; dy++)
                                    memcpy(&inpbuf[dy*CONV_WINO_SIZE + dx1],
                                           inptr0 + (y0+dy)*Wi + (x0+dx1),
                                           (dx2-dx1)*sizeof(inpbuf[0]));
                            }

                            inptr = inpbuf;
                            inpstep = CONV_WINO_SIZE;
                        }
#if CV_TRY_AVX2
                        if (conv->useAVX2)
                            opt_AVX2::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
                        else
#endif
#if CV_TRY_AVX
                        if (conv->useAVX)
                            opt_AVX::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
                        else
#endif
#if CV_NEON && CV_NEON_AARCH64
                        if (conv->useNEON)
                            opt_NEON::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
                        else
#endif
                        winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
                    }
                    else
                    {
                        for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, inwptr += CONV_WINO_IBLOCK*CONV_WINO_ATOM_F32)
                            memset(inwptr, 0, CONV_WINO_ATOM_F32*sizeof(inwptr[0]));
                    }
                }
            }
        }
    }});
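    // At this point wbuf_all holds the Bt*X*B transform of every input tile.
    // For each (image, group) pair and each CONV_WINO_IBLOCK-sized run of tiles
    // the data is effectively laid out as [atom][channel][tile][lane] floats,
    // which is the order winofunc_accum_f32 consumes in phase 2 below.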

    // Phase 2. compute elemwise-weighted sums of transformed blocks,
    // apply inverse Winograd transforms to the sums,
    // add bias, apply activation function if any and store the results.
    parallel_for_(Range(0, ntasks), [&](const Range& r0) {
    for (int task_id = r0.start; task_id < r0.end; task_id++)
    {
        size_t out_wbuf_size = CONV_WINO_AREA*CONV_WINO_KBLOCK*CONV_WINO_IBLOCK;
        size_t outbuf_size = CONV_WINO_AREA;
        AutoBuffer<float> out_wbuf_, outbuf_;
        out_wbuf_.allocate(out_wbuf_size + VEC_ALIGN);
        float* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN);
        outbuf_.allocate(outbuf_size + VEC_ALIGN);
        float* outbuf = alignPtr(outbuf_.data(), VEC_ALIGN);

        memset(out_wbuf, 0, out_wbuf_size * sizeof(float));
        memset(outbuf, 0, outbuf_size * sizeof(float));

        int ngk0 = (int)(((int64_t)N*Kg_nblocks*ngroups)*task_id/ntasks);
        int ngk1 = (int)(((int64_t)N*Kg_nblocks*ngroups)*(task_id+1)/ntasks);

        for(; ngk0 < ngk1; ngk0++)
        {
            int n = ngk0 / (Kg_nblocks*ngroups);
            int gk0 = ngk0 % (Kg_nblocks*ngroups);
            int g = gk0 / Kg_nblocks;
            int k0 = (gk0 % Kg_nblocks)*CONV_WINO_KBLOCK;
            int k1 = k0 + CONV_WINO_KBLOCK <= Kg ? k0 + CONV_WINO_KBLOCK : Kg;

            for (int block_id0 = 0; block_id0 < blocks_per_plane; block_id0 += CONV_WINO_IBLOCK)
            {
                int block_id1 = block_id0 + CONV_WINO_IBLOCK;
                block_id1 = block_id1 < blocks_per_plane ? block_id1 : blocks_per_plane;
                size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + block_id0)*Cg*CONV_WINO_AREA;
                size_t wofs = (g*Kg_nblocks*CONV_WINO_KBLOCK + k0)*Cg*CONV_WINO_AREA;

                float* inwptr = wbuf_all + inwofs;
                const float* wptr = conv->weightsWinoBufPtr + wofs;

#if CV_TRY_AVX2
                if (conv->useAVX2)
                    opt_AVX2::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
                                                 CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
                else
#endif
#if CV_TRY_AVX
                if (conv->useAVX)
                    opt_AVX::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
                                                CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
                else
#endif
#if CV_NEON && CV_NEON_AARCH64
                if (conv->useNEON)
                    opt_NEON::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
                                                 CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
                else
#endif
                winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
                                   CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
                for (int k = k0; k < k1; k++)
                {
                    float biasv = conv->biasBuf[g*Kg + k];
                    for (int block_id = block_id0; block_id < block_id1; block_id++)
                    {
                        int y0 = block_id / blocks_per_row;
                        int x0 = block_id - y0 * blocks_per_row;
                        y0 = y0*CONV_WINO_STEP;
                        x0 = x0*CONV_WINO_STEP;
                        int dy1 = H0 - y0;
                        if (dy1 > CONV_WINO_STEP) dy1 = CONV_WINO_STEP;
                        int dx1 = W0 - x0;
                        if (dx1 > CONV_WINO_STEP) dx1 = CONV_WINO_STEP;
                        assert(dx1 > 0 && dy1 > 0);
                        bool partial = activ || dy1 < CONV_WINO_STEP || dx1 < CONV_WINO_STEP;
                        size_t outofs = (n*K + g*Kg + k)*out_planesize + y0*W0 + x0;
                        int outstep = W0;

                        float* outptr0 = (float*)out + outofs;
                        float* pbptr0 = fusedAddPtr ? fusedAddPtr + outofs : nullptr;
                        float *outptr = outptr0, *bpptr = pbptr0;

                        if (partial)
                        {
                            outptr = outbuf;
                            outstep = CONV_WINO_SIZE;
                            if (pbptr0)
                            {
                                bpptr = outbuf;
                                for (int y = 0; y < dy1; y++)
                                    memcpy(outbuf + y*CONV_WINO_SIZE, pbptr0 + y*W0,
                                           dx1*sizeof(pbptr0[0]));
                            }
                        }
#if CV_TRY_AVX2
                        if (conv->useAVX2)
                            opt_AVX2::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
                                                            bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
                        else
#endif
#if CV_TRY_AVX
                        if (conv->useAVX)
                            opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
                                                           bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
                        else
#endif
#if CV_NEON && CV_NEON_AARCH64
                        // The NEON optimization targets ARMv8 (AArch64) only; on ARMv7 the universal intrinsics below are used instead.
                        if (conv->useNEON)
                            opt_NEON::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
                                                            bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
                        else
#endif
                        winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
                                              bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
                        if (partial)
                        {
                            if (activ)
                                activ->forwardSlice(outptr, outptr, CONV_WINO_SIZE*CONV_WINO_STEP, 0, g*Kg + k, g*Kg + k + 1);
                            for (int y = 0; y < dy1; y++)
                                memcpy(outptr0 + y*W0, outptr + y*CONV_WINO_SIZE, dx1*sizeof(outptr0[0]));
                        }
                    }
                }
            }
        }
    }});
    return 1;
}
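// runWinograd63 returns 1 on success; when no suitable SIMD backend is
// compiled in, the stub in the #else branch at the end of this file returns 0,
// presumably so that the caller can fall back to a non-Winograd path.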

/****************************************************************************************\
                                SIMD for winograd function
\****************************************************************************************/

#if CV_SIMD128

void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
#if 1
    CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4);
    for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
                                                  outbuf += winoAtomF32)
    {
        v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
        v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00;
        v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00;
        v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00;

        for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                                     wptr += winoKblock*winoAtomF32)
        {
            v_float32x4 x0, x1, x2;
            x0 = v_load(inwptr);
            x1 = v_load(inwptr + 4);
            x2 = v_load(inwptr + 8);

            v_float32x4 w0 = v_load(wptr);
            s00 = v_fma(w0, x0, s00);
            s01 = v_fma(w0, x1, s01);
            s02 = v_fma(w0, x2, s02);

            w0 = v_load(wptr + 4);
            s10 = v_fma(w0, x0, s10);
            s11 = v_fma(w0, x1, s11);
            s12 = v_fma(w0, x2, s12);

            w0 = v_load(wptr + 8);
            s20 = v_fma(w0, x0, s20);
            s21 = v_fma(w0, x1, s21);
            s22 = v_fma(w0, x2, s22);

            w0 = v_load(wptr + 12);
            s30 = v_fma(w0, x0, s30);
            s31 = v_fma(w0, x1, s31);
            s32 = v_fma(w0, x2, s32);
        }

        v_store(outbuf, s00);
        v_store(outbuf + 1*64, s01);
        v_store(outbuf + 2*64, s02);
        v_store(outbuf + 3*64, s10);
        v_store(outbuf + 4*64, s11);
        v_store(outbuf + 5*64, s12);
        v_store(outbuf + 6*64, s20);
        v_store(outbuf + 7*64, s21);
        v_store(outbuf + 8*64, s22);
        v_store(outbuf + 9*64, s30);
        v_store(outbuf + 10*64, s31);
        v_store(outbuf + 11*64, s32);
    }
#else
    // Naive reference C++ code; this branch is kept for clarity and should never be executed.
    for (int atom_id = 0; atom_id < winoNatomF32;
            atom_id++, outbuf += winoAtomF32)
    {
        float sumbuf[winoIblock*winoKblock*winoAtomF32];
        memset(sumbuf, 0, sizeof(sumbuf));
        for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                                     wptr += winoKblock*winoAtomF32)
        {
            for (int i = 0; i < winoKblock; i++)
            {
                for (int j = 0; j < winoIblock; j++)
                {
                    int i_ = i*winoAtomF32;
                    int j_ = j*winoAtomF32;
                    int ij_ = i_*winoIblock + j_;
                    float s0 = inwptr[j_ + 0]*wptr[i_ + 0];
                    float s1 = inwptr[j_ + 1]*wptr[i_ + 1];
                    float s2 = inwptr[j_ + 2]*wptr[i_ + 2];
                    float s3 = inwptr[j_ + 3]*wptr[i_ + 3];
                    sumbuf[ij_ + 0] += s0;
                    sumbuf[ij_ + 1] += s1;
                    sumbuf[ij_ + 2] += s2;
                    sumbuf[ij_ + 3] += s3;
                }
            }
        }
        for (int ij = 0; ij < winoKblock*winoIblock; ij++)
        {
            int ij_ = ij*winoAtomF32;
            int ij_out = ij*CONV_WINO_AREA;
            outbuf[ij_out + 0] = sumbuf[ij_ + 0];
            outbuf[ij_out + 1] = sumbuf[ij_ + 1];
            outbuf[ij_out + 2] = sumbuf[ij_ + 2];
            outbuf[ij_out + 3] = sumbuf[ij_ + 3];
        }
    }
#endif
}

/* Input transform */
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
    CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
    v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
    v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
    v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
    v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
    v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);

    v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;

    {
        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
        v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
        t00 = x40 - x20;
        t01 = x41 - x21;
        t10 = x30 - x50;
        t11 = x31 - x51;
        v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60);
        v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61);
        v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10);
        v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11);

        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
        v_float32x4 qm4_25 = v_setall_f32(-4.25f);
        t00 = v_fma(x30, qm4_25, x10 + x50);
        t01 = v_fma(x31, qm4_25, x11 + x51);
        t10 = v_fma(x40, qm4_25, x20 + x60);
        t11 = v_fma(x41, qm4_25, x21 + x61);

        v_float32x4 y10 = t00 + t10, y11 = t01 + t11;
        v_float32x4 y20 = t10 - t00, y21 = t11 - t01;

        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
        v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
        v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
        t00 = v_fma(x10, q0_5, x50 + x50);
        t01 = v_fma(x11, q0_5, x51 + x51);
        t10 = v_fma(x20, q0_25, x60);
        t11 = v_fma(x21, q0_25, x61);
        t00 = v_fma(x30, qm2_5, t00);
        t01 = v_fma(x31, qm2_5, t01);
        t10 = v_fma(x40, qm1_25, t10);
        t11 = v_fma(x41, qm1_25, t11);

        v_float32x4 y30 = t00 + t10, y31 = t01 + t11;
        v_float32x4 y40 = t10 - t00, y41 = t11 - t01;

        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
        v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
        t00 = v_fma(x50, q0_5, x10 + x10);
        t01 = v_fma(x51, q0_5, x11 + x11);
        t10 = v_fma(x20, q4 , x60);
        t11 = v_fma(x21, q4 , x61);
        t00 = v_fma(x30, qm2_5, t00);
        t01 = v_fma(x31, qm2_5, t01);
        t10 = v_fma(x40, qm5 , t10);
        t11 = v_fma(x41, qm5 , t11);

        v_float32x4 y50 = t00 + t10, y51 = t01 + t11;
        v_float32x4 y60 = t10 - t00, y61 = t11 - t01;

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        /* Y: */
        /* y00 y01 */
        /* y10 y11 */
        /* ... */
        /* y70 y71 */
        /* Y': */
        /* y00 y40 */
        /* y10 y50 */
        /* y20 y60 */
        /* y30 y70 */
        /* y01 y41 */
        /* y11 y51 */
        /* y21 y61 */
        /* y31 y71 */
        /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */

        v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30);
        v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31);
        v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70);
        v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71);

        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
        t00 = y01 - y20;
        t01 = y41 - y60;
        t10 = y30 - y11;
        t11 = y70 - y51;
        z00 = v_fma(t00, q5_25, y00 - y21);
        z01 = v_fma(t01, q5_25, y40 - y61);
        z70 = v_fma(t10, q5_25, y31 - y10);
        z71 = v_fma(t11, q5_25, y71 - y50);

        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
        t00 = v_fma(y30, qm4_25, y10 + y11);
        t01 = v_fma(y70, qm4_25, y50 + y51);
        t10 = v_fma(y01, qm4_25, y20 + y21);
        t11 = v_fma(y41, qm4_25, y60 + y61);

        z10 = t00 + t10; z11 = t01 + t11;
        z20 = t10 - t00; z21 = t11 - t01;

        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
        t00 = v_fma(y10, q0_5, y11 + y11);
        t01 = v_fma(y50, q0_5, y51 + y51);
        t10 = v_fma(y20, q0_25, y21);
        t11 = v_fma(y60, q0_25, y61);
        t00 = v_fma(y30, qm2_5, t00);
        t01 = v_fma(y70, qm2_5, t01);
        t10 = v_fma(y01, qm1_25, t10);
        t11 = v_fma(y41, qm1_25, t11);

        z30 = t00 + t10; z31 = t01 + t11;
        z40 = t10 - t00; z41 = t11 - t01;

        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
        t00 = v_fma(y11, q0_5, y10 + y10);
        t01 = v_fma(y51, q0_5, y50 + y50);
        t10 = v_fma(y20, q4, y21);
        t11 = v_fma(y60, q4, y61);
        t00 = v_fma(y30, qm2_5, t00);
        t01 = v_fma(y70, qm2_5, t01);
        t10 = v_fma(y01, qm5, t10);
        t11 = v_fma(y41, qm5, t11);

        z50 = t00 + t10; z51 = t01 + t11;
        z60 = t10 - t00; z61 = t11 - t01;
    }

    const int outstep = winoIblock*winoAtomF32*Cg;

    v_store(outptr, z00);
    v_store(outptr + outstep, z01);
    v_store(outptr + outstep*2, z10);
    v_store(outptr + outstep*3, z11);
    v_store(outptr + outstep*4, z20);
    v_store(outptr + outstep*5, z21);
    v_store(outptr + outstep*6, z30);
    v_store(outptr + outstep*7, z31);
    v_store(outptr + outstep*8, z40);
    v_store(outptr + outstep*9, z41);
    v_store(outptr + outstep*10, z50);
    v_store(outptr + outstep*11, z51);
    v_store(outptr + outstep*12, z60);
    v_store(outptr + outstep*13, z61);
    v_store(outptr + outstep*14, z70);
    v_store(outptr + outstep*15, z71);
}

/* Output transform */
/* Inverse Winograd 8x8 transform:
   out = (A'*inp*A)', where
   inp is the input 8x8 FP32 matrix,
   A' is
   [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
    0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
    0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
    0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
    0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
    0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]

   inp is pre-loaded into the xij registers,
   out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1.

   After the inverse transform is done, we add bias,
   optionally add results from the earlier tensors (by-pass),
   optionally apply the activation function, and then
   store the final results.

   That is, after both the forward and the inverse transformation,
   we get a non-transposed result.
   Of course, for Winograd-based convolution to be correct, the
   Winograd-transformed weights must be transposed as well;
   init_conv() (see OpConv.fx) takes care of that.
*/
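/* The rows of A' above are, up to scaling, evaluations at the interpolation
   nodes 0, +-1, +-2, +-1/2 and the point at infinity; the same nodes generate
   the 5.25/4.25/2.5/1.25 coefficients of the forward Bt transform, which is
   the standard Toom-Cook construction for F(6x6, 3x3) Winograd convolution. */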
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct)
{
    CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
    v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
    v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
    v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
    v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
    v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
    v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
    v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
    v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
    v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;

    {
        v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
        s12_0 = x10 + x20; s12_1 = x11 + x21;
        s34_0 = x30 + x40; s34_1 = x31 + x41;
        s56_0 = x50 + x60; s56_1 = x51 + x61;

        v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0;
        v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1;

        v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
        v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        v_float32x4 y21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
        v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        s12_0 = x10 - x20; s12_1 = x11 - x21;
        s34_0 = x30 - x40; s34_1 = x31 - x41;
        s56_0 = x50 - x60; s56_1 = x51 - x61;

        a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
        v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0));
        v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1));

        a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
        v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f);
        v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60;

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        /* Y: */
        /* y00 y01 */
        /* y10 y11 */
        /* ... */
        /* y50 y51 */
        /* 0 0 */
        /* 0 0 */
        /* Y': */
        /* y00 y40 */
        /* y10 y50 */
        /* y20 y60 */
        /* y30 y70 */
        /* y01 y41 */
        /* y11 y51 */
        /* y21 y61 */
        /* y31 y71 */
        /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */

        v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30);
        v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31);
        v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70);
        v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71);

        s12_0 = y10 + y20; s12_1 = y50 + y60;
        s34_0 = y30 + y01; s34_1 = y70 + y41;
        s56_0 = y11 + y21; s56_1 = y51 + y61;

        z00 = y00 + s12_0 + s34_0 + s56_0;
        z01 = y40 + s12_1 + s34_1 + s56_1;

        a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
        z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
        z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        s12_0 = y10 - y20; s12_1 = y50 - y60;
        s34_0 = y30 - y01; s34_1 = y70 - y41;
        s56_0 = y11 - y21; s56_1 = y51 - y61;

        a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
        z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y31 + s12_0));
        z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y71 + s12_1));

        a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
        z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f);
        z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
        z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));

        v_float32x4 vbias = v_setall_f32(bias);
        z00 += vbias;
        z01 += vbias;
        z10 += vbias;
        z11 += vbias;
        z20 += vbias;
        z21 += vbias;
        z30 += vbias;
        z31 += vbias;
        z40 += vbias;
        z41 += vbias;
        z50 += vbias;
        z51 += vbias;
    }

    if (bpptr)
    {
        z00 += v_load(bpptr);
        z01 += v_load_low(bpptr + 4);
        z10 += v_load(bpptr + bpstep);
        z11 += v_load_low(bpptr + bpstep + 4);
        z20 += v_load(bpptr + bpstep*2);
        z21 += v_load_low(bpptr + bpstep*2 + 4);
        z30 += v_load(bpptr + bpstep*3);
        z31 += v_load_low(bpptr + bpstep*3 + 4);
        z40 += v_load(bpptr + bpstep*4);
        z41 += v_load_low(bpptr + bpstep*4 + 4);
        z50 += v_load(bpptr + bpstep*5);
        z51 += v_load_low(bpptr + bpstep*5 + 4);
    }

    if (ifMinMaxAct)
    {
        v_float32x4 vmax = v_setall_f32(maxval);
        v_float32x4 vmin = v_setall_f32(minval);

        z00 = v_min(v_max(z00, vmin), vmax);
        z01 = v_min(v_max(z01, vmin), vmax);
        z10 = v_min(v_max(z10, vmin), vmax);
        z11 = v_min(v_max(z11, vmin), vmax);
        z20 = v_min(v_max(z20, vmin), vmax);
        z21 = v_min(v_max(z21, vmin), vmax);
        z30 = v_min(v_max(z30, vmin), vmax);
        z31 = v_min(v_max(z31, vmin), vmax);
        z40 = v_min(v_max(z40, vmin), vmax);
        z41 = v_min(v_max(z41, vmin), vmax);
        z50 = v_min(v_max(z50, vmin), vmax);
        z51 = v_min(v_max(z51, vmin), vmax);
    }

    v_store(outptr, z00);
    v_store_low(outptr + 4, z01);
    v_store(outptr + outstep, z10);
    v_store_low(outptr + outstep + 4, z11);
    v_store(outptr + outstep*2, z20);
    v_store_low(outptr + outstep*2 + 4, z21);
    v_store(outptr + outstep*3, z30);
    v_store_low(outptr + outstep*3 + 4, z31);
    v_store(outptr + outstep*4, z40);
    v_store_low(outptr + outstep*4 + 4, z41);
    v_store(outptr + outstep*5, z50);
    v_store_low(outptr + outstep*5 + 4, z51);
}
#endif

#else
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
                  int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
    return 0;
}
#endif

}} // namespace cv::dnn
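The header that follows is compiled once per dispatched ISA. On AVX-only machines it first redefines `_mm256_fmadd_ps` as a separate multiply and add, which is what makes the Winograd kernels below usable without FMA3. A minimal sketch of that rewrite, assuming only AVX is available (the wrapper name is illustrative, not part of the patch):

#include <immintrin.h>

// Matches _mm256_fmadd_ps(a, b, c) except for one extra rounding step,
// because the multiply and the add are rounded separately without FMA3.
static inline __m256 fmadd_avx_fallback(__m256 a, __m256 b, __m256 c)
{
    return _mm256_add_ps(c, _mm256_mul_ps(a, b));
}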
@ -0,0 +1,886 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "opencv2/core/hal/intrin.hpp"

namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

/* Accumulate */
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);

/* Input transform */
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32);

/* Output transform */
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct);

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX

#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif

void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 8);
    if (iblock > 3)
    {
        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
                                                      outbuf += winoAtomF32)
        {
            __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
            __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
            __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
            __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                                         wptr += winoKblock*winoAtomF32)
            {
                __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
                __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
                __m256 x0, x1;
                x0 = _mm256_load_ps(inwptr);
                x1 = _mm256_load_ps(inwptr + 8);
                s00 = _mm256_fmadd_ps(w0, x0, s00);
                s01 = _mm256_fmadd_ps(w0, x1, s01);
                s10 = _mm256_fmadd_ps(w1, x0, s10);
                s11 = _mm256_fmadd_ps(w1, x1, s11);
                s20 = _mm256_fmadd_ps(w2, x0, s20);
                s21 = _mm256_fmadd_ps(w2, x1, s21);
                s30 = _mm256_fmadd_ps(w3, x0, s30);
                s31 = _mm256_fmadd_ps(w3, x1, s31);
                x0 = _mm256_load_ps(inwptr + 16);
                x1 = _mm256_load_ps(inwptr + 24);
                s02 = _mm256_fmadd_ps(w0, x0, s02);
                s03 = _mm256_fmadd_ps(w0, x1, s03);
                s12 = _mm256_fmadd_ps(w1, x0, s12);
                s13 = _mm256_fmadd_ps(w1, x1, s13);
                s22 = _mm256_fmadd_ps(w2, x0, s22);
                s23 = _mm256_fmadd_ps(w2, x1, s23);
                s32 = _mm256_fmadd_ps(w3, x0, s32);
                s33 = _mm256_fmadd_ps(w3, x1, s33);
                x0 = _mm256_load_ps(inwptr + 32);
                x1 = _mm256_load_ps(inwptr + 40);
                s04 = _mm256_fmadd_ps(w0, x0, s04);
                s05 = _mm256_fmadd_ps(w0, x1, s05);
                s14 = _mm256_fmadd_ps(w1, x0, s14);
                s15 = _mm256_fmadd_ps(w1, x1, s15);
                s24 = _mm256_fmadd_ps(w2, x0, s24);
                s25 = _mm256_fmadd_ps(w2, x1, s25);
                s34 = _mm256_fmadd_ps(w3, x0, s34);
                s35 = _mm256_fmadd_ps(w3, x1, s35);
            }

            _mm256_store_ps(outbuf, s00);
            _mm256_store_ps(outbuf + 1*64, s01);
            _mm256_store_ps(outbuf + 2*64, s02);
            _mm256_store_ps(outbuf + 3*64, s03);
            _mm256_store_ps(outbuf + 4*64, s04);
            _mm256_store_ps(outbuf + 5*64, s05);

            _mm256_store_ps(outbuf + 6*64, s10);
            _mm256_store_ps(outbuf + 7*64, s11);
            _mm256_store_ps(outbuf + 8*64, s12);
            _mm256_store_ps(outbuf + 9*64, s13);
            _mm256_store_ps(outbuf + 10*64, s14);
            _mm256_store_ps(outbuf + 11*64, s15);

            _mm256_store_ps(outbuf + 12*64, s20);
            _mm256_store_ps(outbuf + 13*64, s21);
            _mm256_store_ps(outbuf + 14*64, s22);
            _mm256_store_ps(outbuf + 15*64, s23);
            _mm256_store_ps(outbuf + 16*64, s24);
            _mm256_store_ps(outbuf + 17*64, s25);

            _mm256_store_ps(outbuf + 18*64, s30);
            _mm256_store_ps(outbuf + 19*64, s31);
            _mm256_store_ps(outbuf + 20*64, s32);
            _mm256_store_ps(outbuf + 21*64, s33);
            _mm256_store_ps(outbuf + 22*64, s34);
            _mm256_store_ps(outbuf + 23*64, s35);
        }
    }
    else
    {
        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
                                                      outbuf += winoAtomF32)
        {
            __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00;
            __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00;
            __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00;
            __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00;
            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                                         wptr += winoKblock*winoAtomF32) {
                __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
                __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
                __m256 x0, x1, x2;
                x0 = _mm256_load_ps(inwptr);
                x1 = _mm256_load_ps(inwptr + 8);
                x2 = _mm256_load_ps(inwptr + 16);
                s00 = _mm256_fmadd_ps(w0, x0, s00);
                s01 = _mm256_fmadd_ps(w0, x1, s01);
                s02 = _mm256_fmadd_ps(w0, x2, s02);
                s10 = _mm256_fmadd_ps(w1, x0, s10);
                s11 = _mm256_fmadd_ps(w1, x1, s11);
                s12 = _mm256_fmadd_ps(w1, x2, s12);
                s20 = _mm256_fmadd_ps(w2, x0, s20);
                s21 = _mm256_fmadd_ps(w2, x1, s21);
                s22 = _mm256_fmadd_ps(w2, x2, s22);
                s30 = _mm256_fmadd_ps(w3, x0, s30);
                s31 = _mm256_fmadd_ps(w3, x1, s31);
                s32 = _mm256_fmadd_ps(w3, x2, s32);
            }

            _mm256_store_ps(outbuf, s00);
            _mm256_store_ps(outbuf + 1*64, s01);
            _mm256_store_ps(outbuf + 2*64, s02);
            _mm256_store_ps(outbuf + 6*64, s10);
            _mm256_store_ps(outbuf + 7*64, s11);
            _mm256_store_ps(outbuf + 8*64, s12);
            _mm256_store_ps(outbuf + 12*64, s20);
            _mm256_store_ps(outbuf + 13*64, s21);
            _mm256_store_ps(outbuf + 14*64, s22);
            _mm256_store_ps(outbuf + 18*64, s30);
            _mm256_store_ps(outbuf + 19*64, s31);
            _mm256_store_ps(outbuf + 20*64, s32);
        }
    }
    _mm256_zeroupper();
}
static inline
void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7)
{
    __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
    __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
    __t0 = _mm256_unpacklo_ps(row0, row1);
    __t1 = _mm256_unpackhi_ps(row0, row1);
    __t2 = _mm256_unpacklo_ps(row2, row3);
    __t3 = _mm256_unpackhi_ps(row2, row3);
    __t4 = _mm256_unpacklo_ps(row4, row5);
    __t5 = _mm256_unpackhi_ps(row4, row5);
    __t6 = _mm256_unpacklo_ps(row6, row7);
    __t7 = _mm256_unpackhi_ps(row6, row7);
    __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}
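// transpose8_ps performs the 8x8 FP32 transpose in three register-only stages:
// unpacklo/unpackhi interleave adjacent row pairs, _mm256_shuffle_ps gathers
// 4-element columns within each 128-bit lane, and _mm256_permute2f128_ps
// finally exchanges the 128-bit halves, so no scalar loads or stores are needed.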
||||||
|
|
||||||
|
/*Input transform*/ |
||||||
|
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, |
||||||
|
float* outptr, int Cg, const int winoIblock, const int winoAtomF32) |
||||||
|
{ |
||||||
|
__m256 x00 = _mm256_loadu_ps(inptr); |
||||||
|
__m256 x10 = _mm256_loadu_ps(inptr + inpstep); |
||||||
|
__m256 x20 = _mm256_loadu_ps(inptr + inpstep*2); |
||||||
|
__m256 x30 = _mm256_loadu_ps(inptr + inpstep*3); |
||||||
|
__m256 x40 = _mm256_loadu_ps(inptr + inpstep*4); |
||||||
|
__m256 x50 = _mm256_loadu_ps(inptr + inpstep*5); |
||||||
|
__m256 x60 = _mm256_loadu_ps(inptr + inpstep*6); |
||||||
|
__m256 x70 = _mm256_loadu_ps(inptr + inpstep*7); |
||||||
|
|
||||||
|
__m256 z00, z10, z20, z30, z40, z50, z60, z70; |
||||||
|
|
||||||
|
{ |
||||||
|
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ |
||||||
|
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ |
||||||
|
__m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10; |
||||||
|
t00 = _mm256_sub_ps(x40, x20); |
||||||
|
t10 = _mm256_sub_ps(x30, x50); |
||||||
|
|
||||||
|
__m256 y00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60)); |
||||||
|
__m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10)); |
||||||
|
|
||||||
|
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ |
||||||
|
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ |
||||||
|
__m256 qm4_25 = _mm256_set1_ps(-4.25f); |
||||||
|
t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50)); |
||||||
|
t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60)); |
||||||
|
|
||||||
|
__m256 y10 = _mm256_add_ps(t00, t10); |
||||||
|
__m256 y20 = _mm256_sub_ps(t10, t00); |
||||||
|
|
||||||
|
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ |
||||||
|
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ |
||||||
|
__m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f); |
||||||
|
__m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f); |
||||||
|
t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50)); |
||||||
|
t10 = _mm256_fmadd_ps(x20, q0_25, x60); |
||||||
|
t00 = _mm256_fmadd_ps(x30, qm2_5, t00); |
||||||
|
t10 = _mm256_fmadd_ps(x40, qm1_25, t10); |
||||||
|
|
||||||
|
__m256 y30 = _mm256_add_ps(t00, t10); |
||||||
|
__m256 y40 = _mm256_sub_ps(t10, t00); |
||||||
|
|
||||||
|
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ |
||||||
|
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ |
||||||
|
__m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f); |
||||||
|
t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10)); |
||||||
|
t10 = _mm256_fmadd_ps(x20, q4 , x60); |
||||||
|
t00 = _mm256_fmadd_ps(x30, qm2_5, t00); |
||||||
|
t10 = _mm256_fmadd_ps(x40, qm5 , t10); |
||||||
|
|
||||||
|
__m256 y50 = _mm256_add_ps(t00, t10); |
||||||
|
__m256 y60 = _mm256_sub_ps(t10, t00); |
||||||
|
|
||||||
|
/* transpose 8x8 matrix in-place with some renumeration of the elements: */ |
||||||
|
transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); |
||||||
|
|
||||||
|
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ |
||||||
|
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ |
||||||
|
t00 = _mm256_sub_ps(y40, y20); |
||||||
|
t10 = _mm256_sub_ps(y30, y50); |
||||||
|
z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60)); |
||||||
|
z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10)); |
||||||
|
|
||||||
|
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ |
||||||
|
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ |
||||||
|
t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50)); |
||||||
|
t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60)); |
||||||
|
z10 = _mm256_add_ps(t00, t10); |
||||||
|
z20 = _mm256_sub_ps(t10, t00); |
||||||
|
|
||||||
|
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ |
||||||
|
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ |
||||||
|
t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50)); |
||||||
|
t10 = _mm256_fmadd_ps(y20, q0_25, y60); |
||||||
|
t00 = _mm256_fmadd_ps(y30, qm2_5, t00); |
||||||
|
t10 = _mm256_fmadd_ps(y40, qm1_25, t10); |
||||||
|
|
||||||
|
z30 = _mm256_add_ps(t00, t10); |
||||||
|
z40 = _mm256_sub_ps(t10, t00); |
||||||
|
|
||||||
|
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ |
||||||
|
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ |
||||||
|
t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10)); |
||||||
|
t10 = _mm256_fmadd_ps(y20, q4, y60); |
||||||
|
t00 = _mm256_fmadd_ps(y30, qm2_5, t00); |
||||||
|
t10 = _mm256_fmadd_ps(y40, qm5, t10); |
||||||
|
|
||||||
|
z50 = _mm256_add_ps(t00, t10); |
||||||
|
z60 = _mm256_sub_ps(t10, t00); |
||||||
|
} |
||||||
|
|
||||||
|
const int outstep = winoIblock*winoAtomF32*Cg; |
||||||
|
|
||||||
|
_mm256_storeu_ps(outptr, z00); |
||||||
|
_mm256_storeu_ps(outptr + outstep, z10); |
||||||
|
_mm256_storeu_ps(outptr + outstep*2, z20); |
||||||
|
_mm256_storeu_ps(outptr + outstep*3, z30); |
||||||
|
_mm256_storeu_ps(outptr + outstep*4, z40); |
||||||
|
_mm256_storeu_ps(outptr + outstep*5, z50); |
||||||
|
_mm256_storeu_ps(outptr + outstep*6, z60); |
||||||
|
_mm256_storeu_ps(outptr + outstep*7, z70); |
||||||
|
_mm256_zeroupper(); |
||||||
|
} |

#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM)                 \
    lowM = _mm256_castps256_ps128(z00);                           \
    highM = _mm256_extractf128_ps(z00, 1);                        \
    _mm_storeu_ps(ptr, lowM);                                     \
    _mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM))
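
/* The macro above writes only the first 6 of the 8 packed floats: the low
   128-bit half as four floats, then the two lowest floats of the high half
   via a 64-bit store, leaving ptr[6] and ptr[7] untouched. A scalar
   equivalent (illustration only):
       for (int i = 0; i < 6; i++) ptr[i] = z00[i];
*/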

/* Inverse Winograd 8x8 transform:
   out = (A'*inp*A)', where
   inp is input 8x8 FP32 matrix,
   A' is
   [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
    0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
    0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
    0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
    0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
    0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
*/
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct)
{
    __m256 x00 = _mm256_load_ps(inptr);
    __m256 x10 = _mm256_load_ps(inptr + inpstep);
    __m256 x20 = _mm256_load_ps(inptr + inpstep*2);
    __m256 x30 = _mm256_load_ps(inptr + inpstep*3);
    __m256 x40 = _mm256_load_ps(inptr + inpstep*4);
    __m256 x50 = _mm256_load_ps(inptr + inpstep*5);
    __m256 x60 = _mm256_load_ps(inptr + inpstep*6);
    __m256 x70 = _mm256_load_ps(inptr + inpstep*7);
    __m256 z00, z10, z20, z30, z40, z50;

    {
        __m256 s12_0, s34_0, s56_0;
        s12_0 = _mm256_add_ps(x10, x20);
        s34_0 = _mm256_add_ps(x30, x40);
        s56_0 = _mm256_add_ps(x50, x60);

        __m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
        __m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
        __m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));

        s12_0 = _mm256_sub_ps(x10, x20);
        s34_0 = _mm256_sub_ps(x30, x40);
        s56_0 = _mm256_sub_ps(x50, x60);
        __m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0)));
        __m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0));
        __m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.f), s12_0));
        __m256 y60 = _mm256_set1_ps(0.f), y70 = y60;

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);

        s12_0 = _mm256_add_ps(y10, y20);
        s34_0 = _mm256_add_ps(y30, y40);
        s56_0 = _mm256_add_ps(y50, y60);

        z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
        z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
        z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));

        s12_0 = _mm256_sub_ps(y10, y20);
        s34_0 = _mm256_sub_ps(y30, y40);
        s56_0 = _mm256_sub_ps(y50, y60);

        z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0)));
        z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0));
        z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0));

        __m256 vbias = _mm256_set1_ps(bias);
        z00 = _mm256_add_ps(vbias, z00);
        z10 = _mm256_add_ps(vbias, z10);
        z20 = _mm256_add_ps(vbias, z20);
        z30 = _mm256_add_ps(vbias, z30);
        z40 = _mm256_add_ps(vbias, z40);
        z50 = _mm256_add_ps(vbias, z50);
    }

    if (bpptr)
    {
        z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
        z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep));
        z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2));
        z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3));
        z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4));
        z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5));
    }

    if (ifMinMaxAct)
    {
        __m256 vmax = _mm256_set1_ps(maxval);
        __m256 vmin = _mm256_set1_ps(minval);

        z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax);
        z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax);
        z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax);
        z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax);
        z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax);
        z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax);
    }

    __m128 lowM, highM;
    STORE6_ELE_FROM_16(outptr, z00, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM);
    _mm256_zeroupper();
}
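
/* Reference-only scalar sketch of the output transform above (illustrative,
   not called anywhere): M = A'*inp*(A')^T, with the 6x8 A' copied from the
   comment block; the SIMD kernel additionally stores the transposed result
   and applies bias/activation. */
static void winofunc_AtXA_8x8_ref(const float inp[8][8], float out[6][6])
{
    static const float At[6][8] = {
        {1.f, 1.f,  1.f,  1.f,   1.f,  1.f,     1.f,     0.f},
        {0.f, 1.f, -1.f,  2.f,  -2.f,  0.5f,   -0.5f,    0.f},
        {0.f, 1.f,  1.f,  4.f,   4.f,  0.25f,   0.25f,   0.f},
        {0.f, 1.f, -1.f,  8.f,  -8.f,  0.125f, -0.125f,  0.f},
        {0.f, 1.f,  1.f, 16.f,  16.f,  1.f/16,  1.f/16,  0.f},
        {0.f, 1.f, -1.f, 32.f, -32.f,  1.f/32, -1.f/32,  1.f}
    };
    float tmp[6][8];
    for (int i = 0; i < 6; i++)        /* tmp = A' * inp */
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += At[i][k] * inp[k][j];
            tmp[i][j] = s;
        }
    for (int i = 0; i < 6; i++)        /* out = tmp * A, with A = (A')^T */
        for (int j = 0; j < 6; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += tmp[i][k] * At[j][k];
            out[i][j] = s;
        }
}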

#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END

// NEON code workaround.
namespace opt_NEON
{

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && CV_NEON_AARCH64
/* Accumulate */
void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);

/* Input transform */
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32);

/* Output transform */
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct);

void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
    if (iblock > 3)
    {
        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
                outbuf += winoAtomF32)
        {
            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                    wptr += winoKblock*winoAtomF32) {
                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
                float32x4_t x0, x1;
                x0 = vld1q_f32(inwptr);
                x1 = vld1q_f32(inwptr + 4);
                s00 = vfmaq_f32(s00, w0, x0);
                s01 = vfmaq_f32(s01, w0, x1);
                s10 = vfmaq_f32(s10, w1, x0);
                s11 = vfmaq_f32(s11, w1, x1);
                s20 = vfmaq_f32(s20, w2, x0);
                s21 = vfmaq_f32(s21, w2, x1);
                s30 = vfmaq_f32(s30, w3, x0);
                s31 = vfmaq_f32(s31, w3, x1);
                x0 = vld1q_f32(inwptr + 8);
                x1 = vld1q_f32(inwptr + 12);
                s02 = vfmaq_f32(s02, w0, x0);
                s03 = vfmaq_f32(s03, w0, x1);
                s12 = vfmaq_f32(s12, w1, x0);
                s13 = vfmaq_f32(s13, w1, x1);
                s22 = vfmaq_f32(s22, w2, x0);
                s23 = vfmaq_f32(s23, w2, x1);
                s32 = vfmaq_f32(s32, w3, x0);
                s33 = vfmaq_f32(s33, w3, x1);
                x0 = vld1q_f32(inwptr + 16);
                x1 = vld1q_f32(inwptr + 20);
                s04 = vfmaq_f32(s04, w0, x0);
                s05 = vfmaq_f32(s05, w0, x1);
                s14 = vfmaq_f32(s14, w1, x0);
                s15 = vfmaq_f32(s15, w1, x1);
                s24 = vfmaq_f32(s24, w2, x0);
                s25 = vfmaq_f32(s25, w2, x1);
                s34 = vfmaq_f32(s34, w3, x0);
                s35 = vfmaq_f32(s35, w3, x1);
            }

            vst1q_f32(outbuf, s00);
            vst1q_f32(outbuf + 1*64, s01);
            vst1q_f32(outbuf + 2*64, s02);
            vst1q_f32(outbuf + 3*64, s03);
            vst1q_f32(outbuf + 4*64, s04);
            vst1q_f32(outbuf + 5*64, s05);

            vst1q_f32(outbuf + 6*64, s10);
            vst1q_f32(outbuf + 7*64, s11);
            vst1q_f32(outbuf + 8*64, s12);
            vst1q_f32(outbuf + 9*64, s13);
            vst1q_f32(outbuf + 10*64, s14);
            vst1q_f32(outbuf + 11*64, s15);

            vst1q_f32(outbuf + 12*64, s20);
            vst1q_f32(outbuf + 13*64, s21);
            vst1q_f32(outbuf + 14*64, s22);
            vst1q_f32(outbuf + 15*64, s23);
            vst1q_f32(outbuf + 16*64, s24);
            vst1q_f32(outbuf + 17*64, s25);

            vst1q_f32(outbuf + 18*64, s30);
            vst1q_f32(outbuf + 19*64, s31);
            vst1q_f32(outbuf + 20*64, s32);
            vst1q_f32(outbuf + 21*64, s33);
            vst1q_f32(outbuf + 22*64, s34);
            vst1q_f32(outbuf + 23*64, s35);
        }
    }
    else
    {
        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
                outbuf += winoAtomF32)
        {
            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                    wptr += winoKblock*winoAtomF32) {
                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
                float32x4_t x0, x1, x2;
                x0 = vld1q_f32(inwptr);
                x1 = vld1q_f32(inwptr + 4);
                x2 = vld1q_f32(inwptr + 8);
                s00 = vfmaq_f32(s00, w0, x0);
                s01 = vfmaq_f32(s01, w0, x1);
                s02 = vfmaq_f32(s02, w0, x2);
                s10 = vfmaq_f32(s10, w1, x0);
                s11 = vfmaq_f32(s11, w1, x1);
                s12 = vfmaq_f32(s12, w1, x2);
                s20 = vfmaq_f32(s20, w2, x0);
                s21 = vfmaq_f32(s21, w2, x1);
                s22 = vfmaq_f32(s22, w2, x2);
                s30 = vfmaq_f32(s30, w3, x0);
                s31 = vfmaq_f32(s31, w3, x1);
                s32 = vfmaq_f32(s32, w3, x2);
            }

            vst1q_f32(outbuf, s00);
            vst1q_f32(outbuf + 1*64, s01);
            vst1q_f32(outbuf + 2*64, s02);
            vst1q_f32(outbuf + 6*64, s10);
            vst1q_f32(outbuf + 7*64, s11);
            vst1q_f32(outbuf + 8*64, s12);
            vst1q_f32(outbuf + 12*64, s20);
            vst1q_f32(outbuf + 13*64, s21);
            vst1q_f32(outbuf + 14*64, s22);
            vst1q_f32(outbuf + 18*64, s30);
            vst1q_f32(outbuf + 19*64, s31);
            vst1q_f32(outbuf + 20*64, s32);
        }
    }
}
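
/* Reference-only scalar sketch of one atom of the accumulation above
   (illustrative, not called anywhere). Indexing is read off the loads and
   stores in the kernel: per channel, weights advance by winoKblock*winoAtomF32
   floats and inputs by winoIblock*winoAtomF32 floats; output tiles are
   64 floats apart, and outbuf is assumed to be pre-offset for the current
   atom, as in the caller loops above. */
static void winofunc_accum_atom_ref(const float* inwptr, const float* wptr,
                                    float* outbuf, int Cg, int iblock)
{
    const int IBLOCK = 6, KBLOCK = 4, ATOM = 4;  /* matches the CV_Assert */
    for (int k = 0; k < KBLOCK; k++)
        for (int i = 0; i < iblock; i++)
            for (int lane = 0; lane < ATOM; lane++)
            {
                float s = 0.f;
                for (int c = 0; c < Cg; c++)
                    s += wptr[(c*KBLOCK + k)*ATOM + lane]
                       * inwptr[(c*IBLOCK + i)*ATOM + lane];
                outbuf[(k*IBLOCK + i)*64 + lane] = s;
            }
}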

#define T4x4(a, b, c, d, tr0, tr1)                                          \
    tr0 = vtrnq_f32(a, b);                                                  \
    tr1 = vtrnq_f32(c, d);                                                  \
    a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0]));   \
    b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1]));   \
    c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
    d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
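
/* T4x4 is the usual NEON 4x4 transpose: vtrnq_f32 interleaves the rows in
   pairs, and vcombine/vget_low/vget_high recombine the 64-bit halves. A
   scalar equivalent of the data movement (illustration only):
       for (r = 0; r < 4; r++)
           for (c = r + 1; c < 4; c++)
               swap(m[r][c], m[c][r]);
*/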

/* Input transform */
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);

    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;

    {
        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
        float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
        t00 = vsubq_f32(x40, x20);
        t01 = vsubq_f32(x41, x21);
        t10 = vsubq_f32(x30, x50);
        t11 = vsubq_f32(x31, x51);
        float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25);
        float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25);
        float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
        float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);

        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
        float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
        t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
        t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
        t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25);
        t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25);

        float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
        float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);

        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
        float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
        float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
        t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
        t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5);
        t10 = vfmaq_f32(x60, x20, q0_25);
        t11 = vfmaq_f32(x61, x21, q0_25);
        t00 = vfmaq_f32(t00, x30, qm2_5);
        t01 = vfmaq_f32(t01, x31, qm2_5);
        t10 = vfmaq_f32(t10, x40, qm1_25);
        t11 = vfmaq_f32(t11, x41, qm1_25);

        float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
        float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);

        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
        float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
        t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
        t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
        t10 = vfmaq_f32(x60, x20, q4);
        t11 = vfmaq_f32(x61, x21, q4);
        t00 = vfmaq_f32(t00, x30, qm2_5);
        t01 = vfmaq_f32(t01, x31, qm2_5);
        t10 = vfmaq_f32(t10, x40, qm5);
        t11 = vfmaq_f32(t11, x41, qm5);

        float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
        float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        /* Y:       */
        /*  y00 y01 */
        /*  y10 y11 */
        /*  ...     */
        /*  y70 y71 */
        /* Y':      */
        /*  y00 y40 */
        /*  y10 y50 */
        /*  y20 y60 */
        /*  y30 y70 */
        /*  y01 y41 */
        /*  y11 y51 */
        /*  y21 y61 */
        /*  y31 y71 */
        /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
        float32x4x2_t tr0, tr1;

        T4x4(y00, y10, y20, y30, tr0, tr1);
        T4x4(y01, y11, y21, y31, tr0, tr1);
        T4x4(y40, y50, y60, y70, tr0, tr1);
        T4x4(y41, y51, y61, y71, tr0, tr1);

        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
        t00 = vsubq_f32(y01, y20);
        t01 = vsubq_f32(y41, y60);
        t10 = vsubq_f32(y30, y11);
        t11 = vsubq_f32(y70, y51);
        z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25);
        z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25);
        z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
        z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);

        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
        t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
        t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
        t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
        t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25);

        z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
        z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);

        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
        t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
        t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
        t10 = vfmaq_f32(y21, y20, q0_25);
        t11 = vfmaq_f32(y61, y60, q0_25);
        t00 = vfmaq_f32(t00, y30, qm2_5);
        t01 = vfmaq_f32(t01, y70, qm2_5);
        t10 = vfmaq_f32(t10, y01, qm1_25);
        t11 = vfmaq_f32(t11, y41, qm1_25);

        z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
        z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);

        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
        t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
        t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
        t10 = vfmaq_f32(y21, y20, q4);
        t11 = vfmaq_f32(y61, y60, q4);
        t00 = vfmaq_f32(t00, y30, qm2_5);
        t01 = vfmaq_f32(t01, y70, qm2_5);
        t10 = vfmaq_f32(t10, y01, qm5);
        t11 = vfmaq_f32(t11, y41, qm5);

        z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11);
        z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
    }

    const int outstep = winoIblock*winoAtomF32*Cg;

    vst1q_f32(outptr, z00);
    vst1q_f32(outptr + outstep, z01);
    vst1q_f32(outptr + outstep*2, z10);
    vst1q_f32(outptr + outstep*3, z11);
    vst1q_f32(outptr + outstep*4, z20);
    vst1q_f32(outptr + outstep*5, z21);
    vst1q_f32(outptr + outstep*6, z30);
    vst1q_f32(outptr + outstep*7, z31);
    vst1q_f32(outptr + outstep*8, z40);
    vst1q_f32(outptr + outstep*9, z41);
    vst1q_f32(outptr + outstep*10, z50);
    vst1q_f32(outptr + outstep*11, z51);
    vst1q_f32(outptr + outstep*12, z60);
    vst1q_f32(outptr + outstep*13, z61);
    vst1q_f32(outptr + outstep*14, z70);
    vst1q_f32(outptr + outstep*15, z71);
}
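
/* Layout note for the NEON transform above (descriptive only): each 8-float
   row of the 8x8 tile is held as a pair of float32x4_t registers
   (x?0 = lanes 0..3, x?1 = lanes 4..7), so each 256-bit AVX operation in the
   x86 path maps to two 128-bit NEON operations here. */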

/* Output transform */
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct)
{
    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;

    {
        float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
        s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21);
        s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41);
        s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61);

        float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0);
        float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1);
        float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
        float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
        float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
        float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);

        s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21);
        s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41);
        s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);

        float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
                                                  s34_0, 32.f), s56_0, 1.f/32);
        float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
                                                  s34_1, 32.f), s56_1, 1.f/32);
        float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
        float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
        float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
        float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
        float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        /* Y:       */
        /*  y00 y01 */
        /*  y10 y11 */
        /*  ...     */
        /*  y50 y51 */
        /*  0   0   */
        /*  0   0   */
        /* Y':      */
        /*  y00 y40 */
        /*  y10 y50 */
        /*  y20 y60 */
        /*  y30 y70 */
        /*  y01 y41 */
        /*  y11 y51 */
        /*  y21 y61 */
        /*  y31 y71 */
        /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
        float32x4x2_t tr0, tr1;

        T4x4(y00, y10, y20, y30, tr0, tr1);
        T4x4(y01, y11, y21, y31, tr0, tr1);
        T4x4(y40, y50, y60, y70, tr0, tr1);
        T4x4(y41, y51, y61, y71, tr0, tr1);

        s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60);
        s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41);
        s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61);

        z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0);
        z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1);
        z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
        z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
        z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
        z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);

        s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60);
        s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41);
        s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);

        z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
                                      s34_0, 32.f), s56_0, 1.f/32);
        z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
                                      s34_1, 32.f), s56_1, 1.f/32);
        z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
        z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
        z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
        z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
        float32x4_t vbias = vdupq_n_f32(bias);

        z00 = vaddq_f32(z00, vbias);
        z01 = vaddq_f32(z01, vbias);
        z10 = vaddq_f32(z10, vbias);
        z11 = vaddq_f32(z11, vbias);
        z20 = vaddq_f32(z20, vbias);
        z21 = vaddq_f32(z21, vbias);
        z30 = vaddq_f32(z30, vbias);
        z31 = vaddq_f32(z31, vbias);
        z40 = vaddq_f32(z40, vbias);
        z41 = vaddq_f32(z41, vbias);
        z50 = vaddq_f32(z50, vbias);
        z51 = vaddq_f32(z51, vbias);
    }

    if (bpptr)
    {
        float32x2_t zhalf = vdup_n_f32(0.f);
        z00 = vaddq_f32(z00, vld1q_f32(bpptr));
        z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf));
        z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep));
        z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf));
        z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2));
        z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf));
        z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3));
        z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf));
        z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4));
        z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf));
        z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5));
        z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf));
    }

    if (ifMinMaxAct)
    {
        float32x4_t vmax = vdupq_n_f32(maxval);
        float32x4_t vmin = vdupq_n_f32(minval);

        z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax);
        z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax);
        z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax);
        z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax);
        z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax);
        z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax);
        z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax);
        z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax);
        z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax);
        z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax);
        z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax);
        z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax);
    }

    vst1q_f32(outptr, z00);
    vst1_f32(outptr + 4, vget_low_f32(z01));
    vst1q_f32(outptr + outstep, z10);
    vst1_f32(outptr + outstep + 4, vget_low_f32(z11));
    vst1q_f32(outptr + outstep*2, z20);
    vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21));
    vst1q_f32(outptr + outstep*3, z30);
    vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31));
    vst1q_f32(outptr + outstep*4, z40);
    vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
    vst1q_f32(outptr + outstep*5, z50);
    vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
}

#endif
}

}} // namespace

@@ -1,499 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../../precomp.hpp"
#include "fast_convolution.hpp"

namespace cv {
namespace dnn {
namespace opt_AVX2
{
#if CV_TRY_AVX2
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                  const float minval, const float maxval, bool ifMinMaxAct)
{
#if CONV_NR == 24
    __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0;

    for (int p = 0; p < np; p++, a++, b += CONV_NR)
    {
        __m256 a0 = _mm256_set1_ps(a[0]);
        __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16);

        c0 = _mm256_fmadd_ps(b0, a0, c0);
        c1 = _mm256_fmadd_ps(b1, a0, c1);
        c2 = _mm256_fmadd_ps(b2, a0, c2);
    }

    if (init_c)
    {
        c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0);
        c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1);
        c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2);
    }

    if (ifMinMaxAct)
    {
        __m256 vmax = _mm256_set1_ps(maxval);
        __m256 vmin = _mm256_set1_ps(minval);

        c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax);
        c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax);
        c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax);
    }

    _mm256_storeu_ps(c, c0);
    _mm256_storeu_ps(c + 8, c1);
    _mm256_storeu_ps(c + 16, c2);
    _mm256_zeroupper();
#else
#error "unsupported CONV_NR in convBlockMR1."
#endif
}

void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
#if CONV_MR == 4 && CONV_NR == 24
    __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
    __m256 c10 = c00, c11 = c00, c12 = c00;
    __m256 c20 = c00, c21 = c00, c22 = c00;
    __m256 c30 = c00, c31 = c00, c32 = c00;

    __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
    __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();

    for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
    {
        a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
        b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);

        c00 = _mm256_fmadd_ps(b0, a0, c00);
        c01 = _mm256_fmadd_ps(b1, a0, c01);
        c02 = _mm256_fmadd_ps(b2, a0, c02);

        c10 = _mm256_fmadd_ps(b0, a1, c10);
        c11 = _mm256_fmadd_ps(b1, a1, c11);
        c12 = _mm256_fmadd_ps(b2, a1, c12);

        a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);

        c20 = _mm256_fmadd_ps(b0, a0, c20);
        c21 = _mm256_fmadd_ps(b1, a0, c21);
        c22 = _mm256_fmadd_ps(b2, a0, c22);

        c30 = _mm256_fmadd_ps(b0, a1, c30);
        c31 = _mm256_fmadd_ps(b1, a1, c31);
        c32 = _mm256_fmadd_ps(b2, a1, c32);
    }

    if (!init_c)
    {
        c00 = _mm256_add_ps(c00, _mm256_load_ps(c));
        c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8));
        c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16));

        c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc));
        c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8));
        c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16));

        c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2));
        c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8));
        c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16));

        c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3));
        c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8));
        c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16));
    }

    _mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02);
    _mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12);
    _mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22);
    _mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32);
    _mm256_zeroupper();
#else
#error "unsupported CONV_MR and/or CONV_NR in convBlock_AVX2."
#endif
}

void _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
                            float* outbuf, int Cg, int iblock)
{
    CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 8);
    if (iblock > 3)
    {
        for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
                outbuf += _FX_WINO_ATOM_F32)
        {
            __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
            __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
            __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
            __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
            for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32,
                    wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32)
            {
                __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
                __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
                __m256 x0, x1;
                x0 = _mm256_load_ps(inwptr);
                x1 = _mm256_load_ps(inwptr + 8);
                s00 = _mm256_fmadd_ps(w0, x0, s00);
                s01 = _mm256_fmadd_ps(w0, x1, s01);
                s10 = _mm256_fmadd_ps(w1, x0, s10);
                s11 = _mm256_fmadd_ps(w1, x1, s11);
                s20 = _mm256_fmadd_ps(w2, x0, s20);
                s21 = _mm256_fmadd_ps(w2, x1, s21);
                s30 = _mm256_fmadd_ps(w3, x0, s30);
                s31 = _mm256_fmadd_ps(w3, x1, s31);
                x0 = _mm256_load_ps(inwptr + 16);
                x1 = _mm256_load_ps(inwptr + 24);
                s02 = _mm256_fmadd_ps(w0, x0, s02);
                s03 = _mm256_fmadd_ps(w0, x1, s03);
                s12 = _mm256_fmadd_ps(w1, x0, s12);
                s13 = _mm256_fmadd_ps(w1, x1, s13);
                s22 = _mm256_fmadd_ps(w2, x0, s22);
                s23 = _mm256_fmadd_ps(w2, x1, s23);
                s32 = _mm256_fmadd_ps(w3, x0, s32);
                s33 = _mm256_fmadd_ps(w3, x1, s33);
                x0 = _mm256_load_ps(inwptr + 32);
                x1 = _mm256_load_ps(inwptr + 40);
                s04 = _mm256_fmadd_ps(w0, x0, s04);
                s05 = _mm256_fmadd_ps(w0, x1, s05);
                s14 = _mm256_fmadd_ps(w1, x0, s14);
                s15 = _mm256_fmadd_ps(w1, x1, s15);
                s24 = _mm256_fmadd_ps(w2, x0, s24);
                s25 = _mm256_fmadd_ps(w2, x1, s25);
                s34 = _mm256_fmadd_ps(w3, x0, s34);
                s35 = _mm256_fmadd_ps(w3, x1, s35);
            }

            _mm256_store_ps(outbuf, s00);
            _mm256_store_ps(outbuf + 1*64, s01);
            _mm256_store_ps(outbuf + 2*64, s02);
            _mm256_store_ps(outbuf + 3*64, s03);
            _mm256_store_ps(outbuf + 4*64, s04);
            _mm256_store_ps(outbuf + 5*64, s05);

            _mm256_store_ps(outbuf + 6*64, s10);
            _mm256_store_ps(outbuf + 7*64, s11);
            _mm256_store_ps(outbuf + 8*64, s12);
            _mm256_store_ps(outbuf + 9*64, s13);
            _mm256_store_ps(outbuf + 10*64, s14);
            _mm256_store_ps(outbuf + 11*64, s15);

            _mm256_store_ps(outbuf + 12*64, s20);
            _mm256_store_ps(outbuf + 13*64, s21);
            _mm256_store_ps(outbuf + 14*64, s22);
            _mm256_store_ps(outbuf + 15*64, s23);
            _mm256_store_ps(outbuf + 16*64, s24);
            _mm256_store_ps(outbuf + 17*64, s25);

            _mm256_store_ps(outbuf + 18*64, s30);
            _mm256_store_ps(outbuf + 19*64, s31);
            _mm256_store_ps(outbuf + 20*64, s32);
            _mm256_store_ps(outbuf + 21*64, s33);
            _mm256_store_ps(outbuf + 22*64, s34);
            _mm256_store_ps(outbuf + 23*64, s35);
        }
    }
    else
    {
        for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
                outbuf += _FX_WINO_ATOM_F32)
        {
            __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00;
            __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00;
            __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00;
            __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00;
            for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32,
                    wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) {
                __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8);
                __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24);
                __m256 x0, x1, x2;
                x0 = _mm256_load_ps(inwptr);
                x1 = _mm256_load_ps(inwptr + 8);
                x2 = _mm256_load_ps(inwptr + 16);
                s00 = _mm256_fmadd_ps(w0, x0, s00);
                s01 = _mm256_fmadd_ps(w0, x1, s01);
                s02 = _mm256_fmadd_ps(w0, x2, s02);
                s10 = _mm256_fmadd_ps(w1, x0, s10);
                s11 = _mm256_fmadd_ps(w1, x1, s11);
                s12 = _mm256_fmadd_ps(w1, x2, s12);
                s20 = _mm256_fmadd_ps(w2, x0, s20);
                s21 = _mm256_fmadd_ps(w2, x1, s21);
                s22 = _mm256_fmadd_ps(w2, x2, s22);
                s30 = _mm256_fmadd_ps(w3, x0, s30);
                s31 = _mm256_fmadd_ps(w3, x1, s31);
                s32 = _mm256_fmadd_ps(w3, x2, s32);
            }

            _mm256_store_ps(outbuf, s00);
            _mm256_store_ps(outbuf + 1*64, s01);
            _mm256_store_ps(outbuf + 2*64, s02);
            _mm256_store_ps(outbuf + 6*64, s10);
            _mm256_store_ps(outbuf + 7*64, s11);
            _mm256_store_ps(outbuf + 8*64, s12);
            _mm256_store_ps(outbuf + 12*64, s20);
            _mm256_store_ps(outbuf + 13*64, s21);
            _mm256_store_ps(outbuf + 14*64, s22);
            _mm256_store_ps(outbuf + 18*64, s30);
            _mm256_store_ps(outbuf + 19*64, s31);
            _mm256_store_ps(outbuf + 20*64, s32);
        }
    }
    _mm256_zeroupper();
}
static inline
void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7)
{
    __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
    __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
    __t0 = _mm256_unpacklo_ps(row0, row1);
    __t1 = _mm256_unpackhi_ps(row0, row1);
    __t2 = _mm256_unpacklo_ps(row2, row3);
    __t3 = _mm256_unpackhi_ps(row2, row3);
    __t4 = _mm256_unpacklo_ps(row4, row5);
    __t5 = _mm256_unpackhi_ps(row4, row5);
    __t6 = _mm256_unpacklo_ps(row6, row7);
    __t7 = _mm256_unpackhi_ps(row6, row7);
    __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}

/* Input transform */
void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg)
{
    __m256 x00 = _mm256_loadu_ps(inptr);
    __m256 x10 = _mm256_loadu_ps(inptr + inpstep);
    __m256 x20 = _mm256_loadu_ps(inptr + inpstep*2);
    __m256 x30 = _mm256_loadu_ps(inptr + inpstep*3);
    __m256 x40 = _mm256_loadu_ps(inptr + inpstep*4);
    __m256 x50 = _mm256_loadu_ps(inptr + inpstep*5);
    __m256 x60 = _mm256_loadu_ps(inptr + inpstep*6);
    __m256 x70 = _mm256_loadu_ps(inptr + inpstep*7);

    __m256 z00, z10, z20, z30, z40, z50, z60, z70;

    {
        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
        __m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10;
        t00 = _mm256_sub_ps(x40, x20);
        t10 = _mm256_sub_ps(x30, x50);

        __m256 y00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60));
        __m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10));

        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
        __m256 qm4_25 = _mm256_set1_ps(-4.25f);
        t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50));
        t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60));

        __m256 y10 = _mm256_add_ps(t00, t10);
        __m256 y20 = _mm256_sub_ps(t10, t00);

        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
        __m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f);
        __m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f);
        t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50));
        t10 = _mm256_fmadd_ps(x20, q0_25, x60);
        t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(x40, qm1_25, t10);

        __m256 y30 = _mm256_add_ps(t00, t10);
        __m256 y40 = _mm256_sub_ps(t10, t00);

        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
        __m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f);
        t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10));
        t10 = _mm256_fmadd_ps(x20, q4, x60);
        t00 = _mm256_fmadd_ps(x30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(x40, qm5, t10);

        __m256 y50 = _mm256_add_ps(t00, t10);
        __m256 y60 = _mm256_sub_ps(t10, t00);

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);

        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
        t00 = _mm256_sub_ps(y40, y20);
        t10 = _mm256_sub_ps(y30, y50);
        z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60));
        z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10));

        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
        t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50));
        t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60));
        z10 = _mm256_add_ps(t00, t10);
        z20 = _mm256_sub_ps(t10, t00);

        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
        t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50));
        t10 = _mm256_fmadd_ps(y20, q0_25, y60);
        t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(y40, qm1_25, t10);

        z30 = _mm256_add_ps(t00, t10);
        z40 = _mm256_sub_ps(t10, t00);

        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
        t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10));
        t10 = _mm256_fmadd_ps(y20, q4, y60);
        t00 = _mm256_fmadd_ps(y30, qm2_5, t00);
        t10 = _mm256_fmadd_ps(y40, qm5, t10);

        z50 = _mm256_add_ps(t00, t10);
        z60 = _mm256_sub_ps(t10, t00);
    }

    const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg;

    _mm256_storeu_ps(outptr, z00);
    _mm256_storeu_ps(outptr + outstep, z10);
    _mm256_storeu_ps(outptr + outstep*2, z20);
    _mm256_storeu_ps(outptr + outstep*3, z30);
    _mm256_storeu_ps(outptr + outstep*4, z40);
    _mm256_storeu_ps(outptr + outstep*5, z50);
    _mm256_storeu_ps(outptr + outstep*6, z60);
    _mm256_storeu_ps(outptr + outstep*7, z70);
    _mm256_zeroupper();
}

#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM)                 \
    lowM = _mm256_castps256_ps128(z00);                           \
    highM = _mm256_extractf128_ps(z00, 1);                        \
    _mm_storeu_ps(ptr, lowM);                                     \
    _mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM))

/* Inverse Winograd 8x8 transform:
   out = (A'*inp*A)', where
   inp is input 8x8 FP32 matrix,
   A' is
   [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
    0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
    0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
    0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
    0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
    0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
*/
void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
                               float* bpptr, int bpstep, float* outptr, int outstep,
                               float bias, float minval, float maxval, bool ifMinMaxAct)
{
    __m256 x00 = _mm256_load_ps(inptr);
    __m256 x10 = _mm256_load_ps(inptr + inpstep);
    __m256 x20 = _mm256_load_ps(inptr + inpstep*2);
    __m256 x30 = _mm256_load_ps(inptr + inpstep*3);
    __m256 x40 = _mm256_load_ps(inptr + inpstep*4);
    __m256 x50 = _mm256_load_ps(inptr + inpstep*5);
    __m256 x60 = _mm256_load_ps(inptr + inpstep*6);
    __m256 x70 = _mm256_load_ps(inptr + inpstep*7);
    __m256 z00, z10, z20, z30, z40, z50;

    {
        __m256 s12_0, s34_0, s56_0;
        s12_0 = _mm256_add_ps(x10, x20);
        s34_0 = _mm256_add_ps(x30, x40);
        s56_0 = _mm256_add_ps(x50, x60);

        __m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
        __m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
        __m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));

        s12_0 = _mm256_sub_ps(x10, x20);
        s34_0 = _mm256_sub_ps(x30, x40);
        s56_0 = _mm256_sub_ps(x50, x60);
        __m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0)));
        __m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0));
        __m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.f), s12_0));
        __m256 y60 = _mm256_set1_ps(0.f), y70 = y60;

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70);

        s12_0 = _mm256_add_ps(y10, y20);
        s34_0 = _mm256_add_ps(y30, y40);
        s56_0 = _mm256_add_ps(y50, y60);

        z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0)));
        z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0));
        z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0));

        s12_0 = _mm256_sub_ps(y10, y20);
        s34_0 = _mm256_sub_ps(y30, y40);
        s56_0 = _mm256_sub_ps(y50, y60);

        z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0)));
        z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0));
        z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0));

        __m256 vbias = _mm256_set1_ps(bias);
        z00 = _mm256_add_ps(vbias, z00);
        z10 = _mm256_add_ps(vbias, z10);
        z20 = _mm256_add_ps(vbias, z20);
        z30 = _mm256_add_ps(vbias, z30);
        z40 = _mm256_add_ps(vbias, z40);
        z50 = _mm256_add_ps(vbias, z50);
    }

    if (bpptr)
    {
        z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
        z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep));
        z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2));
        z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3));
        z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4));
        z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5));
    }

    if (ifMinMaxAct)
    {
        __m256 vmax = _mm256_set1_ps(maxval);
        __m256 vmin = _mm256_set1_ps(minval);

        z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax);
        z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax);
        z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax);
        z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax);
        z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax);
        z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax);
    }

    __m128 lowM, highM;
    STORE6_ELE_FROM_16(outptr, z00, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM);
    STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM);
    _mm256_zeroupper();
}

#endif
} // namespace opt_AVX2
} // namespace dnn
} // namespace cv

@ -1,567 +0,0 @@ |
|||||||
// This file is part of OpenCV project.
|
|
||||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
||||||
// of this distribution and at http://opencv.org/license.html.
|
|
||||||
|
|
||||||
#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP |
|
||||||
#define OPENCV_FAST_CONVOLUTION_SIMD_HPP |
|
||||||
|
|
||||||
#include "opencv2/core/hal/intrin.hpp" |
|
||||||
#include <opencv2/core/utils/logger.hpp> |
|
||||||
|
|
||||||
namespace cv { |
|
||||||
namespace dnn { |
|
||||||
|
|
||||||
static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, |
|
||||||
const float minval, const float maxval, bool ifMinMaxAct, const int outLen) |
|
||||||
{ |
|
||||||
std::vector<float> cbuffer(outLen, 0); |
|
||||||
float* cbuf = cbuffer.data(); |
|
||||||
for( int p = 0; p < np; p++ ) |
|
||||||
{ |
|
||||||
float ai = a[p]; |
|
||||||
for( int j = 0; j < outLen; j++ ) |
|
||||||
cbuf[j] += b[CONV_NR*p + j] * ai; |
|
||||||
} |
|
||||||
|
|
||||||
if (init_c) |
|
||||||
{ |
|
||||||
for(int j = 0; j < outLen; j++) |
|
||||||
{ |
|
||||||
c[j] += cbuf[j] + bias; |
|
||||||
if (ifMinMaxAct) |
|
||||||
c[j] = std::min(std::max(c[j], minval), maxval); |
|
||||||
} |
|
||||||
} |
|
||||||
else |
|
||||||
{ |
|
||||||
for(int j = 0; j < outLen; j++) |
|
||||||
{ |
|
||||||
c[j] = cbuf[j] + bias; |
|
||||||
if (ifMinMaxAct) |
|
||||||
c[j] = std::min(std::max(c[j], minval), maxval); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen)
{
#if CV_SIMD128
    // outLen is the number of valid output values within a CONV_NR-wide block.
    // When outLen is small (<= CONV_NR/3), fall back to the no-SIMD branch.
    const int CONV_NRby3 = CONV_NR/3;
    if (outLen > CONV_NRby3)
    {
        v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; // CONV_NR == 12
#if CONV_NR == 28 || CONV_NR == 24
        v_float32x4 c3 = c0, c4 = c0, c5 = c0;
#endif
#if CONV_NR == 28
        v_float32x4 c6 = c0;
#endif
        for (int p = 0; p < np; p++, a++, b += CONV_NR)
        {
            v_float32x4 a0 = v_setall_f32(a[0]);
            v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
#if CONV_NR == 28 || CONV_NR == 24
            v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
#endif
#if CONV_NR == 28
            v_float32x4 b6 = v_load(b + 24);
#endif

            c0 = v_fma(b0, a0, c0);
            c1 = v_fma(b1, a0, c1);
            c2 = v_fma(b2, a0, c2);
#if CONV_NR == 28 || CONV_NR == 24
            c3 = v_fma(b3, a0, c3);
            c4 = v_fma(b4, a0, c4);
            c5 = v_fma(b5, a0, c5);
#endif
#if CONV_NR == 28
            c6 = v_fma(b6, a0, c6);
#endif
        }

        if (init_c)
        {
            c0 += v_load(c);
            c1 += v_load(c + 4);
            c2 += v_load(c + 8);
#if CONV_NR == 28 || CONV_NR == 24
            c3 += v_load(c + 12);
            c4 += v_load(c + 16);
            c5 += v_load(c + 20);
#endif
#if CONV_NR == 28
            c6 += v_load(c + 24);
#endif
        }

        if (ifMinMaxAct)
        {
            v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
            c0 = v_min(v_max(c0, vmin), vmax);
            c1 = v_min(v_max(c1, vmin), vmax);
            c2 = v_min(v_max(c2, vmin), vmax);
#if CONV_NR == 28 || CONV_NR == 24
            c3 = v_min(v_max(c3, vmin), vmax);
            c4 = v_min(v_max(c4, vmin), vmax);
            c5 = v_min(v_max(c5, vmin), vmax);
#endif
#if CONV_NR == 28
            c6 = v_min(v_max(c6, vmin), vmax);
#endif
        }

        v_store(c, c0);
        v_store(c + 4, c1);
        v_store(c + 8, c2);
#if CONV_NR == 28 || CONV_NR == 24
        v_store(c + 12, c3);
        v_store(c + 16, c4);
        v_store(c + 20, c5);
#endif
#if CONV_NR == 28
        v_store(c + 24, c6);
#endif
    }
    else
        convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen);
#else
    convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen);
#endif
}
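For readers new to OpenCV's universal intrinsics, everything in convBlockMR1 is built from one pattern: broadcast a single input value with `v_setall_f32`, then fused-multiply-add it into several 4-lane accumulators with `v_fma(b, a, c) == b*a + c`. A minimal sketch, assuming only an OpenCV build where `CV_SIMD128` is defined:

```cpp
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD128
    using namespace cv;
    float b[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float out[8];
    v_float32x4 c0 = v_setall_f32(0.f), c1 = c0;  // two 4-lane accumulators
    v_float32x4 a0 = v_setall_f32(2.f);           // broadcast one value of a
    c0 = v_fma(v_load(b),     a0, c0);            // c0 += 2 * b[0..3]
    c1 = v_fma(v_load(b + 4), a0, c1);            // c1 += 2 * b[4..7]
    v_store(out, c0);
    v_store(out + 4, c1);
    for (int j = 0; j < 8; j++)
        printf("%g ", out[j]);                    // 2 4 6 8 10 12 14 16
    printf("\n");
#endif
    return 0;
}
```

The `#if CONV_NR == 28 || CONV_NR == 24` blocks above simply repeat this pattern with more accumulators, so one source serves the 12-, 24- and 28-wide register blockings.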
#if CV_SIMD128
#if CONV_MR == 4 && CONV_NR == 24
// 4x24 micro-kernel: each step broadcasts four A values against six 4-lane loads of B,
// keeping the whole 4x24 tile of C in 24 v_float32x4 accumulators.
static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
    v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
    v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
    v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;

    for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
        v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);
        c2 = v_fma(b2, a0, c2);
        c3 = v_fma(b3, a0, c3);
        c4 = v_fma(b4, a0, c4);
        c5 = v_fma(b5, a0, c5);

        a0 = v_setall_f32(a[1]);
        c6 = v_fma(b0, a0, c6);
        c7 = v_fma(b1, a0, c7);
        c8 = v_fma(b2, a0, c8);
        c9 = v_fma(b3, a0, c9);
        c10 = v_fma(b4, a0, c10);
        c11 = v_fma(b5, a0, c11);

        a0 = v_setall_f32(a[2]);
        c12 = v_fma(b0, a0, c12);
        c13 = v_fma(b1, a0, c13);
        c14 = v_fma(b2, a0, c14);
        c15 = v_fma(b3, a0, c15);
        c16 = v_fma(b4, a0, c16);
        c17 = v_fma(b5, a0, c17);

        a0 = v_setall_f32(a[3]);
        c18 = v_fma(b0, a0, c18);
        c19 = v_fma(b1, a0, c19);
        c20 = v_fma(b2, a0, c20);
        c21 = v_fma(b3, a0, c21);
        c22 = v_fma(b4, a0, c22);
        c23 = v_fma(b5, a0, c23);
    }

    if (!init_c)
    {
        c0 += v_load(c);
        c1 += v_load(c + 4);
        c2 += v_load(c + 8);
        c3 += v_load(c + 12);
        c4 += v_load(c + 16);
        c5 += v_load(c + 20);

        c6 += v_load(c + ldc);
        c7 += v_load(c + ldc + 4);
        c8 += v_load(c + ldc + 8);
        c9 += v_load(c + ldc + 12);
        c10 += v_load(c + ldc + 16);
        c11 += v_load(c + ldc + 20);

        c12 += v_load(c + ldc*2);
        c13 += v_load(c + ldc*2 + 4);
        c14 += v_load(c + ldc*2 + 8);
        c15 += v_load(c + ldc*2 + 12);
        c16 += v_load(c + ldc*2 + 16);
        c17 += v_load(c + ldc*2 + 20);

        c18 += v_load(c + ldc*3);
        c19 += v_load(c + ldc*3 + 4);
        c20 += v_load(c + ldc*3 + 8);
        c21 += v_load(c + ldc*3 + 12);
        c22 += v_load(c + ldc*3 + 16);
        c23 += v_load(c + ldc*3 + 20);
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + 8, c2);
    v_store(c + 12, c3);
    v_store(c + 16, c4);
    v_store(c + 20, c5);

    v_store(c + ldc, c6);
    v_store(c + ldc + 4, c7);
    v_store(c + ldc + 8, c8);
    v_store(c + ldc + 12, c9);
    v_store(c + ldc + 16, c10);
    v_store(c + ldc + 20, c11);

    v_store(c + ldc * 2, c12);
    v_store(c + ldc * 2 + 4, c13);
    v_store(c + ldc * 2 + 8, c14);
    v_store(c + ldc * 2 + 12, c15);
    v_store(c + ldc * 2 + 16, c16);
    v_store(c + ldc * 2 + 20, c17);

    v_store(c + ldc * 3, c18);
    v_store(c + ldc * 3 + 4, c19);
    v_store(c + ldc * 3 + 8, c20);
    v_store(c + ldc * 3 + 12, c21);
    v_store(c + ldc * 3 + 16, c22);
    v_store(c + ldc * 3 + 20, c23);
}
#endif
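One convention worth flagging for reviewers: the `init_c` flag is read with opposite polarity in the two kernel families. In the `convBlock*` GEMM kernels here, `init_c == true` means "first panel, overwrite c", so the `if (!init_c)` blocks are the ones that accumulate the previous contents of `c`; in `convBlockMR1`/`convBlockMR1NoSIMD` above, `if (init_c)` is the branch that adds the existing values of `c`.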
static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
    CV_Assert(CONV_NR >= 8);  // this kernel reads 8 columns of each packed b panel
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
    v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0;

    for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 a1 = v_setall_f32(a[1]);
        v_float32x4 a2 = v_setall_f32(a[2]);
        v_float32x4 a3 = v_setall_f32(a[3]);

        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b1, a0, c1);

        c2 = v_fma(b0, a1, c2);
        c3 = v_fma(b1, a1, c3);

        c4 = v_fma(b0, a2, c4);
        c5 = v_fma(b1, a2, c5);

        c6 = v_fma(b0, a3, c6);
        c7 = v_fma(b1, a3, c7);
    }

    if (!init_c)
    {
        c0 += v_load(c);
        c1 += v_load(c + 4);

        c2 += v_load(c + ldc);
        c3 += v_load(c + ldc + 4);

        c4 += v_load(c + ldc*2);
        c5 += v_load(c + ldc*2 + 4);

        c6 += v_load(c + ldc*3);
        c7 += v_load(c + ldc*3 + 4);
    }

    v_store(c, c0);
    v_store(c + 4, c1);
    v_store(c + ldc, c2);
    v_store(c + ldc + 4, c3);
    v_store(c + ldc * 2, c4);
    v_store(c + ldc * 2 + 4, c5);
    v_store(c + ldc * 3, c6);
    v_store(c + ldc * 3 + 4, c7);
}
static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
    CV_Assert(CONV_NR >= 4);
    v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;

    for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
    {
        v_float32x4 a0 = v_setall_f32(a[0]);
        v_float32x4 a1 = v_setall_f32(a[1]);
        v_float32x4 a2 = v_setall_f32(a[2]);
        v_float32x4 a3 = v_setall_f32(a[3]);

        v_float32x4 b0 = v_load(b);

        c0 = v_fma(b0, a0, c0);
        c1 = v_fma(b0, a1, c1);
        c2 = v_fma(b0, a2, c2);
        c3 = v_fma(b0, a3, c3);
    }

    if (!init_c)
    {
        c0 += v_load(c);
        c1 += v_load(c + ldc);
        c2 += v_load(c + ldc*2);
        c3 += v_load(c + ldc*3);
    }

    v_store(c, c0);
    v_store(c + ldc, c1);
    v_store(c + ldc * 2, c2);
    v_store(c + ldc * 3, c3);
}
#endif
static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen)
{
    std::vector<float> cbuffer(CONV_MR * outLen, 0);
    float* cbuf = cbuffer.data();
    for( int p = 0; p < np; p++ )
    {
        for( int i = 0; i < CONV_MR; i++ )
        {
            float ai = a[CONV_MR*p + i];
            for( int j = 0; j < outLen; j++ )
                cbuf[i*outLen + j] += b[CONV_NR*p + j] * ai;
        }
    }

    if (!init_c)
    {
        for(int i = 0; i < CONV_MR; i++)
        {
            for(int j = 0; j < outLen; j++)
                c[i*ldc + j] += cbuf[i*outLen + j];
        }
    }
    else
    {
        for(int i = 0; i < CONV_MR; i++)
        {
            for(int j = 0; j < outLen; j++)
                c[i*ldc + j] = cbuf[i*outLen + j];
        }
    }
}
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen)
{
    // outLen can be anywhere in [1, CONV_NR]; dispatch to the widest kernel it can fill.
#if CV_SIMD128
#if CONV_MR == 4 && CONV_NR == 24
    const int CONV_NRby3 = CONV_NR/3;
    if (outLen > CONV_NRby3)
    {
        convBlock4x24(np, a, b, c, ldc, init_c);
        return;
    }
#endif

    if (outLen <= 8 && outLen > 4)
    {
        convBlock4x8(np, a, b, c, ldc, init_c);
        return;
    }

    if (outLen <= 4 && outLen > 1)
    {
        convBlock4x4(np, a, b, c, ldc, init_c);
        return;
    }
    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen);
#else
    convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen);
#endif
}
} // namespace dnn
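Reading the dispatcher with the default x86 configuration (CONV_MR == 4, CONV_NR == 24, so CONV_NRby3 == 8): outLen in (8, 24] goes to convBlock4x24, (4, 8] to convBlock4x8, (1, 4] to convBlock4x4, and outLen == 1 (or any build without CV_SIMD128) falls through to convBlockNoSIMD.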
namespace opt_NEON
{
#if CV_TRY_NEON
void convBlock_NEON(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
#if CONV_MR == 4 && CONV_NR == 28 // AARCH64
    {
        float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00;
        float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10;
        float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20;
        float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30;

        for( int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR )
        {
            float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
            b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);

            c00 = vfmaq_laneq_f32(c00, b0, a0, 0);
            c01 = vfmaq_laneq_f32(c01, b1, a0, 0);
            c02 = vfmaq_laneq_f32(c02, b2, a0, 0);
            c10 = vfmaq_laneq_f32(c10, b0, a0, 1);
            c11 = vfmaq_laneq_f32(c11, b1, a0, 1);
            c12 = vfmaq_laneq_f32(c12, b2, a0, 1);
            c20 = vfmaq_laneq_f32(c20, b0, a0, 2);
            c21 = vfmaq_laneq_f32(c21, b1, a0, 2);
            c22 = vfmaq_laneq_f32(c22, b2, a0, 2);
            c30 = vfmaq_laneq_f32(c30, b0, a0, 3);
            c31 = vfmaq_laneq_f32(c31, b1, a0, 3);
            c32 = vfmaq_laneq_f32(c32, b2, a0, 3);

            b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20);

            c03 = vfmaq_laneq_f32(c03, b0, a0, 0);
            c04 = vfmaq_laneq_f32(c04, b1, a0, 0);
            c05 = vfmaq_laneq_f32(c05, b2, a0, 0);
            c13 = vfmaq_laneq_f32(c13, b0, a0, 1);
            c14 = vfmaq_laneq_f32(c14, b1, a0, 1);
            c15 = vfmaq_laneq_f32(c15, b2, a0, 1);
            c23 = vfmaq_laneq_f32(c23, b0, a0, 2);
            c24 = vfmaq_laneq_f32(c24, b1, a0, 2);
            c25 = vfmaq_laneq_f32(c25, b2, a0, 2);
            c33 = vfmaq_laneq_f32(c33, b0, a0, 3);
            c34 = vfmaq_laneq_f32(c34, b1, a0, 3);
            c35 = vfmaq_laneq_f32(c35, b2, a0, 3);

            b0 = vld1q_f32(b + 24);
            c06 = vfmaq_laneq_f32(c06, b0, a0, 0);
            c16 = vfmaq_laneq_f32(c16, b0, a0, 1);
            c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
            c36 = vfmaq_laneq_f32(c36, b0, a0, 3);
        }

        if (!init_c)
        {
            c00 = vaddq_f32(c00, vld1q_f32(c));
            c01 = vaddq_f32(c01, vld1q_f32(c + 4));
            c02 = vaddq_f32(c02, vld1q_f32(c + 8));
            c03 = vaddq_f32(c03, vld1q_f32(c + 12));
            c04 = vaddq_f32(c04, vld1q_f32(c + 16));
            c05 = vaddq_f32(c05, vld1q_f32(c + 20));
            c06 = vaddq_f32(c06, vld1q_f32(c + 24));

            c10 = vaddq_f32(c10, vld1q_f32(c + ldc));
            c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4));
            c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8));
            c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12));
            c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16));
            c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20));
            c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24));

            c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2));
            c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4));
            c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8));
            c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12));
            c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16));
            c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20));
            c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24));

            c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3));
            c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4));
            c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8));
            c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12));
            c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16));
            c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20));
            c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24));
        }

        vst1q_f32(c, c00); vst1q_f32(c+4, c01);
        vst1q_f32(c+8, c02); vst1q_f32(c+12, c03);
        vst1q_f32(c+16, c04); vst1q_f32(c+20, c05);
        vst1q_f32(c+24, c06);

        vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11);
        vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13);
        vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15);
        vst1q_f32(c+ldc+24, c16);

        vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21);
        vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23);
        vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25);
        vst1q_f32(c+ldc*2+24, c26);

        vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31);
        vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33);
        vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35);
        vst1q_f32(c+ldc*3+24, c36);
    }
#elif CONV_MR == 4 && CONV_NR == 12 // ARMv7
    {
        float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0;
        float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3;
        float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6;
        float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9;

        float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);

        for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
        {
            a0 = vld1_f32(a), a1 = vld1_f32(a+2);
            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);

            c0 = vmlaq_lane_f32(c0, b0, a0, 0);
            c1 = vmlaq_lane_f32(c1, b1, a0, 0);
            c2 = vmlaq_lane_f32(c2, b2, a0, 0);

            c3 = vmlaq_lane_f32(c3, b0, a0, 1);
            c4 = vmlaq_lane_f32(c4, b1, a0, 1);
            c5 = vmlaq_lane_f32(c5, b2, a0, 1);

            c6 = vmlaq_lane_f32(c6, b0, a1, 0);
            c7 = vmlaq_lane_f32(c7, b1, a1, 0);
            c8 = vmlaq_lane_f32(c8, b2, a1, 0);

            c9 = vmlaq_lane_f32(c9 , b0, a1, 1);
            c10 = vmlaq_lane_f32(c10, b1, a1, 1);
            c11 = vmlaq_lane_f32(c11, b2, a1, 1);
        }

        if (!init_c)
        {
            c0 = vaddq_f32(c0, vld1q_f32(c));
            c1 = vaddq_f32(c1, vld1q_f32(c + 4));
            c2 = vaddq_f32(c2, vld1q_f32(c + 8));

            c3 = vaddq_f32(c3, vld1q_f32(c + ldc));
            c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4));
            c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8));

            c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2));
            c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4));
            c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8));

            c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3));
            c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4));
            c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8));
        }

        vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2);
        vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5);
        vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8);
        vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11);
    }
//#else
//#error "unsupported CONV_MR and/or CONV_NR in convBlock_NEON."
#endif
}
#endif
} // namespace opt_NEON

} // namespace cv

#endif //OPENCV_FAST_CONVOLUTION_SIMD_HPP
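As a closing note on the NEON path: the AArch64 branch leans on `vfmaq_laneq_f32(acc, b, a, k)`, which computes `acc + b * a[k]`, so a single `vld1q_f32` of four A values feeds all four output rows without per-row broadcasts; the ARMv7 branch uses `vmlaq_lane_f32`, its non-fused 2-lane analogue. A hedged, AArch64-only illustration of the lane-broadcast FMA:

```cpp
#include <arm_neon.h>
#include <cstdio>

int main()
{
    float av[4] = {1.f, 2.f, 3.f, 4.f};     // four packed A values
    float bv[4] = {10.f, 20.f, 30.f, 40.f};
    float32x4_t a0 = vld1q_f32(av);
    float32x4_t b0 = vld1q_f32(bv);
    float32x4_t c0 = vdupq_n_f32(0.f);
    c0 = vfmaq_laneq_f32(c0, b0, a0, 2);    // c0 += b0 * a0[2], i.e. b0 * 3
    float out[4];
    vst1q_f32(out, c0);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 30 60 90 120
    return 0;
}
```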