diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index e9ac5576d..60cd92519 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -169,7 +169,7 @@ public:
             for( k = 0; k < vecsize; k += 4 )
             {
-                vfloat32x4 v = v_load_aligned(sptr + k);
+                vfloat32x4 v = v_load(sptr + k);
                 vs0 += v*v_load_aligned(wptr + k);
                 vs1 += v*v_load_aligned(wptr + wstep + k);
                 vs2 += v*v_load_aligned(wptr + wstep*2 + k);
diff --git a/modules/dnn/src/layers/layers_common.avx2.cpp b/modules/dnn/src/layers/layers_common.avx2.cpp
index 89650778a..2c6ff6988 100644
--- a/modules/dnn/src/layers/layers_common.avx2.cpp
+++ b/modules/dnn/src/layers/layers_common.avx2.cpp
@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
         for( int k = 0; k < vecsize; k += 8, wptr += 8 )
         {
-            __m256 v = _mm256_load_ps(vec + k);
+            __m256 v = _mm256_loadu_ps(vec + k);
             vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
             vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
         for( int k = 0; k < vecsize; k += 8, wptr += 8 )
        {
-            __m256 v = _mm256_load_ps(vec + k);
+            __m256 v = _mm256_loadu_ps(vec + k);
             vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
         }
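
Note on the change: only the loads of the input vector (`sptr` in the universal-intrinsics path, `vec` in the AVX2 path) are switched to their unaligned counterparts (`v_load`, `_mm256_loadu_ps`). That pointer can land in the middle of a larger buffer and so carries no 16/32-byte alignment guarantee, while the aligned loads from `wptr` are kept, presumably because the weight rows are allocated with the required alignment.

Below is a minimal standalone sketch (not part of the patch; the buffer and variable names are made up for illustration) showing why the aligned load is unsafe here. It assumes an AVX-capable compiler and CPU (build with -mavx):

#include <immintrin.h>
#include <cstdio>

int main()
{
    // 32-byte-aligned backing buffer, as a weights allocation might be.
    alignas(32) float buf[16];
    for (int i = 0; i < 16; i++)
        buf[i] = (float)i;

    // A pointer one element into the buffer is only 4-byte aligned,
    // just like an input-vector pointer into the middle of a tensor row.
    const float* vec = buf + 1;

    __m256 v = _mm256_loadu_ps(vec);      // OK: unaligned load always works
    // __m256 bad = _mm256_load_ps(vec);  // would fault (#GP): address is
                                          // not on a 32-byte boundary

    float out[8];
    _mm256_storeu_ps(out, v);
    for (int i = 0; i < 8; i++)
        printf("%g ", out[i]);            // prints 1 2 3 4 5 6 7 8
    printf("\n");
    return 0;
}

On current x86 hardware `_mm256_loadu_ps` costs the same as `_mm256_load_ps` when the address happens to be aligned, so the fix removes the crash without a measurable penalty on the already-aligned case.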