|
|
|
@ -124,12 +124,12 @@ func ff_h_32x32_rvv, zve32x |
|
|
|
|
vsetvli zero, t0, e8, m2, ta, ma |
|
|
|
|
|
|
|
|
|
.rept 2
|
|
|
|
|
.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
|
|
|
|
.irp n, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
|
|
|
|
lbu t1, (a2) |
|
|
|
|
addi a2, a2, -1 |
|
|
|
|
vmv.v.x v\n, t1 |
|
|
|
|
.endr |
|
|
|
|
.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
|
|
|
|
.irp n, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
|
|
|
|
vse8.v v\n, (a0) |
|
|
|
|
add a0, a0, a1 |
|
|
|
|
.endr |
|
|
|
@ -142,12 +142,12 @@ func ff_h_16x16_rvv, zve32x |
|
|
|
|
addi a2, a2, 15 |
|
|
|
|
vsetivli zero, 16, e8, m1, ta, ma |
|
|
|
|
|
|
|
|
|
.irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 |
|
|
|
|
.irp n, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 |
|
|
|
|
lbu t1, (a2) |
|
|
|
|
addi a2, a2, -1 |
|
|
|
|
vmv.v.x v\n, t1 |
|
|
|
|
.endr |
|
|
|
|
.irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 |
|
|
|
|
.irp n, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 |
|
|
|
|
vse8.v v\n, (a0) |
|
|
|
|
add a0, a0, a1 |
|
|
|
|
.endr |
|
|
|
@ -160,12 +160,12 @@ func ff_h_8x8_rvv, zve32x |
|
|
|
|
addi a2, a2, 7 |
|
|
|
|
vsetivli zero, 8, e8, mf2, ta, ma |
|
|
|
|
|
|
|
|
|
.irp n 8, 9, 10, 11, 12, 13, 14, 15 |
|
|
|
|
.irp n, 8, 9, 10, 11, 12, 13, 14, 15 |
|
|
|
|
lbu t1, (a2) |
|
|
|
|
addi a2, a2, -1 |
|
|
|
|
vmv.v.x v\n, t1 |
|
|
|
|
.endr |
|
|
|
|
.irp n 8, 9, 10, 11, 12, 13, 14 |
|
|
|
|
.irp n, 8, 9, 10, 11, 12, 13, 14 |
|
|
|
|
vse8.v v\n, (a0) |
|
|
|
|
add a0, a0, a1 |
|
|
|
|
.endr |
|
|
|
@ -193,7 +193,7 @@ func ff_tm_32x32_rvv, zve32x |
|
|
|
|
lbu a4, -1(a3) |
|
|
|
|
li t5, 32 |
|
|
|
|
|
|
|
|
|
.irp offset 31, 23, 15, 7 |
|
|
|
|
.irp offset, 31, 23, 15, 7 |
|
|
|
|
vsetvli zero, t5, e16, m4, ta, ma |
|
|
|
|
vle8.v v8, (a3) |
|
|
|
|
vzext.vf2 v28, v8 |
|
|
|
@ -201,7 +201,7 @@ func ff_tm_32x32_rvv, zve32x |
|
|
|
|
tm_sum4 v0, v4, v8, v12, v28, \offset |
|
|
|
|
tm_sum4 v16, v20, v24, v28, v28, (\offset-4) |
|
|
|
|
|
|
|
|
|
.irp n 0, 4, 8, 12, 16, 20, 24, 28 |
|
|
|
|
.irp n, 0, 4, 8, 12, 16, 20, 24, 28 |
|
|
|
|
vmax.vx v\n, v\n, zero |
|
|
|
|
.endr |
|
|
|
|
|
|
|
|
@ -227,12 +227,12 @@ func ff_tm_16x16_rvv, zve32x |
|
|
|
|
tm_sum4 v16, v18, v20, v22, v30, 7 |
|
|
|
|
tm_sum4 v24, v26, v28, v30, v30, 3 |
|
|
|
|
|
|
|
|
|
.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
|
|
|
|
.irp n, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
|
|
|
|
vmax.vx v\n, v\n, zero |
|
|
|
|
.endr |
|
|
|
|
|
|
|
|
|
vsetvli zero, zero, e8, m1, ta, ma |
|
|
|
|
.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 |
|
|
|
|
.irp n, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 |
|
|
|
|
vnclipu.wi v\n, v\n, 0 |
|
|
|
|
vse8.v v\n, (a0) |
|
|
|
|
add a0, a0, a1 |
|
|
|
@ -252,12 +252,12 @@ func ff_tm_8x8_rvv, zve32x |
|
|
|
|
tm_sum4 v16, v17, v18, v19, v28, 7 |
|
|
|
|
tm_sum4 v20, v21, v22, v23, v28, 3 |
|
|
|
|
|
|
|
|
|
.irp n 16, 17, 18, 19, 20, 21, 22, 23 |
|
|
|
|
.irp n, 16, 17, 18, 19, 20, 21, 22, 23 |
|
|
|
|
vmax.vx v\n, v\n, zero |
|
|
|
|
.endr |
|
|
|
|
|
|
|
|
|
vsetvli zero, zero, e8, mf2, ta, ma |
|
|
|
|
.irp n 16, 17, 18, 19, 20, 21, 22 |
|
|
|
|
.irp n, 16, 17, 18, 19, 20, 21, 22 |
|
|
|
|
vnclipu.wi v\n, v\n, 0 |
|
|
|
|
vse8.v v\n, (a0) |
|
|
|
|
add a0, a0, a1 |
|
|
|
@ -276,12 +276,12 @@ func ff_tm_4x4_rvv, zve32x |
|
|
|
|
|
|
|
|
|
tm_sum4 v16, v17, v18, v19, v28, 3 |
|
|
|
|
|
|
|
|
|
.irp n 16, 17, 18, 19 |
|
|
|
|
.irp n, 16, 17, 18, 19 |
|
|
|
|
vmax.vx v\n, v\n, zero |
|
|
|
|
.endr |
|
|
|
|
|
|
|
|
|
vsetvli zero, zero, e8, mf4, ta, ma |
|
|
|
|
.irp n 16, 17, 18 |
|
|
|
|
.irp n, 16, 17, 18 |
|
|
|
|
vnclipu.wi v\n, v\n, 0 |
|
|
|
|
vse8.v v\n, (a0) |
|
|
|
|
add a0, a0, a1 |
|
|
|
|