static void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, int dst_stride,
                                 int src_stride1, int src_stride2, int h)
{
    int stride1_2, stride1_3, stride1_4;
    int stride2_2, stride2_3, stride2_4;

    __asm__ volatile(
32 "slli.d %[stride1_2], %[srcStride1], 1 \n\t"
33 "slli.d %[stride2_2], %[srcStride2], 1 \n\t"
34 "add.d %[stride1_3], %[stride1_2], %[srcStride1] \n\t"
35 "add.d %[stride2_3], %[stride2_2], %[srcStride2] \n\t"
36 "slli.d %[stride1_4], %[stride1_2], 1 \n\t"
37 "slli.d %[stride2_4], %[stride2_2], 1 \n\t"
39 "vld $vr0, %[src1], 0 \n\t"
40 "vldx $vr1, %[src1], %[srcStride1] \n\t"
41 "vldx $vr2, %[src1], %[stride1_2] \n\t"
42 "vldx $vr3, %[src1], %[stride1_3] \n\t"
43 "add.d %[src1], %[src1], %[stride1_4] \n\t"
45 "vld $vr4, %[src2], 0 \n\t"
46 "vldx $vr5, %[src2], %[srcStride2] \n\t"
47 "vldx $vr6, %[src2], %[stride2_2] \n\t"
48 "vldx $vr7, %[src2], %[stride2_3] \n\t"
49 "add.d %[src2], %[src2], %[stride2_4] \n\t"
51 "addi.d %[h], %[h], -4 \n\t"
53 "vavgr.bu $vr0, $vr4, $vr0 \n\t"
54 "vavgr.bu $vr1, $vr5, $vr1 \n\t"
55 "vavgr.bu $vr2, $vr6, $vr2 \n\t"
56 "vavgr.bu $vr3, $vr7, $vr3 \n\t"
57 "vstelm.d $vr0, %[dst], 0, 0 \n\t"
58 "add.d %[dst], %[dst], %[dstStride] \n\t"
59 "vstelm.d $vr1, %[dst], 0, 0 \n\t"
60 "add.d %[dst], %[dst], %[dstStride] \n\t"
61 "vstelm.d $vr2, %[dst], 0, 0 \n\t"
62 "add.d %[dst], %[dst], %[dstStride] \n\t"
63 "vstelm.d $vr3, %[dst], 0, 0 \n\t"
64 "add.d %[dst], %[dst], %[dstStride] \n\t"
    : [dst]"+&r"(dst), [src1]"+&r"(src1), [src2]"+&r"(src2),
      [h]"+&r"(h), [stride1_2]"=&r"(stride1_2),
      [stride1_3]"=&r"(stride1_3), [stride1_4]"=&r"(stride1_4),
      [stride2_2]"=&r"(stride2_2), [stride2_3]"=&r"(stride2_3),
      [stride2_4]"=&r"(stride2_4)
    : [dstStride]"r"(dst_stride), [srcStride1]"r"(src_stride1),
      [srcStride2]"r"(src_stride2)
    : "memory"
    );
}

static void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src1,
                                  const uint8_t *src2, int dst_stride,
                                  int src_stride1, int src_stride2, int h)
{
    int stride1_2, stride1_3, stride1_4;
    int stride2_2, stride2_3, stride2_4;
    int dststride2, dststride3, dststride4;

    __asm__ volatile(
86 "slli.d %[stride1_2], %[srcStride1], 1 \n\t"
87 "slli.d %[stride2_2], %[srcStride2], 1 \n\t"
88 "slli.d %[dststride2], %[dstStride], 1 \n\t"
89 "add.d %[stride1_3], %[stride1_2], %[srcStride1] \n\t"
90 "add.d %[stride2_3], %[stride2_2], %[srcStride2] \n\t"
91 "add.d %[dststride3], %[dststride2], %[dstStride] \n\t"
92 "slli.d %[stride1_4], %[stride1_2], 1 \n\t"
93 "slli.d %[stride2_4], %[stride2_2], 1 \n\t"
94 "slli.d %[dststride4], %[dststride2], 1 \n\t"
96 "vld $vr0, %[src1], 0 \n\t"
97 "vldx $vr1, %[src1], %[srcStride1] \n\t"
98 "vldx $vr2, %[src1], %[stride1_2] \n\t"
99 "vldx $vr3, %[src1], %[stride1_3] \n\t"
100 "add.d %[src1], %[src1], %[stride1_4] \n\t"
102 "vld $vr4, %[src2], 0 \n\t"
103 "vldx $vr5, %[src2], %[srcStride2] \n\t"
104 "vldx $vr6, %[src2], %[stride2_2] \n\t"
105 "vldx $vr7, %[src2], %[stride2_3] \n\t"
106 "add.d %[src2], %[src2], %[stride2_4] \n\t"
108 "addi.d %[h], %[h], -4 \n\t"
110 "vavgr.bu $vr0, $vr4, $vr0 \n\t"
111 "vavgr.bu $vr1, $vr5, $vr1 \n\t"
112 "vavgr.bu $vr2, $vr6, $vr2 \n\t"
113 "vavgr.bu $vr3, $vr7, $vr3 \n\t"
114 "vst $vr0, %[dst], 0 \n\t"
115 "vstx $vr1, %[dst], %[dstStride] \n\t"
116 "vstx $vr2, %[dst], %[dststride2] \n\t"
117 "vstx $vr3, %[dst], %[dststride3] \n\t"
118 "add.d %[dst], %[dst], %[dststride4] \n\t"
    : [dst]"+&r"(dst), [src1]"+&r"(src1), [src2]"+&r"(src2),
      [h]"+&r"(h), [stride1_2]"=&r"(stride1_2),
      [stride1_3]"=&r"(stride1_3), [stride1_4]"=&r"(stride1_4),
      [stride2_2]"=&r"(stride2_2), [stride2_3]"=&r"(stride2_3),
      [stride2_4]"=&r"(stride2_4), [dststride2]"=&r"(dststride2),
      [dststride3]"=&r"(dststride3), [dststride4]"=&r"(dststride4)
    : [dstStride]"r"(dst_stride), [srcStride1]"r"(src_stride1),
      [srcStride2]"r"(src_stride2)
    : "memory"
    );
}

void ff_put_pixels8_8_lsx(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    uint64_t tmp[8];
    int h_8 = h >> 3;
    int res = h & 7;
    ptrdiff_t stride2, stride3, stride4;

    __asm__ volatile(
142 "beqz %[h_8], 2f \n\t"
143 "slli.d %[stride2], %[stride], 1 \n\t"
144 "add.d %[stride3], %[stride2], %[stride] \n\t"
145 "slli.d %[stride4], %[stride2], 1 \n\t"
147 "ld.d %[tmp0], %[src], 0x0 \n\t"
148 "ldx.d %[tmp1], %[src], %[stride] \n\t"
149 "ldx.d %[tmp2], %[src], %[stride2] \n\t"
150 "ldx.d %[tmp3], %[src], %[stride3] \n\t"
151 "add.d %[src], %[src], %[stride4] \n\t"
152 "ld.d %[tmp4], %[src], 0x0 \n\t"
153 "ldx.d %[tmp5], %[src], %[stride] \n\t"
154 "ldx.d %[tmp6], %[src], %[stride2] \n\t"
155 "ldx.d %[tmp7], %[src], %[stride3] \n\t"
156 "add.d %[src], %[src], %[stride4] \n\t"
158 "addi.d %[h_8], %[h_8], -1 \n\t"
160 "st.d %[tmp0], %[dst], 0x0 \n\t"
161 "stx.d %[tmp1], %[dst], %[stride] \n\t"
162 "stx.d %[tmp2], %[dst], %[stride2] \n\t"
163 "stx.d %[tmp3], %[dst], %[stride3] \n\t"
164 "add.d %[dst], %[dst], %[stride4] \n\t"
165 "st.d %[tmp4], %[dst], 0x0 \n\t"
166 "stx.d %[tmp5], %[dst], %[stride] \n\t"
167 "stx.d %[tmp6], %[dst], %[stride2] \n\t"
168 "stx.d %[tmp7], %[dst], %[stride3] \n\t"
169 "add.d %[dst], %[dst], %[stride4] \n\t"
170 "bnez %[h_8], 1b \n\t"
173 "beqz %[res], 4f \n\t"
175 "ld.d %[tmp0], %[src], 0x0 \n\t"
176 "add.d %[src], %[src], %[stride] \n\t"
177 "addi.d %[res], %[res], -1 \n\t"
178 "st.d %[tmp0], %[dst], 0x0 \n\t"
179 "add.d %[dst], %[dst], %[stride] \n\t"
180 "bnez %[res], 3b \n\t"
    : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
      [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
      [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
      [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
      [dst]"+&r"(block), [src]"+&r"(pixels),
      [h_8]"+&r"(h_8), [res]"+&r"(res),
      [stride2]"=&r"(stride2), [stride3]"=&r"(stride3),
      [stride4]"=&r"(stride4)
    : [stride]"r"(line_size)
    : "memory"
    );
}

void ff_put_pixels16_8_lsx(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    int h_4 = h >> 2;
    __m128i src0, src1, src2, src3;
    ptrdiff_t stride2 = line_size << 1;
    ptrdiff_t stride3 = stride2 + line_size;
    ptrdiff_t stride4 = line_size << 2;

    for (int i = 0; i < h_4; i++) {
        src0 = __lsx_vld(pixels, 0);
        src1 = __lsx_vldx(pixels, line_size);
        src2 = __lsx_vldx(pixels, stride2);
        src3 = __lsx_vldx(pixels, stride3);
        pixels += stride4;

        __lsx_vst(src0, block, 0);
        __lsx_vstx(src1, block, line_size);
        __lsx_vstx(src2, block, stride2);
        __lsx_vstx(src3, block, stride3);
        block += stride4;
    }
}
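
/*
 * Note: h_4 = h >> 2 drops any h % 4 remainder, so this variant assumes
 * the height is a multiple of four, as do the LASX loops below.
 */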

void ff_put_pixels8_x2_8_lsx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    put_pixels8_l2_8_lsx(block, pixels, pixels + 1, line_size, line_size, line_size, h);
}

void ff_put_pixels8_y2_8_lsx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    put_pixels8_l2_8_lsx(block, pixels, pixels + line_size, line_size, line_size, line_size, h);
}

void ff_put_pixels16_x2_8_lsx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    put_pixels16_l2_8_lsx(block, pixels, pixels + 1, line_size, line_size, line_size, h);
}

void ff_put_pixels16_y2_8_lsx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    put_pixels16_l2_8_lsx(block, pixels, pixels + line_size, line_size, line_size, line_size, h);
}
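
/*
 * The half-pel wrappers reduce to the two-source average above: x2
 * (horizontal) pairs each pixel with its right neighbour
 * (src2 = pixels + 1), y2 (vertical) with the pixel below
 * (src2 = pixels + line_size).
 */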

void ff_put_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    int h_4 = h >> 2;
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t *)pixels + 1;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvldx(pixels, line_size);
        src2 = __lasx_xvldx(pixels, stride2x);
        src3 = __lasx_xvldx(pixels, stride3x);
        src4 = __lasx_xvld(_src, 0);
        src5 = __lasx_xvldx(_src, line_size);
        src6 = __lasx_xvldx(_src, stride2x);
        src7 = __lasx_xvldx(_src, stride3x);
        /* (averaging, stores, and the stride4x pointer advance are not
           preserved in this excerpt) */
    }
}
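
/*
 * As with the x2 wrappers, the two load streams differ by one byte
 * (pixels vs. pixels + 1); averaging them gives the horizontal half-pel
 * result for four rows per iteration.
 */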

void ff_put_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    int h_4 = h >> 2;
    __m256i src0, src1, src2, src3, src4;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t *)pixels;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(_src, 0);
        src1 = __lasx_xvldx(_src, line_size);
        src2 = __lasx_xvldx(_src, stride2x);
        src3 = __lasx_xvldx(_src, stride3x);
        _src += stride4x;
        src4 = __lasx_xvld(_src, 0);
        /* (pairwise averaging of rows 0..4 and the four stores are not
           preserved in this excerpt) */
    }
}
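
/*
 * Five consecutive rows are loaded (src0..src4) so that each output row
 * can average a row with the one below it; the row held in src4 is
 * re-read as src0 of the next iteration.
 */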

void ff_put_no_rnd_pixels16_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3;
    __m256i sum0, sum1, sum2;

    src0 = __lasx_xvld(pixels, 0);
    src1 = __lasx_xvld(pixels, 1);
    src2 = __lasx_vext2xv_hu_bu(src0);
    src3 = __lasx_vext2xv_hu_bu(src1);
    sum0 = __lasx_xvadd_h(src2, src3);
    sum0 = __lasx_xvaddi_hu(sum0, 1);
    pixels += line_size;

    for (int i = 0; i < h; i++) {
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvld(pixels, 1);
        src2 = __lasx_vext2xv_hu_bu(src0);
        src3 = __lasx_vext2xv_hu_bu(src1);
        sum1 = __lasx_xvadd_h(src2, src3);
        sum2 = __lasx_xvadd_h(sum0, sum1);
        sum2 = __lasx_xvsrani_b_h(sum2, sum2, 2);
        sum0 = __lasx_xvaddi_hu(sum1, 1);
        __lasx_xvstelm_d(sum2, block, 0, 0);
        __lasx_xvstelm_d(sum2, block, 8, 3);
        pixels += line_size;
        block += line_size;
    }
}
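
/*
 * Illustrative scalar model (hypothetical helper, not part of the
 * original file): the xy2 kernels average a 2x2 neighbourhood per output
 * pixel. The vector code keeps each row's horizontal sums
 * (src[x] + src[x + 1]) in 16-bit lanes and carries them into the next
 * iteration, so every source row is summed only once.
 */
static inline void put_pixels_xy2_c_sketch(uint8_t *dst, const uint8_t *src,
                                           int width, ptrdiff_t stride,
                                           int h, int rnd)
{
    /* rnd = 2 models the rounded "put" form, rnd = 1 the no_rnd form */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = (src[x] + src[x + 1] +
                      src[x + stride] + src[x + stride + 1] + rnd) >> 2;
        src += stride;
        dst += stride;
    }
}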

void ff_put_no_rnd_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    int h_4 = h >> 2;
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    int32_t stride2x = line_size << 1;
    int32_t stride3x = stride2x + line_size;
    int32_t stride4x = line_size << 2;
    uint8_t *_src = (uint8_t *)pixels + 1;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvldx(pixels, line_size);
        src2 = __lasx_xvldx(pixels, stride2x);
        src3 = __lasx_xvldx(pixels, stride3x);
        src4 = __lasx_xvld(_src, 0);
        src5 = __lasx_xvldx(_src, line_size);
        src6 = __lasx_xvldx(_src, stride2x);
        src7 = __lasx_xvldx(_src, stride3x);
        /* (averaging, stores, and the stride4x pointer advance are not
           preserved in this excerpt) */
    }
}

void ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    int h_4 = h >> 2;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t *)pixels;

    for (int i = 0; i < h_4; i++) {
        /* (row loads, averaging, and stores are not preserved in this
           excerpt) */
    }
}

void ff_put_no_rnd_pixels8_xy2_8_lsx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    __m128i src0, src1, src2, src3;
    __m128i sum0, sum1, sum2;

    src0 = __lsx_vld(pixels, 0);
    src1 = __lsx_vld(pixels, 1);
    src2 = __lsx_vsllwil_hu_bu(src0, 0);
    src3 = __lsx_vsllwil_hu_bu(src1, 0);
    sum0 = __lsx_vadd_h(src2, src3);
    sum0 = __lsx_vaddi_hu(sum0, 1);
    pixels += line_size;

    for (int i = 0; i < h; i++) {
        src0 = __lsx_vld(pixels, 0);
        src1 = __lsx_vld(pixels, 1);
        src2 = __lsx_vsllwil_hu_bu(src0, 0);
        src3 = __lsx_vsllwil_hu_bu(src1, 0);
        sum1 = __lsx_vadd_h(src2, src3);
        sum2 = __lsx_vadd_h(sum0, sum1);
        sum2 = __lsx_vsrani_b_h(sum2, sum2, 2);
        sum0 = __lsx_vaddi_hu(sum1, 1);
        __lsx_vstelm_d(sum2, block, 0, 0);
        pixels += line_size;
        block += line_size;
    }
}
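
/*
 * vsllwil.hu.bu with a shift of 0 zero-extends the low eight bytes to
 * 16-bit lanes, so this LSX variant emits one 8-pixel row per vstelm.d
 * store; the LASX variants widen 16 bytes with vext2xv.hu.bu instead.
 */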

void ff_put_pixels16_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3;
    __m256i sum0, sum1, sum2;

    src0 = __lasx_xvld(pixels, 0);
    src1 = __lasx_xvld(pixels, 1);
    src2 = __lasx_vext2xv_hu_bu(src0);
    src3 = __lasx_vext2xv_hu_bu(src1);
    sum0 = __lasx_xvadd_h(src2, src3);
    sum0 = __lasx_xvaddi_hu(sum0, 2);
    pixels += line_size;

    for (int i = 0; i < h; i++) {
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvld(pixels, 1);
        src2 = __lasx_vext2xv_hu_bu(src0);
        src3 = __lasx_vext2xv_hu_bu(src1);
        sum1 = __lasx_xvadd_h(src2, src3);
        sum2 = __lasx_xvadd_h(sum0, sum1);
        sum2 = __lasx_xvsrani_b_h(sum2, sum2, 2);
        sum0 = __lasx_xvaddi_hu(sum1, 2);
        __lasx_xvstelm_d(sum2, block, 0, 0);
        __lasx_xvstelm_d(sum2, block, 8, 3);
        pixels += line_size;
        block += line_size;
    }
}
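
/*
 * Only the bias differs between the 16-pixel xy2 variants: carrying +2
 * into the truncating shift yields (A + B + C + D + 2) >> 2 (the rounded
 * put form), while +1 yields (A + B + C + D + 1) >> 2 (the no_rnd form).
 */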

static void common_hv_bil_8w_lasx(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint32_t loop_cnt;
    uint8_t *_src = (uint8_t *)src;
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i src8, src9, sum0, sum1;
    int32_t src_stride_2x = src_stride << 1;
    int32_t src_stride_4x = src_stride << 2;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src_stride_3x = src_stride_2x + src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* (loads of src0..src3 from the first column are not preserved
           in this excerpt) */
        src4 = __lasx_xvldx(_src, src_stride_3x);
        _src += 1;
        src6 = __lasx_xvld(_src, 0);
        /* (loads of src7/src8 from the second column are not preserved) */
        src9 = __lasx_xvldx(_src, src_stride_3x);
        _src += (src_stride_4x - 1);
        src5 = __lasx_xvilvl_b(src9, src4);
        /* (the remaining interleaves and widening adds that build
           sum0/sum1 are not preserved in this excerpt) */
        sum0 = __lasx_xvsrarni_b_h(sum1, sum0, 2);
        __lasx_xvstelm_d(sum0, dst, 0, 0);
        __lasx_xvstelm_d(sum0, dst + dst_stride, 0, 2);
        __lasx_xvstelm_d(sum0, dst + dst_stride_2x, 0, 1);
        __lasx_xvstelm_d(sum0, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
    }
}
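
/*
 * Here xvsrarni.b.h folds the +2 bias into a rounding narrowing shift
 * instead of adding it explicitly; the 0, 2, 1, 3 element order of the
 * four stores matches the cross-lane layout produced by the interleave
 * and pack steps.
 */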

void ff_put_pixels8_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_lasx(pixels, line_size, block, line_size, h);
}