30 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
31 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
33 if (0 == height % 12) {
34 for (cnt = (height / 12); cnt--;) {
36 src0, src1, src2, src3, src4, src5, src6, src7);
37 src += (8 * src_stride);
39 out0 = __msa_copy_u_d((v2i64) src0, 0);
40 out1 = __msa_copy_u_d((v2i64) src1, 0);
41 out2 = __msa_copy_u_d((v2i64) src2, 0);
42 out3 = __msa_copy_u_d((v2i64) src3, 0);
43 out4 = __msa_copy_u_d((v2i64) src4, 0);
44 out5 = __msa_copy_u_d((v2i64) src5, 0);
45 out6 = __msa_copy_u_d((v2i64) src6, 0);
46 out7 = __msa_copy_u_d((v2i64) src7, 0);
48 SD4(out0, out1, out2, out3, dst, dst_stride);
49 dst += (4 * dst_stride);
50 SD4(out4, out5, out6, out7, dst, dst_stride);
51 dst += (4 * dst_stride);
53 LD_UB4(src, src_stride, src0, src1, src2, src3);
54 src += (4 * src_stride);
56 out0 = __msa_copy_u_d((v2i64) src0, 0);
57 out1 = __msa_copy_u_d((v2i64) src1, 0);
58 out2 = __msa_copy_u_d((v2i64) src2, 0);
59 out3 = __msa_copy_u_d((v2i64) src3, 0);
61 SD4(out0, out1, out2, out3, dst, dst_stride);
62 dst += (4 * dst_stride);
64 }
else if (0 == height % 8) {
65 for (cnt = height >> 3; cnt--;) {
67 src0, src1, src2, src3, src4, src5, src6, src7);
68 src += (8 * src_stride);
70 out0 = __msa_copy_u_d((v2i64) src0, 0);
71 out1 = __msa_copy_u_d((v2i64) src1, 0);
72 out2 = __msa_copy_u_d((v2i64) src2, 0);
73 out3 = __msa_copy_u_d((v2i64) src3, 0);
74 out4 = __msa_copy_u_d((v2i64) src4, 0);
75 out5 = __msa_copy_u_d((v2i64) src5, 0);
76 out6 = __msa_copy_u_d((v2i64) src6, 0);
77 out7 = __msa_copy_u_d((v2i64) src7, 0);
79 SD4(out0, out1, out2, out3, dst, dst_stride);
80 dst += (4 * dst_stride);
81 SD4(out4, out5, out6, out7, dst, dst_stride);
82 dst += (4 * dst_stride);
84 }
else if (0 == height % 4) {
85 for (cnt = (height / 4); cnt--;) {
86 LD_UB4(src, src_stride, src0, src1, src2, src3);
87 src += (4 * src_stride);
88 out0 = __msa_copy_u_d((v2i64) src0, 0);
89 out1 = __msa_copy_u_d((v2i64) src1, 0);
90 out2 = __msa_copy_u_d((v2i64) src2, 0);
91 out3 = __msa_copy_u_d((v2i64) src3, 0);
93 SD4(out0, out1, out2, out3, dst, dst_stride);
94 dst += (4 * dst_stride);
96 }
else if (0 == height % 2) {
97 for (cnt = (height / 2); cnt--;) {
98 LD_UB2(src, src_stride, src0, src1);
99 src += (2 * src_stride);
100 out0 = __msa_copy_u_d((v2i64) src0, 0);
101 out1 = __msa_copy_u_d((v2i64) src1, 0);
115 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
117 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
118 src += (8 * src_stride);
119 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
120 dst += (8 * dst_stride);
121 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
122 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
131 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
133 for (cnt = (width >> 4); cnt--;) {
137 for (loop_cnt = (height >> 3); loop_cnt--;) {
138 LD_UB8(src_tmp, src_stride,
139 src0, src1, src2, src3, src4, src5, src6, src7);
140 src_tmp += (8 * src_stride);
142 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
143 dst_tmp, dst_stride);
144 dst_tmp += (8 * dst_stride);
157 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
159 if (0 == height % 12) {
160 for (cnt = (height / 12); cnt--;) {
162 src0, src1, src2, src3, src4, src5, src6, src7);
163 src += (8 * src_stride);
164 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
166 dst += (8 * dst_stride);
168 LD_UB4(src, src_stride, src0, src1, src2, src3);
169 src += (4 * src_stride);
170 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
171 dst += (4 * dst_stride);
173 }
else if (0 == height % 8) {
175 }
else if (0 == height % 4) {
176 for (cnt = (height >> 2); cnt--;) {
177 LD_UB4(src, src_stride, src0, src1, src2, src3);
178 src += (4 * src_stride);
180 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
181 dst += (4 * dst_stride);
199 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
201 if (0 == height % 12) {
202 for (cnt = (height / 12); cnt--;) {
203 LD_UB4(src, src_stride, src0, src1, src2, src3);
204 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
205 src += (4 * src_stride);
206 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
207 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
208 dst += (4 * dst_stride);
210 LD_UB4(src, src_stride, src0, src1, src2, src3);
211 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
212 src += (4 * src_stride);
213 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
214 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
215 dst += (4 * dst_stride);
217 LD_UB4(src, src_stride, src0, src1, src2, src3);
218 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
219 src += (4 * src_stride);
220 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
221 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
222 dst += (4 * dst_stride);
224 }
else if (0 == height % 8) {
226 }
else if (0 == height % 4) {
227 for (cnt = (height >> 2); cnt--;) {
228 LD_UB4(src, src_stride, src0, src1, src2, src3);
229 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
230 src += (4 * src_stride);
231 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
232 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
233 dst += (4 * dst_stride);
254 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
256 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
258 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
/*
 * FILT_8TAP_DPADD_S_H: combine four input vectors with four filter vectors.
 *
 * Every vecN / filtN argument is cast to v16i8 before use. The visible
 * statements compute:
 *   tmp0 = dotp_s_h(vec0, filt0), then dpadd_s_h accumulates vec1 * filt1;
 *   tmp1 = dotp_s_h(vec2, filt2), then dpadd_s_h accumulates vec3 * filt3;
 *   tmp0 = adds_s_h(tmp0, tmp1)  -- the two partial sums are joined with the
 *                                   saturating signed halfword add.
 *
 * NOTE(review): the lines declaring tmp0/tmp1 and the line that yields the
 * macro's value are not visible in this excerpt; presumably the result is
 * tmp0 -- confirm against the full file.
 */
261 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
262 filt0, filt1, filt2, filt3) \
266 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
267 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
268 tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \
269 tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \
270 tmp0 = __msa_adds_s_h(tmp0, tmp1); \
/*
 * HORIZ_8TAP_4WID_4VECS_FILT: four-mask, four-filter pass over the source
 * pairs (src0, src1) and (src2, src3), producing out0 and out1.
 *
 * Visible sequence:
 *   - mask0 shuffle of both pairs, DOTP with filt0          -> res0_m, res1_m
 *   - mask1 shuffle of both pairs, DPADD with filt1 into       res0_m, res1_m
 *   - mask2 shuffle of both pairs, DOTP with filt2          -> res2_m, res3_m
 *   - mask3 shuffle of both pairs, DPADD with filt3 into       res2_m, res3_m
 *   - ADDS_SH2_SH joins (res0_m, res2_m) -> out0 and (res1_m, res3_m) -> out1
 *
 * The *_m temporaries are declared inside the macro (v16i8 shuffle results,
 * v8i16 partial sums).
 *
 * NOTE(review): VSHF_B2_SB, DOTP_SB2_SH, DPADD_SB2_SH and ADDS_SH2_SH are
 * defined elsewhere; presumably byte shuffle, signed dot product, dot product
 * with accumulate, and saturating add -- confirm in the helper header.
 * NOTE(review): the end of the parameter list is not visible in this excerpt;
 * out0/out1 are presumably the trailing macro parameters -- confirm.
 */
275 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
276 mask0, mask1, mask2, mask3, \
277 filt0, filt1, filt2, filt3, \
280 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
281 v8i16 res0_m, res1_m, res2_m, res3_m; \
283 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
284 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
285 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
286 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
287 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
288 DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
289 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
290 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
291 ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
/*
 * HORIZ_8TAP_8WID_4VECS_FILT: four-mask, four-filter pass over four source
 * vectors, each shuffled against itself, producing out0..out3.
 *
 * Parameters (all visible in the macro header):
 *   src0..src3   input vectors; every shuffle pairs a vector with itself
 *   mask0..mask3 shuffle control vectors, one per filter vector
 *   filt0..filt3 filter vectors
 *   out0..out3   results, one per source vector
 *
 * Visible sequence (note the order is mask0, mask2, mask1, mask3):
 *   - mask0 shuffles, DOTP with filt0              -> res0_m..res3_m
 *   - mask2 shuffles, DOTP with filt2              -> res4_m..res7_m
 *   - mask1 shuffles, DPADD with filt1 into           res0_m..res3_m
 *   - mask3 shuffles, DPADD with filt3 into           res4_m..res7_m
 *   - ADDS_SH4_SH joins (res0_m, res4_m) .. (res3_m, res7_m) -> out0..out3
 *
 * vec0_m..vec3_m are reused for the mask0 and mask2 shuffles, vec4_m..vec7_m
 * for the mask1 and mask3 shuffles.
 *
 * NOTE(review): VSHF_B2_SB, DOTP_SB4_SH, DPADD_SB4_SH and ADDS_SH4_SH are
 * defined elsewhere; presumably byte shuffle, signed dot product, dot product
 * with accumulate, and saturating add -- confirm in the helper header.
 */
294 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
295 mask0, mask1, mask2, mask3, \
296 filt0, filt1, filt2, filt3, \
297 out0, out1, out2, out3) \
299 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
300 v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
302 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
303 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
304 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
305 res0_m, res1_m, res2_m, res3_m); \
306 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
307 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
308 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
309 res4_m, res5_m, res6_m, res7_m); \
310 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
311 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
312 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
313 res0_m, res1_m, res2_m, res3_m); \
314 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
315 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
316 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
317 res4_m, res5_m, res6_m, res7_m); \
318 ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
319 res7_m, out0, out1, out2, out3); \
/*
 * FILT_4TAP_DPADD_S_H: combine two input vectors with two filter vectors.
 *
 * Both vecN / filtN arguments are cast to v16i8. The visible statements
 * compute tmp0 = dotp_s_h(vec0, filt0) and then accumulate vec1 * filt1 into
 * tmp0 with dpadd_s_h.
 *
 * NOTE(review): the declaration of tmp0 and the line that yields the macro's
 * value are not visible in this excerpt; presumably the result is tmp0 --
 * confirm against the full file.
 */
322 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \
326 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
327 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
/*
 * HORIZ_4TAP_4WID_4VECS_FILT: two-mask, two-filter pass over the source
 * pairs (src0, src1) and (src2, src3), producing out0 and out1.
 *
 * Visible sequence:
 *   - mask0 shuffle of both pairs, DOTP with filt0      -> out0, out1
 *   - mask1 shuffle of both pairs, DPADD with filt1 into   out0, out1
 *
 * The results are accumulated directly in out0/out1; only the shuffle
 * temporaries vec0_m..vec3_m are declared locally.
 *
 * NOTE(review): VSHF_B2_SB, DOTP_SB2_SH and DPADD_SB2_SH are defined
 * elsewhere; presumably byte shuffle, signed dot product, and dot product
 * with accumulate -- confirm in the helper header.
 * NOTE(review): the end of the parameter list is not visible in this excerpt;
 * out0/out1 are presumably the trailing macro parameters -- confirm.
 */
332 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
333 mask0, mask1, filt0, filt1, \
336 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
338 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
339 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
340 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
341 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
/*
 * HORIZ_4TAP_8WID_4VECS_FILT: two-mask, two-filter pass over four source
 * vectors, each shuffled against itself, producing out0..out3.
 *
 * Parameters (all visible in the macro header):
 *   src0..src3   input vectors; every shuffle pairs a vector with itself
 *   mask0, mask1 shuffle control vectors, one per filter vector
 *   filt0, filt1 filter vectors
 *   out0..out3   results, one per source vector
 *
 * Visible sequence:
 *   - mask0 shuffles of src0..src3, DOTP with filt0      -> out0..out3
 *   - mask1 shuffles of src0..src3, DPADD with filt1 into   out0..out3
 *
 * vec0_m..vec3_m are reused for both shuffle rounds.
 *
 * NOTE(review): VSHF_B2_SB, DOTP_SB4_SH and DPADD_SB4_SH are defined
 * elsewhere; presumably byte shuffle, signed dot product, and dot product
 * with accumulate -- confirm in the helper header.
 */
344 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
345 mask0, mask1, filt0, filt1, \
346 out0, out1, out2, out3) \
348 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
350 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
351 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
352 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
353 out0, out1, out2, out3); \
354 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
355 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
356 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
357 out0, out1, out2, out3); \
364 v16u8 mask0, mask1, mask2, mask3,
out;
365 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
366 v8i16
filt, out0, out1;
371 rnd_vec = __msa_fill_h(rnd_val);
374 filt =
LD_SH(filter);
375 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
381 LD_SB4(src, src_stride, src0, src1, src2, src3);
384 mask3, filt0, filt1, filt2, filt3, out0, out1);
388 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
395 v16i8 filt0, filt1, filt2, filt3;
397 v16u8 mask0, mask1, mask2, mask3,
out;
398 v8i16
filt, out0, out1, out2, out3;
403 rnd_vec = __msa_fill_h(rnd_val);
406 filt =
LD_SH(filter);
407 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
413 LD_SB4(src, src_stride, src0, src1, src2, src3);
415 src += (4 * src_stride);
417 mask3, filt0, filt1, filt2, filt3, out0, out1);
418 LD_SB4(src, src_stride, src0, src1, src2, src3);
421 mask3, filt0, filt1, filt2, filt3, out2, out3);
425 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
426 dst += (4 * dst_stride);
428 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
435 v16u8 mask0, mask1, mask2, mask3,
out;
436 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
437 v8i16
filt, out0, out1, out2, out3;
442 rnd_vec = __msa_fill_h(rnd_val);
445 filt =
LD_SH(filter);
446 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
452 LD_SB4(src, src_stride, src0, src1, src2, src3);
454 src += (4 * src_stride);
456 mask3, filt0, filt1, filt2, filt3, out0, out1);
457 LD_SB4(src, src_stride, src0, src1, src2, src3);
459 src += (4 * src_stride);
461 mask3, filt0, filt1, filt2, filt3, out2, out3);
465 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
466 dst += (4 * dst_stride);
468 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
469 dst += (4 * dst_stride);
471 LD_SB4(src, src_stride, src0, src1, src2, src3);
473 src += (4 * src_stride);
475 mask3, filt0, filt1, filt2, filt3, out0, out1);
476 LD_SB4(src, src_stride, src0, src1, src2, src3);
478 src += (4 * src_stride);
480 mask3, filt0, filt1, filt2, filt3, out2, out3);
485 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
486 dst += (4 * dst_stride);
488 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
497 }
else if (8 == height) {
499 }
else if (16 == height) {
509 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
510 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
511 v8i16
filt, out0, out1, out2, out3;
516 rnd_vec = __msa_fill_h(rnd_val);
519 filt =
LD_SH(filter);
520 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
526 LD_SB4(src, src_stride, src0, src1, src2, src3);
529 mask3, filt0, filt1, filt2, filt3, out0, out1,
535 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
544 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
545 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
546 v8i16
filt, out0, out1, out2, out3;
551 rnd_vec = __msa_fill_h(rnd_val);
554 filt =
LD_SH(filter);
555 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
561 for (loop_cnt = (height >> 2); loop_cnt--;) {
562 LD_SB4(src, src_stride, src0, src1, src2, src3);
564 src += (4 * src_stride);
566 mask3, filt0, filt1, filt2, filt3, out0,
572 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
573 dst += (4 * dst_stride);
597 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
598 v8i16
filt, out0, out1, out2, out3;
599 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
604 rnd_vec = __msa_fill_h(rnd_val);
613 filt =
LD_SH(filter);
614 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
623 for (loop_cnt = (height >> 2); loop_cnt--;) {
625 LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
627 src1_ptr += (4 * src_stride);
629 mask3, filt0, filt1, filt2, filt3, out0,
635 ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
636 dst1 += (4 * dst_stride);
639 LD_SB4(src, src_stride, src0, src1, src2, src3);
641 src += (4 * src_stride);
643 mask6, filt0, filt1, filt2, filt3, out0,
648 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
649 dst += (4 * dst_stride);
659 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
660 v16u8 mask0, mask1, mask2, mask3,
out;
661 v8i16
filt, out0, out1, out2, out3;
666 rnd_vec = __msa_fill_h(rnd_val);
669 filt =
LD_SH(filter);
670 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
676 for (loop_cnt = (height >> 1); loop_cnt--;) {
677 LD_SB2(src, src_stride, src0, src2);
678 LD_SB2(src + 8, src_stride, src1, src3);
680 src += (2 * src_stride);
682 mask3, filt0, filt1, filt2, filt3, out0,
701 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
702 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
703 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
705 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
711 rnd_vec = __msa_fill_h(rnd_val);
714 filt =
LD_SH(filter);
715 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
725 for (loop_cnt = (height >> 1); loop_cnt--;) {
726 LD_SB2(src, src_stride, src0, src2);
727 LD_SB2(src + 16, src_stride, src1, src3);
729 src += (2 * src_stride);
730 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
731 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
732 VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
733 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
736 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
737 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
738 VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
739 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
742 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
743 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
744 VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
745 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
746 out0, out8, out2, out9);
748 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
749 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
750 VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
751 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
752 out4, out10, out6, out11);
754 ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
762 ST8x2_UB(out, dst + 16, dst_stride);
778 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
779 v16u8 mask0, mask1, mask2, mask3,
out;
780 v8i16
filt, out0, out1, out2, out3;
785 rnd_vec = __msa_fill_h(rnd_val);
788 filt =
LD_SH(filter);
789 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
795 for (loop_cnt = (height >> 1); loop_cnt--;) {
797 src2 =
LD_SB(src + 16);
798 src3 =
LD_SB(src + 24);
799 src1 = __msa_sldi_b(src2, src0, 8);
803 mask3, filt0, filt1, filt2, filt3, out0,
809 src2 =
LD_SB(src + 16);
810 src3 =
LD_SB(src + 24);
811 src1 = __msa_sldi_b(src2, src0, 8);
817 ST_UB(out, dst + 16);
822 mask3, filt0, filt1, filt2, filt3, out0,
829 ST_UB(out, dst + 16);
840 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
841 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
842 v8i16
filt, out0, out1, out2, out3, out4, out5, out6;
847 rnd_vec = __msa_fill_h(rnd_val);
850 filt =
LD_SH(filter);
851 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
861 for (loop_cnt = height; loop_cnt--;) {
862 LD_SB3(src, 16, src0, src2, src3);
863 src1 = __msa_sldi_b(src2, src0, 8);
866 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
868 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
869 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
872 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
873 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
875 DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
876 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
879 out5 = __msa_dpadd_s_h(out5, vec2, filt3);
881 out2 = __msa_adds_s_h(out2, out5);
883 out6 = __msa_srar_h(out2, rnd_vec);
888 src1 =
LD_SB(src + 40);
890 src1 = (v16i8) __msa_xori_b((v16u8)
src1, 128);
892 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
894 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
895 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
898 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
899 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
901 DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
902 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
905 out5 = __msa_dpadd_s_h(out5, vec2, filt3);
907 out5 = __msa_adds_s_h(out2, out5);
911 ST_UB(out, dst + 16);
913 ST_UB(out, dst + 32);
924 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
925 v16u8 mask0, mask1, mask2, mask3,
out;
926 v8i16
filt, out0, out1, out2, out3;
931 rnd_vec = __msa_fill_h(rnd_val);
934 filt =
LD_SH(filter);
935 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
941 for (loop_cnt = height; loop_cnt--;) {
943 src2 =
LD_SB(src + 16);
944 src3 =
LD_SB(src + 24);
945 src1 = __msa_sldi_b(src2, src0, 8);
949 mask2, mask3, filt0, filt1, filt2, filt3,
950 out0, out1, out2, out3);
956 ST_UB(out, dst + 16);
958 src0 =
LD_SB(src + 32);
959 src2 =
LD_SB(src + 48);
960 src3 =
LD_SB(src + 56);
961 src1 = __msa_sldi_b(src2, src0, 8);
966 mask2, mask3, filt0, filt1, filt2, filt3,
967 out0, out1, out2, out3);
971 ST_UB(out, dst + 32);
973 ST_UB(out, dst + 48);
984 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
985 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
986 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
987 v16i8 src10998, filt0, filt1, filt2, filt3;
989 v8i16
filt, out10, out32;
992 src -= (3 * src_stride);
993 rnd_vec = __msa_fill_h(rnd_val);
995 filt =
LD_SH(filter);
996 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
998 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
999 src += (7 * src_stride);
1001 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1003 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1004 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1008 for (loop_cnt = (height >> 2); loop_cnt--;) {
1009 LD_SB4(src, src_stride, src7, src8, src9, src10);
1010 src += (4 * src_stride);
1012 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1013 src87_r, src98_r, src109_r);
1014 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1017 filt1, filt2, filt3);
1019 filt1, filt2, filt3);
1023 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1024 dst += (4 * dst_stride);
1039 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1040 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1041 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1043 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
1046 src -= (3 * src_stride);
1047 rnd_vec = __msa_fill_h(rnd_val);
1049 filt =
LD_SH(filter);
1050 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1052 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1054 src += (7 * src_stride);
1055 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1057 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1059 for (loop_cnt = (height >> 2); loop_cnt--;) {
1060 LD_SB4(src, src_stride, src7, src8, src9, src10);
1062 src += (4 * src_stride);
1064 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1065 src87_r, src98_r, src109_r);
1067 filt1, filt2, filt3);
1069 filt1, filt2, filt3);
1071 filt1, filt2, filt3);
1073 filt1, filt2, filt3);
1074 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1075 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1078 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1079 dst += (4 * dst_stride);
1097 uint32_t out2, out3;
1098 uint64_t out0, out1;
1099 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
1100 v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1101 v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
1102 v8i16
filt, filt0, filt1, filt2, filt3;
1104 v4i32
mask = { 2, 6, 2, 6 };
1106 src -= (3 * src_stride);
1107 rnd_vec = __msa_fill_h(rnd_val);
1110 filt =
LD_SH(filter);
1111 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1113 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1114 src += (7 * src_stride);
1119 VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
1120 VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
1121 VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
1123 for (loop_cnt = (height >> 1); loop_cnt--;) {
1124 LD_SB2(src, src_stride, src7, src8);
1126 src += (2 * src_stride);
1128 ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
1129 vec01, vec23, vec45, vec67);
1132 ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
1138 VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
1139 ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
1145 PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
1148 out0 = __msa_copy_u_d((v2i64) res0, 0);
1149 out1 = __msa_copy_u_d((v2i64) res1, 0);
1150 out2 = __msa_copy_u_w((v4i32) res2, 0);
1151 out3 = __msa_copy_u_w((v4i32) res2, 1);
1153 SW(out2, (dst + 8));
1156 SW(out3, (dst + 8));
1181 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1182 v16i8 filt0, filt1, filt2, filt3;
1183 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1184 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1185 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1186 v16u8 tmp0, tmp1, tmp2, tmp3;
1187 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1190 src -= (3 * src_stride);
1191 rnd_vec = __msa_fill_h(rnd_val);
1193 filt =
LD_SH(filter);
1194 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1196 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1198 src += (7 * src_stride);
1199 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1201 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1202 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1204 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1206 for (loop_cnt = (height >> 2); loop_cnt--;) {
1207 LD_SB4(src, src_stride, src7, src8, src9, src10);
1209 src += (4 * src_stride);
1211 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1212 src87_r, src98_r, src109_r);
1213 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1214 src87_l, src98_l, src109_l);
1216 filt1, filt2, filt3);
1218 filt1, filt2, filt3);
1220 filt1, filt2, filt3);
1222 filt1, filt2, filt3);
1224 filt1, filt2, filt3);
1226 filt1, filt2, filt3);
1228 filt1, filt2, filt3);
1230 filt1, filt2, filt3);
1231 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1232 SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
1233 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1234 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1235 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1236 out3_r, tmp0, tmp1, tmp2, tmp3);
1238 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1239 dst += (4 * dst_stride);
1264 uint32_t loop_cnt, cnt;
1265 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1266 v16i8 filt0, filt1, filt2, filt3;
1267 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1268 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1269 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1270 v16u8 tmp0, tmp1, tmp2, tmp3;
1271 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1274 src -= (3 * src_stride);
1275 rnd_vec = __msa_fill_h(rnd_val);
1277 filt =
LD_SH(filter);
1278 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1280 for (cnt = (width >> 4); cnt--;) {
1284 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1286 src_tmp += (7 * src_stride);
1287 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1288 src32_r, src54_r, src21_r);
1289 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1290 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1291 src32_l, src54_l, src21_l);
1292 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1294 for (loop_cnt = (height >> 2); loop_cnt--;) {
1295 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1297 src_tmp += (4 * src_stride);
1298 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1299 src87_r, src98_r, src109_r);
1300 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1301 src87_l, src98_l, src109_l);
1303 filt0, filt1, filt2, filt3);
1305 filt0, filt1, filt2, filt3);
1307 filt0, filt1, filt2, filt3);
1309 filt0, filt1, filt2, filt3);
1311 filt0, filt1, filt2, filt3);
1313 filt0, filt1, filt2, filt3);
1315 filt0, filt1, filt2, filt3);
1317 filt0, filt1, filt2, filt3);
1318 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1319 SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
1320 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1321 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1322 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1323 out3_r, tmp0, tmp1, tmp2, tmp3);
1325 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1326 dst_tmp += (4 * dst_stride);
1387 const int8_t *filter_x,
1388 const int8_t *filter_y,
1392 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1393 v8i16 filt0, filt1, filt2, filt3;
1394 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1395 v16i8 mask1, mask2, mask3;
1396 v8i16 filter_vec, const_vec;
1397 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1398 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1399 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1400 v4i32 dst0_r, dst1_r;
1401 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1402 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1403 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1404 v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
1406 src -= ((3 * src_stride) + 3);
1407 filter_vec =
LD_SH(filter_x);
1408 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1410 filter_vec =
LD_SH(filter_y);
1411 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1412 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1414 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1420 const_vec = __msa_ldi_h(128);
1423 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1424 src += (7 * src_stride);
1427 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1428 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1429 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1430 vec8, vec9, vec10, vec11);
1431 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1432 vec12, vec13, vec14, vec15);
1435 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1436 dst30, dst30, dst30, dst30);
1438 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1439 dst41, dst41, dst41, dst41);
1441 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1442 dst52, dst52, dst52, dst52);
1444 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1445 dst63, dst63, dst63, dst63);
1447 ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1448 dst10_r, dst21_r, dst32_r);
1449 dst43_r = __msa_ilvl_h(dst41, dst30);
1450 dst54_r = __msa_ilvl_h(dst52, dst41);
1451 dst65_r = __msa_ilvl_h(dst63, dst52);
1452 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1454 for (loop_cnt = height >> 1; loop_cnt--;) {
1455 LD_SB2(src, src_stride, src7, src8);
1456 src += 2 * src_stride;
1459 VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1460 vec0, vec1, vec2, vec3);
1462 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1463 dst87, dst87, dst87, dst87);
1465 dst76_r = __msa_ilvr_h(dst87, dst66);
1467 filt_h0, filt_h1, filt_h2, filt_h3);
1468 dst87_r = __msa_vshf_h(mask4, dst87, dst87);
1470 filt_h0, filt_h1, filt_h2, filt_h3);
1480 dst += (2 * dst_stride);
1488 dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
1496 const int8_t *filter_x,
1497 const int8_t *filter_y,
1500 uint32_t loop_cnt, cnt;
1503 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1504 v8i16 filt0, filt1, filt2, filt3;
1505 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1506 v16i8 mask1, mask2, mask3;
1507 v8i16 filter_vec, const_vec;
1508 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1509 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1510 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1511 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1512 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1513 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1514 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1515 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1516 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1518 src -= ((3 * src_stride) + 3);
1519 const_vec = __msa_ldi_h(128);
1522 filter_vec =
LD_SH(filter_x);
1523 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1525 filter_vec =
LD_SH(filter_y);
1526 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1527 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1529 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1535 for (cnt = width >> 3; cnt--;) {
1539 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1540 src_tmp += (7 * src_stride);
1544 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1545 vec0, vec1, vec2, vec3);
1546 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1547 vec4, vec5, vec6, vec7);
1548 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1549 vec8, vec9, vec10, vec11);
1550 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1551 vec12, vec13, vec14, vec15);
1553 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1554 dst0, dst0, dst0, dst0);
1556 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1557 dst1, dst1, dst1, dst1);
1559 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1560 dst2, dst2, dst2, dst2);
1562 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1563 dst3, dst3, dst3, dst3);
1565 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1566 vec0, vec1, vec2, vec3);
1567 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1568 vec4, vec5, vec6, vec7);
1569 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1570 vec8, vec9, vec10, vec11);
1572 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1573 dst4, dst4, dst4, dst4);
1575 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1576 dst5, dst5, dst5, dst5);
1578 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1579 dst6, dst6, dst6, dst6);
1581 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1582 dst10_r, dst32_r, dst54_r, dst21_r);
1583 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1584 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1585 dst10_l, dst32_l, dst54_l, dst21_l);
1586 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1588 for (loop_cnt = height >> 1; loop_cnt--;) {
1589 LD_SB2(src_tmp, src_stride, src7, src8);
1591 src_tmp += 2 * src_stride;
1593 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1594 vec0, vec1, vec2, vec3);
1596 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1597 dst7, dst7, dst7, dst7);
1601 filt_h0, filt_h1, filt_h2, filt_h3);
1603 filt_h0, filt_h1, filt_h2, filt_h3);
1607 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1608 vec0, vec1, vec2, vec3);
1610 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1611 dst8, dst8, dst8, dst8);
1615 filt_h0, filt_h1, filt_h2, filt_h3);
1617 filt_h0, filt_h1, filt_h2, filt_h3);
1627 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
1628 dst_tmp += (2 * dst_stride);
1654 const int8_t *filter_x,
1655 const int8_t *filter_y,
1659 filter_x, filter_y, height, 8);
1666 const int8_t *filter_x,
1667 const int8_t *filter_y,
1671 filter_x, filter_y, height, 8);
1674 filter_x, filter_y, height);
1681 const int8_t *filter_x,
1682 const int8_t *filter_y,
1686 filter_x, filter_y, height, 16);
1693 const int8_t *filter_x,
1694 const int8_t *filter_y,
1698 filter_x, filter_y, height, 24);
1705 const int8_t *filter_x,
1706 const int8_t *filter_y,
1710 filter_x, filter_y, height, 32);
1717 const int8_t *filter_x,
1718 const int8_t *filter_y,
1722 filter_x, filter_y, height, 48);
1729 const int8_t *filter_x,
1730 const int8_t *filter_y,
1734 filter_x, filter_y, height, 64);
1741 v16i8 filt0, filt1,
src0,
src1, mask0, mask1, vec0, vec1;
1748 rnd_vec = __msa_fill_h(rnd_val);
1751 filt =
LD_SH(filter);
1756 LD_SB2(src, src_stride, src0, src1);
1758 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1760 res0 = __msa_srar_h(res0, rnd_vec);
1761 res0 = __msa_sat_s_h(res0, 7);
1770 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1771 v8i16
filt, out0, out1;
1777 rnd_vec = __msa_fill_h(rnd_val);
1780 filt =
LD_SH(filter);
1785 LD_SB4(src, src_stride, src0, src1, src2, src3);
1788 filt0, filt1, out0, out1);
1792 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1799 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1801 v8i16
filt, out0, out1, out2, out3;
1806 rnd_vec = __msa_fill_h(rnd_val);
1809 filt =
LD_SH(filter);
1814 LD_SB4(src, src_stride, src0, src1, src2, src3);
1815 src += (4 * src_stride);
1819 filt0, filt1, out0, out1);
1820 LD_SB4(src, src_stride, src0, src1, src2, src3);
1823 filt0, filt1, out2, out3);
1827 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1828 dst += (4 * dst_stride);
1830 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1837 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
1838 v16i8 filt0, filt1, mask0, mask1;
1840 v8i16
filt, out0, out1, out2, out3;
1845 rnd_vec = __msa_fill_h(rnd_val);
1848 filt =
LD_SH(filter);
1853 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1854 src += (8 * src_stride);
1857 filt0, filt1, out0, out1);
1859 filt0, filt1, out2, out3);
1863 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1864 dst += (4 * dst_stride);
1866 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1867 dst += (4 * dst_stride);
1869 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1870 src += (8 * src_stride);
1873 filt0, filt1, out0, out1);
1875 filt0, filt1, out2, out3);
1879 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1880 dst += (4 * dst_stride);
1882 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1892 }
else if (4 == height) {
1894 }
else if (8 == height) {
1896 }
else if (16 == height) {
1908 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1910 v8i16
filt, out0, out1, out2, out3;
1915 rnd_vec = __msa_fill_h(rnd_val);
1918 filt =
LD_SH(filter);
1923 for (loop_cnt = (height >> 2); loop_cnt--;) {
1924 LD_SB4(src, src_stride, src0, src1, src2, src3);
1925 src += (4 * src_stride);
1929 filt1, out0, out1, out2, out3);
1935 ST6x4_UB(out4, out5, dst, dst_stride);
1936 dst += (4 * dst_stride);
1946 v16i8
src0,
src1, filt0, filt1, mask0, mask1;
1948 v8i16
filt, vec0, vec1, vec2, vec3;
1953 rnd_vec = __msa_fill_h(rnd_val);
1955 filt =
LD_SH(filter);
1960 for (loop_cnt = (height >> 1); loop_cnt--;) {
1961 LD_SB2(src, src_stride, src0, src1);
1962 src += (2 * src_stride);
1965 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1966 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
1967 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
1973 dst += (2 * dst_stride);
1983 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1985 v8i16
filt, out0, out1, out2, out3;
1990 rnd_vec = __msa_fill_h(rnd_val);
1993 filt =
LD_SH(filter);
1998 for (loop_cnt = (height >> 2); loop_cnt--;) {
1999 LD_SB4(src, src_stride, src0, src1, src2, src3);
2000 src += (4 * src_stride);
2004 filt1, out0, out1, out2, out3);
2009 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2010 dst += (4 * dst_stride);
2019 if ((2 == height) || (6 == height)) {
2034 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2035 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2038 v8i16
filt, out0, out1, out2, out3, out4, out5;
2047 filt =
LD_SH(filter);
2053 rnd_vec = __msa_fill_h(rnd_val);
2055 for (loop_cnt = (height >> 2); loop_cnt--;) {
2056 LD_SB4(src, src_stride, src0, src1, src2, src3);
2057 src += (4 * src_stride);
2060 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2061 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2062 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2063 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2064 out2, out3, out4, out5);
2065 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2066 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2067 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2068 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2069 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2070 out2, out3, out4, out5);
2078 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2080 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2081 dst += (4 * dst_stride);
2091 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2092 v16i8 filt0, filt1, mask0, mask1;
2093 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2099 rnd_vec = __msa_fill_h(rnd_val);
2102 filt =
LD_SH(filter);
2107 for (loop_cnt = (height >> 2); loop_cnt--;) {
2108 LD_SB4(src, src_stride, src0, src2, src4, src6);
2109 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2110 src += (4 * src_stride);
2114 filt1, out0, out1, out2, out3);
2116 filt1, out4, out5, out6, out7);
2143 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2144 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2145 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2146 v8i16
filt, out0, out1, out2, out3;
2152 rnd_vec = __msa_fill_h(rnd_val);
2155 filt =
LD_SH(filter);
2160 mask11 = mask0 + 10;
2162 for (loop_cnt = (height >> 2); loop_cnt--;) {
2163 LD_SB4(src, src_stride, src0, src2, src4, src6);
2164 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2165 src += (4 * src_stride);
2168 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2169 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2170 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2171 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2172 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2173 out0, out1, out2, out3);
2174 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2175 out0, out1, out2, out3);
2185 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2186 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2187 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2188 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2189 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2190 out0, out1, out2, out3);
2191 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2192 out0, out1, out2, out3);
2203 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2204 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2205 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2206 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2208 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2209 out0, out1, out2, out3);
2210 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2211 out0, out1, out2, out3);
2217 ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2218 dst1 += (4 * dst_stride);
2228 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2229 v16i8 filt0, filt1, mask0, mask1;
2231 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2236 rnd_vec = __msa_fill_h(rnd_val);
2239 filt =
LD_SH(filter);
2244 for (loop_cnt = (height >> 1); loop_cnt--;) {
2246 src2 =
LD_SB(src + 16);
2247 src3 =
LD_SB(src + 24);
2250 src6 =
LD_SB(src + 16);
2251 src7 =
LD_SB(src + 24);
2252 SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
2257 filt0, filt1, out0, out1, out2, out3);
2259 filt0, filt1, out4, out5, out6, out7);
2267 ST_UB(out, dst + 16);
2272 ST_UB(out, dst + 16);
2281 v16i8
src0,
src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2282 v16i8 src2110, src4332, filt0, filt1;
2288 rnd_vec = __msa_fill_h(rnd_val);
2290 filt =
LD_SH(filter);
2293 LD_SB3(src, src_stride, src0, src1, src2);
2294 src += (3 * src_stride);
2296 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2297 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2298 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2299 LD_SB2(src, src_stride, src3, src4);
2300 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2301 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2302 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2304 out10 = __msa_srar_h(out10, rnd_vec);
2305 out10 = __msa_sat_s_h(out10, 7);
2316 v16i8
src0,
src1, src2, src3, src4, src5;
2317 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2318 v16i8 src2110, src4332, filt0, filt1;
2319 v8i16
filt, out10, out32;
2324 rnd_vec = __msa_fill_h(rnd_val);
2326 filt =
LD_SH(filter);
2329 LD_SB3(src, src_stride, src0, src1, src2);
2330 src += (3 * src_stride);
2332 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2334 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2335 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2337 for (loop_cnt = (height >> 2); loop_cnt--;) {
2338 LD_SB3(src, src_stride, src3, src4, src5);
2339 src += (3 * src_stride);
2340 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2341 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2342 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2346 src += (src_stride);
2347 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2348 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2349 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2354 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2355 dst += (4 * dst_stride);
2378 v16u8
src0,
src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
2379 v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
2380 v8i16
filt, filt0, filt1;
2384 rnd_vec = __msa_fill_h(rnd_val);
2387 filt =
LD_SH(filter);
2390 LD_UB3(src, src_stride, src0, src1, src2);
2391 src += (3 * src_stride);
2393 vec0 = (v16u8) __msa_xori_b((v16u8)
src0, 128);
2394 vec1 = (v16u8) __msa_xori_b((v16u8)
src1, 128);
2395 vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
2397 for (loop_cnt = (height >> 2); loop_cnt--;) {
2398 LD_UB4(src, src_stride, src3, src0, src1, src2);
2399 src += (4 * src_stride);
2401 vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
2402 ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
2405 vec0 = __msa_xori_b((v16u8) src0, 128);
2406 ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
2409 vec1 = __msa_xori_b((v16u8) src1, 128);
2410 vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
2413 vec2 = __msa_xori_b((v16u8) src2, 128);
2414 vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
2421 ST6x4_UB(out0, out1, dst, dst_stride);
2422 dst += (4 * dst_stride);
2430 v16i8
src0,
src1, src2, src3, src4;
2431 v8i16 src01, src12, src23, src34, tmp0, tmp1,
filt, filt0, filt1;
2436 rnd_vec = __msa_fill_h(rnd_val);
2439 filt =
LD_SH(filter);
2442 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2444 ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2446 ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2459 uint64_t out0, out1, out2;
2460 v16i8
src0,
src1, src2, src3, src4, src5;
2461 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2462 v8i16
filt, filt0, filt1;
2466 rnd_vec = __msa_fill_h(rnd_val);
2469 filt =
LD_SH(filter);
2472 LD_SB3(src, src_stride, src0, src1, src2);
2473 src += (3 * src_stride);
2476 ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2478 for (loop_cnt = 2; loop_cnt--;) {
2479 LD_SB3(src, src_stride, src3, src4, src5);
2480 src += (3 * src_stride);
2483 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2492 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2493 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2494 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2514 v16i8
src0,
src1, src2, src7, src8, src9, src10;
2515 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2517 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
2521 rnd_vec = __msa_fill_h(rnd_val);
2523 filt =
LD_SH(filter);
2526 LD_SB3(src, src_stride, src0, src1, src2);
2527 src += (3 * src_stride);
2530 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2532 for (loop_cnt = (height >> 2); loop_cnt--;) {
2533 LD_SB4(src, src_stride, src7, src8, src9, src10);
2534 src += (4 * src_stride);
2537 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2538 src72_r, src87_r, src98_r, src109_r);
2543 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2544 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2547 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2548 dst += (4 * dst_stride);
2563 }
else if (6 == height) {
2567 filter, height, rnd_val);
2577 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2578 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2580 v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
2581 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
filt, filt0, filt1;
2582 v4u32
mask = { 2, 6, 2, 6 };
2586 filt =
LD_SH(filter);
2589 rnd_vec = __msa_fill_h(rnd_val);
2593 LD_SB3(src, src_stride, src0, src1, src2);
2594 src += (3 * src_stride);
2597 VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
2599 for (loop_cnt = (height >> 2); loop_cnt--;) {
2600 LD_SB4(src, src_stride, src3, src4, src5, src6);
2601 src += (4 * src_stride);
2604 ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
2605 VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
2606 VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
2608 ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
2609 src21, src43, src54, src65);
2613 ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
2622 ST8x4_UB(out0, out1, dst, dst_stride);
2624 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2625 dst += (4 * dst_stride);
2642 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2643 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2644 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2645 v16u8 tmp0, tmp1, tmp2, tmp3;
2646 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2650 rnd_vec = __msa_fill_h(rnd_val);
2652 filt =
LD_SH(filter);
2655 LD_SB3(src, src_stride, src0, src1, src2);
2656 src += (3 * src_stride);
2659 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2660 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2662 for (loop_cnt = (height >> 2); loop_cnt--;) {
2663 LD_SB4(src, src_stride, src3, src4, src5, src6);
2664 src += (4 * src_stride);
2667 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2668 src32_r, src43_r, src54_r, src65_r);
2669 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2670 src32_l, src43_l, src54_l, src65_l);
2679 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2680 SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
2681 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2682 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2683 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2684 out3_r, tmp0, tmp1, tmp2, tmp3);
2686 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2687 dst += (4 * dst_stride);
2703 uint64_t out0, out1;
2704 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2705 v16i8 src11, filt0, filt1;
2706 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2707 v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2709 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2714 filt =
LD_SH(filter);
2717 rnd_vec = __msa_fill_h(rnd_val);
2720 LD_SB3(src, src_stride, src0, src1, src2);
2722 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2723 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2726 LD_SB3(src + 16, src_stride, src6, src7, src8);
2727 src += (3 * src_stride);
2729 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2731 for (loop_cnt = (height >> 2); loop_cnt--;) {
2733 LD_SB2(src, src_stride, src3, src4);
2735 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2736 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2739 LD_SB2(src + 16, src_stride, src9, src10);
2740 src += (2 * src_stride);
2742 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2755 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2757 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2761 PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2763 out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2764 out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2773 LD_SB2(src, src_stride, src5, src2);
2775 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2776 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2779 LD_SB2(src + 16, src_stride, src11, src8);
2780 src += (2 * src_stride);
2782 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2795 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2797 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2817 uint32_t loop_cnt, cnt;
2819 v16i8
src0,
src1, src2, src3, src4, src6, src7, src8, src9, src10;
2820 v16i8 src10_r, src32_r, src76_r, src98_r;
2821 v16i8 src21_r, src43_r, src87_r, src109_r;
2822 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2823 v16i8 src10_l, src32_l, src76_l, src98_l;
2824 v16i8 src21_l, src43_l, src87_l, src109_l;
2831 rnd_vec = __msa_fill_h(rnd_val);
2833 filt =
LD_SH(filter);
2836 for (cnt = (width >> 5); cnt--;) {
2841 LD_SB3(src_tmp, src_stride, src0, src1, src2);
2844 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2845 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2848 LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
2849 src_tmp += (3 * src_stride);
2852 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2853 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
2855 for (loop_cnt = (height >> 1); loop_cnt--;) {
2857 LD_SB2(src_tmp, src_stride, src3, src4);
2859 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2860 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2869 SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec);
2870 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
2872 ST_UB(out, dst_tmp);
2874 ST_UB(out, dst_tmp + dst_stride);
2883 LD_SB2(src_tmp + 16, src_stride, src9, src10);
2884 src_tmp += (2 * src_stride);
2886 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2887 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
2896 SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec);
2897 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
2899 ST_UB(out, dst_tmp + 16);
2901 ST_UB(out, dst_tmp + 16 + dst_stride);
2903 dst_tmp += 2 * dst_stride;
2923 filter, height, rnd_val, 32);
2930 const int8_t *filter_x,
2931 const int8_t *filter_y,
2934 v16i8
src0,
src1, src2, src3, src4;
2936 v4i32 filt_h0, filt_h1;
2937 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2939 v8i16 filter_vec, const_vec;
2940 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2941 v8i16 dst0, dst1, dst2, dst3, dst4;
2942 v4i32 dst0_r, dst1_r;
2943 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2945 src -= (src_stride + 1);
2947 filter_vec =
LD_SH(filter_x);
2950 filter_vec =
LD_SH(filter_y);
2951 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2952 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2958 const_vec = __msa_ldi_h(128);
2961 LD_SB3(src, src_stride, src0, src1, src2);
2962 src += (3 * src_stride);
2966 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2967 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2968 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2977 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2978 LD_SB2(src, src_stride, src3, src4);
2982 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2986 dst32_r = __msa_ilvr_h(dst3, dst2);
2991 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2995 dst43_r = __msa_ilvr_h(dst4, dst3);
2999 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
3000 dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
3002 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
3011 const int8_t *filter_x,
3012 const int8_t *filter_y,
3015 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3017 v4i32 filt_h0, filt_h1;
3018 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3020 v8i16 filter_vec, const_vec;
3021 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3022 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3023 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
3024 v8i16 out0_r, out1_r;
3025 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3027 src -= (src_stride + 1);
3029 filter_vec =
LD_SH(filter_x);
3032 filter_vec =
LD_SH(filter_y);
3033 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3034 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3040 const_vec = __msa_ldi_h(128);
3043 LD_SB3(src, src_stride, src0, src1, src2);
3044 src += (3 * src_stride);
3048 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3049 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3050 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3059 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3060 LD_SB4(src, src_stride, src3, src4, src5, src6);
3064 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3068 dst32_r = __msa_ilvr_h(dst3, dst2);
3073 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3077 dst43_r = __msa_ilvr_h(dst4, dst3);
3082 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3086 dst10_r = __msa_ilvr_h(dst5, dst4);
3091 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3095 dst21_r = __msa_ilvr_h(dst2, dst5);
3099 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
3102 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3104 ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
3111 const int8_t *filter_x,
3112 const int8_t *filter_y,
3116 v16i8
src0,
src1, src2, src3, src4, src5;
3117 v16i8 src6, src7, src8, src9, src10;
3119 v4i32 filt_h0, filt_h1;
3120 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3122 v8i16 filter_vec, const_vec;
3123 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3124 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3125 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3126 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3127 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3128 v8i16 out0_r, out1_r, out2_r, out3_r;
3130 src -= (src_stride + 1);
3132 filter_vec =
LD_SH(filter_x);
3135 filter_vec =
LD_SH(filter_y);
3136 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3137 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3143 const_vec = __msa_ldi_h(128);
3146 LD_SB3(src, src_stride, src0, src1, src2);
3147 src += (3 * src_stride);
3151 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3152 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3153 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3162 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3164 for (loop_cnt = height >> 3; loop_cnt--;) {
3166 src3, src4, src5, src6, src7, src8, src9, src10);
3167 src += (8 * src_stride);
3172 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3176 dst32_r = __msa_ilvr_h(dst3, dst2);
3181 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3185 dst43_r = __msa_ilvr_h(dst4, dst3);
3190 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3194 dst54_r = __msa_ilvr_h(dst5, dst4);
3199 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3203 dst65_r = __msa_ilvr_h(dst6, dst5);
3208 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3212 dst76_r = __msa_ilvr_h(dst7, dst6);
3217 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3221 dst87_r = __msa_ilvr_h(dst8, dst7);
3226 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3230 dst10_r = __msa_ilvr_h(dst9, dst8);
3235 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3239 dst21_r = __msa_ilvr_h(dst2, dst9);
3244 dst5_r, dst4_r, dst7_r, dst6_r,
3245 out0_r, out1_r, out2_r, out3_r);
3250 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3251 ST4x8_UB(out0_r, out1_r, dst, dst_stride);
3252 dst += (8 * dst_stride);
3260 const int8_t *filter_x,
3261 const int8_t *filter_y,
3266 filter_x, filter_y, height);
3267 }
else if (4 == height) {
3269 filter_x, filter_y, height);
3270 }
else if (0 == (height % 8)) {
3272 filter_x, filter_y, height);
3280 const int8_t *filter_x,
3281 const int8_t *filter_y,
3285 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3287 v4i32 filt_h0, filt_h1;
3288 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3290 v8i16 filter_vec, const_vec;
3291 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3292 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3293 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3294 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3295 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3296 v8i16 out0_r, out1_r, out2_r, out3_r;
3298 src -= (src_stride + 1);
3300 filter_vec =
LD_SH(filter_x);
3303 filter_vec =
LD_SH(filter_y);
3304 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3305 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3311 const_vec = __msa_ldi_h(128);
3314 LD_SB3(src, src_stride, src0, src1, src2);
3315 src += (3 * src_stride);
3319 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3320 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3321 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3333 for (loop_cnt = height >> 2; loop_cnt--;) {
3334 LD_SB4(src, src_stride, src3, src4, src5, src6);
3335 src += (4 * src_stride);
3340 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3351 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3362 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3374 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3386 dst2_l, dst2_r, dst3_l, dst3_r,
3387 out0_r, out1_r, out2_r, out3_r);
3392 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3393 ST6x4_UB(out0_r, out1_r, dst, dst_stride);
3394 dst += (4 * dst_stride);
3402 const int8_t *filter_x,
3403 const int8_t *filter_y,
3406 v16i8
src0,
src1, src2, src3, src4;
3408 v4i32 filt_h0, filt_h1;
3409 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3411 v8i16 filter_vec, const_vec;
3412 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3413 v8i16 dst0, dst1, dst2, dst3, dst4;
3414 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3415 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3416 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3417 v8i16 out0_r, out1_r;
3419 src -= (src_stride + 1);
3421 filter_vec =
LD_SH(filter_x);
3424 filter_vec =
LD_SH(filter_y);
3425 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3426 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3432 const_vec = __msa_ldi_h(128);
3435 LD_SB3(src, src_stride, src0, src1, src2);
3436 src += (3 * src_stride);
3440 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3441 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3442 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3454 LD_SB2(src, src_stride, src3, src4);
3458 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3469 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3479 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3482 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3491 const int8_t *filter_x,
3492 const int8_t *filter_y,
3495 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3497 v4i32 filt_h0, filt_h1;
3498 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3500 v8i16 filter_vec, const_vec;
3501 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3502 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3503 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3504 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3505 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3506 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3507 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3508 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3509 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3511 src -= (src_stride + 1);
3513 filter_vec =
LD_SH(filter_x);
3516 filter_vec =
LD_SH(filter_y);
3517 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3518 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3524 const_vec = __msa_ldi_h(128);
3527 LD_SB3(src, src_stride, src0, src1, src2);
3528 src += (3 * src_stride);
3532 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3533 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3534 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3546 LD_SB2(src, src_stride, src3, src4);
3547 src += (2 * src_stride);
3552 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3564 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3574 LD_SB2(src, src_stride, src5, src6);
3575 src += (2 * src_stride);
3580 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3591 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3601 LD_SB2(src, src_stride, src7, src8);
3602 src += (2 * src_stride);
3607 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3619 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3630 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3631 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3637 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3638 out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
3640 ST8x4_UB(out0_r, out1_r, dst, dst_stride);
3641 dst += (4 * dst_stride);
3649 const int8_t *filter_x,
3650 const int8_t *filter_y,
3654 uint32_t loop_cnt, cnt;
3657 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3659 v4i32 filt_h0, filt_h1;
3660 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3662 v8i16 filter_vec, const_vec;
3663 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3664 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3665 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3666 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3667 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3668 v8i16 out0_r, out1_r, out2_r, out3_r;
3670 src -= (src_stride + 1);
3672 filter_vec =
LD_SH(filter_x);
3675 filter_vec =
LD_SH(filter_y);
3676 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3677 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3683 const_vec = __msa_ldi_h(128);
3686 for (cnt = width >> 3; cnt--;) {
3690 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3691 src_tmp += (3 * src_stride);
3695 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3696 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3697 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3709 for (loop_cnt = height >> 2; loop_cnt--;) {
3710 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3711 src_tmp += (4 * src_stride);
3716 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3728 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3739 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3751 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3763 dst2_l, dst2_r, dst3_l, dst3_r,
3764 out0_r, out1_r, out2_r, out3_r);
3769 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3770 ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
3771 dst_tmp += (4 * dst_stride);
3783 const int8_t *filter_x,
3784 const int8_t *filter_y,
3789 filter_x, filter_y, height);
3790 }
else if (6 == height) {
3792 filter_x, filter_y, height);
3793 }
else if (0 == (height % 4)) {
3795 filter_x, filter_y, height, 8);
3803 const int8_t *filter_x,
3804 const int8_t *filter_y,
3808 filter_x, filter_y, height, 8);
3811 filter_x, filter_y, height);
3818 const int8_t *filter_x,
3819 const int8_t *filter_y,
3823 filter_x, filter_y, height, 16);
3830 const int8_t *filter_x,
3831 const int8_t *filter_y,
3835 filter_x, filter_y, height, 24);
3842 const int8_t *filter_x,
3843 const int8_t *filter_y,
3847 filter_x, filter_y, height, 32);
/*
 * UNI_MC_COPY(WIDTH)
 *
 * Function-generating macro: expands to a definition named
 * ff_hevc_put_hevc_uni_pel_pixels<WIDTH>_8_msa().  The visible part of the
 * generated body forwards (src, src_stride, dst, dst_stride, height) to the
 * file-local helper copy_width<WIDTH>_msa(), so WIDTH must be one for which
 * such a helper exists.
 *
 * NOTE(review): only part of the parameter list and body is visible in this
 * excerpt; confirm the remaining parameters against the full file.
 */
3850 #define UNI_MC_COPY(WIDTH) \
3851 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
3852 ptrdiff_t dst_stride, \
3854 ptrdiff_t src_stride, \
3860 copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
/*
 * UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
 *
 * Function-generating macro: expands to a definition named
 * ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa().
 *
 * In the generated body:
 *   - the coefficient row is selected as ff_hevc_<PEL>_filters[FILT_DIR - 1];
 *     FILT_DIR is pasted in as an expression (the instantiations in this file
 *     pass mx or my), so it must evaluate to >= 1 at run time
 *     (presumably guaranteed by the caller -- TODO confirm);
 *   - the work is delegated to
 *     common_<DIR1>_<TAP>t_<WIDTH>w_msa(src, src_stride, dst, dst_stride,
 *                                       filter, height, 6),
 *     where the trailing 6 occupies the rnd_val parameter of the common_*
 *     helpers.
 */
3873 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3874 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3885 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
3887 common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
3888 filter, height, 6); \
/*
 * qpel, horizontal: each line instantiates
 * ff_hevc_put_hevc_uni_qpel_h<WIDTH>_8_msa(), which indexes
 * ff_hevc_qpel_filters with (mx - 1) and calls common_hz_8t_<WIDTH>w_msa().
 */
3891 UNI_MC(qpel,
h, 4, 8, hz, mx);
3892 UNI_MC(qpel,
h, 8, 8, hz, mx);
3893 UNI_MC(qpel,
h, 12, 8, hz, mx);
3894 UNI_MC(qpel,
h, 16, 8, hz, mx);
3895 UNI_MC(qpel,
h, 24, 8, hz, mx);
3896 UNI_MC(qpel,
h, 32, 8, hz, mx);
3897 UNI_MC(qpel,
h, 48, 8, hz, mx);
3898 UNI_MC(qpel,
h, 64, 8, hz, mx);
/*
 * qpel, vertical: ff_hevc_put_hevc_uni_qpel_v<WIDTH>_8_msa(), indexing
 * ff_hevc_qpel_filters with (my - 1) and calling common_vt_8t_<WIDTH>w_msa().
 */
3900 UNI_MC(qpel, v, 4, 8, vt, my);
3901 UNI_MC(qpel, v, 8, 8, vt, my);
3902 UNI_MC(qpel, v, 12, 8, vt, my);
3903 UNI_MC(qpel, v, 16, 8, vt, my);
3904 UNI_MC(qpel, v, 24, 8, vt, my);
3905 UNI_MC(qpel, v, 32, 8, vt, my);
3906 UNI_MC(qpel, v, 48, 8, vt, my);
3907 UNI_MC(qpel, v, 64, 8, vt, my);
/*
 * epel, horizontal: ff_hevc_put_hevc_uni_epel_h<WIDTH>_8_msa(), indexing
 * ff_hevc_epel_filters with (mx - 1) and calling common_hz_4t_<WIDTH>w_msa().
 * Widths instantiated here are 4, 6, 8, 12, 16, 24 and 32.
 */
3909 UNI_MC(epel,
h, 4, 4, hz, mx);
3910 UNI_MC(epel,
h, 6, 4, hz, mx);
3911 UNI_MC(epel,
h, 8, 4, hz, mx);
3912 UNI_MC(epel,
h, 12, 4, hz, mx);
3913 UNI_MC(epel,
h, 16, 4, hz, mx);
3914 UNI_MC(epel,
h, 24, 4, hz, mx);
3915 UNI_MC(epel,
h, 32, 4, hz, mx);
/*
 * epel, vertical: ff_hevc_put_hevc_uni_epel_v<WIDTH>_8_msa(), indexing
 * ff_hevc_epel_filters with (my - 1) and calling common_vt_4t_<WIDTH>w_msa().
 */
3917 UNI_MC(epel, v, 4, 4, vt, my);
3918 UNI_MC(epel, v, 6, 4, vt, my);
3919 UNI_MC(epel, v, 8, 4, vt, my);
3920 UNI_MC(epel, v, 12, 4, vt, my);
3921 UNI_MC(epel, v, 16, 4, vt, my);
3922 UNI_MC(epel, v, 24, 4, vt, my);
3923 UNI_MC(epel, v, 32, 4, vt, my);
/*
 * UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)
 *
 * Function-generating macro for the two-dimensional case: expands to a
 * definition named ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa().
 *
 * In the generated body:
 *   - filter_x is ff_hevc_<PEL>_filters[mx - 1] and
 *     filter_y is ff_hevc_<PEL>_filters[my - 1]; both mx and my must
 *     therefore be >= 1 (presumably guaranteed by the caller -- TODO confirm);
 *   - the work is delegated to
 *     hevc_<DIR1>_uni_<TAP>t_<WIDTH>w_msa(src, src_stride, dst, dst_stride,
 *                                         filter_x, filter_y, height).
 *
 * NOTE(review): the full parameter list of the generated function is not
 * visible in this excerpt.
 */
3927 #define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
3928 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3939 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
3940 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
3942 hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
3943 dst_stride, filter_x, \
3944 filter_y, height); \
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val, int32_t width)
#define HEVC_PCK_SW_SB2(in0, in1, out)
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
static void hevc_hv_uni_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static const uint8_t mc_filt_mask_arr[16 *3]
static void hevc_hv_uni_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void copy_width24_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
#define XORI_B3_128_SB(...)
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
#define DPADD_SB4_SH(...)
#define SPLATI_H2_SH(...)
static void hevc_hv_uni_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void copy_width48_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define XORI_B2_128_SH(...)
#define XORI_B4_128_UB(...)
#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1, out2, out3)
static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define CLIP_SH_0_255(in)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, uint8_t clip)
#define SPLATI_H4_SH(...)
static void hevc_hv_uni_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
#define CLIP_SW_0_255(in)
static void hevc_hv_uni_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define CLIP_SH2_0_255(in0, in1)
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1)
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define ST8x2_UB(in, pdst, stride)
static const uint16_t mask[17]
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define SPLATI_H2_SB(...)
static void hevc_hv_uni_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define XORI_B7_128_SB(...)
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define XORI_B4_128_SB(...)
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define DPADD_SB2_SH(...)
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define SPLATI_W4_SW(...)
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void copy_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void copy_width12_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val, int32_t width)
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define SPLATI_H4_SB(...)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1, out2, out3)
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void copy_width64_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_uni_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1)
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void copy_width16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define SD4(in0, in1, in2, in3, pdst, stride)
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
static const int8_t filt[NUMTAPS]
#define ST4x8_UB(in0, in1, pdst, stride)
#define ST6x4_UB(in0, in1, pdst, stride)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define ST8x4_UB(in0, in1, pdst, stride)
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define UNI_MC_COPY(WIDTH)
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,filt0, filt1, filt2, filt3)
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void hevc_hv_uni_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define SPLATI_W2_SW(...)
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define ST8x1_UB(in, pdst)
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
#define ST4x2_UB(in, pdst, stride)
static void copy_width32_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)