27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
31 #define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \
34 v4i32 out0_r, out1_r, out0_l, out1_l; \
36 ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \
37 ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \
39 out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
40 out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
41 out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
42 out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
44 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
45 CLIP_SW4_0_255(out0_l, out0_r, out1_l, out1_r); \
46 PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \
49 #define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \
50 wgt, rnd, offset, out0, out1, out2, out3) \
52 HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
53 HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
56 #define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, \
59 v4i32 out0_r, out1_r, out0_l, out1_l; \
61 ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \
62 ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \
63 out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
64 out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
65 out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
66 out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
67 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
68 CLIP_SW4_0_255(out0_r, out1_r, out0_l, out1_l); \
69 PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \
72 #define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
73 vec3, wgt, rnd, offset, out0, out1, \
76 HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset, \
78 HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset, \
84 const int16_t *src1_ptr,
95 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96 uint64_t tpd0, tpd1, tpd2, tpd3;
101 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102 v8i16 dst0, dst1, dst2, dst3, weight_vec;
103 v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
105 offset = (offset0 + offset1) << rnd_val;
106 weight0 = weight0 & 0x0000FFFF;
107 weight = weight0 | (weight1 << 16);
109 offset_vec = __msa_fill_w(
offset);
110 weight_vec = (v8i16) __msa_fill_w(
weight);
111 rnd_vec = __msa_fill_w(rnd_val + 1);
114 LW2(src0_ptr, src_stride, tp0, tp1);
116 LD2(src1_ptr, src2_stride, tpd0, tpd1);
119 dst0 = (v8i16) __msa_ilvr_b(
zero,
src0);
123 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
127 dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
128 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
131 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
133 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
139 offset_vec, dst0, dst1);
140 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141 ST_W4(out0, 0, 1, 2, 3,
dst, dst_stride);
142 }
else if (0 ==
height % 8) {
143 for (loop_cnt = (
height >> 3); loop_cnt--;) {
144 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145 src0_ptr += 4 * src_stride;
147 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148 src0_ptr += 4 * src_stride;
150 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151 src1_ptr += (4 * src2_stride);
154 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155 src1_ptr += (4 * src2_stride);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
162 in3, weight_vec, rnd_vec, offset_vec,
163 dst0, dst1, dst2, dst3);
165 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
166 dst += (8 * dst_stride);
173 const int16_t *src1_ptr,
187 uint64_t tp0, tp1, tp2, tp3;
191 v8i16 in0, in1, in2, in3;
192 v8i16 dst0, dst1, dst2, dst3;
193 v4i32 offset_vec, weight_vec, rnd_vec;
195 offset = (offset0 + offset1) << rnd_val;
196 weight0 = weight0 & 0x0000FFFF;
197 weight = weight0 | (weight1 << 16);
199 weight_vec = __msa_fill_w(
weight);
200 offset_vec = __msa_fill_w(
offset);
201 rnd_vec = __msa_fill_w(rnd_val + 1);
203 for (loop_cnt = (
height >> 2); loop_cnt--;) {
204 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
205 src0_ptr += (4 * src_stride);
208 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
209 src1_ptr += (4 * src2_stride);
212 SLLI_4V(dst0, dst1, dst2, dst3, 6);
215 weight_vec, rnd_vec, offset_vec,
216 dst0, dst1, dst2, dst3);
219 ST_H2(out0, 2, 6,
dst + 4, dst_stride);
220 ST_W2(out1, 0, 2,
dst + 2 * dst_stride, dst_stride);
221 ST_H2(out1, 2, 6,
dst + 2 * dst_stride + 4, dst_stride);
222 dst += (4 * dst_stride);
225 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
226 src0_ptr += (4 * src_stride);
229 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
230 src1_ptr += (4 * src2_stride);
233 SLLI_4V(dst0, dst1, dst2, dst3, 6);
236 weight_vec, rnd_vec, offset_vec,
237 dst0, dst1, dst2, dst3);
241 ST_H2(out0, 2, 6,
dst + 4, dst_stride);
242 ST_W2(out1, 0, 2,
dst + 2 * dst_stride, dst_stride);
243 ST_H2(out1, 2, 6,
dst + 2 * dst_stride + 4, dst_stride);
249 const int16_t *src1_ptr,
260 uint64_t tp0, tp1, tp2, tp3;
262 v16u8 out0, out1, out2;
265 v8i16 in0, in1, in2, in3, in4, in5;
266 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
267 v4i32 offset_vec, weight_vec, rnd_vec;
269 offset = (offset0 + offset1) << rnd_val;
270 weight0 = weight0 & 0x0000FFFF;
271 weight = weight0 | (weight1 << 16);
273 offset_vec = __msa_fill_w(
offset);
274 weight_vec = __msa_fill_w(
weight);
275 rnd_vec = __msa_fill_w(rnd_val + 1);
278 LD2(src0_ptr, src_stride, tp0, tp1);
280 LD_SH2(src1_ptr, src2_stride, in0, in1);
285 weight_vec, rnd_vec, offset_vec,
288 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
291 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
292 src0_ptr += 4 * src_stride;
295 LD2(src0_ptr, src_stride, tp0, tp1);
300 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
301 SLLI_4V(dst0, dst1, dst2, dst3, 6);
304 weight_vec, rnd_vec, offset_vec, dst0, dst1,
307 offset_vec, dst4, dst5);
308 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
309 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
310 ST_D2(out2, 0, 1,
dst + 4 * dst_stride, dst_stride);
311 }
else if (0 ==
height % 4) {
314 for (loop_cnt = (
height >> 2); loop_cnt--;) {
315 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
316 src0_ptr += (4 * src_stride);
321 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
322 src1_ptr += (4 * src2_stride);
324 SLLI_4V(dst0, dst1, dst2, dst3, 6);
326 in3, weight_vec, rnd_vec, offset_vec,
327 dst0, dst1, dst2, dst3);
329 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
330 dst += (4 * dst_stride);
337 const int16_t *src1_ptr,
351 v16u8 out0, out1, out2;
353 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
354 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
355 v4i32 offset_vec, weight_vec, rnd_vec;
357 offset = (offset0 + offset1) << rnd_val;
358 weight0 = weight0 & 0x0000FFFF;
359 weight = weight0 | (weight1 << 16);
361 offset_vec = __msa_fill_w(
offset);
362 weight_vec = __msa_fill_w(
weight);
363 rnd_vec = __msa_fill_w(rnd_val + 1);
365 for (loop_cnt = (
height >> 2); loop_cnt--;) {
367 src0_ptr += (4 * src_stride);
368 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
369 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
370 src1_ptr += (4 * src2_stride);
374 dst0, dst1, dst2, dst3);
376 SLLI_4V(dst0, dst1, dst2, dst3, 6);
383 weight_vec, rnd_vec, offset_vec, dst0, dst1,
386 offset_vec, dst4, dst5);
387 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
388 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
389 ST_W4(out2, 0, 1, 2, 3,
dst + 8, dst_stride);
390 dst += (4 * dst_stride);
396 const int16_t *src1_ptr,
409 v16u8 out0, out1, out2, out3;
412 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
413 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
414 v4i32 offset_vec, weight_vec, rnd_vec;
416 offset = (offset0 + offset1) << rnd_val;
417 weight0 = weight0 & 0x0000FFFF;
418 weight = weight0 | (weight1 << 16);
420 offset_vec = __msa_fill_w(
offset);
421 weight_vec = __msa_fill_w(
weight);
422 rnd_vec = __msa_fill_w(rnd_val + 1);
424 for (loop_cnt = (
height >> 2); loop_cnt--;) {
426 src0_ptr += (4 * src_stride);
427 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
428 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
429 src1_ptr += (4 * src2_stride);
430 ILVR_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp0, tmp1,
432 ILVL_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp4, tmp5,
434 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
435 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
437 weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
440 weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
444 ST_UB4(out0, out1, out2, out3,
dst, dst_stride);
445 dst += (4 * dst_stride);
451 const int16_t *src1_ptr,
464 v16u8 out0, out1, out2, out3, out4, out5;
466 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
467 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
468 v4i32 offset_vec, weight_vec, rnd_vec;
470 offset = (offset0 + offset1) << rnd_val;
471 weight0 = weight0 & 0x0000FFFF;
472 weight = weight0 | (weight1 << 16);
474 offset_vec = __msa_fill_w(
offset);
475 weight_vec = __msa_fill_w(
weight);
476 rnd_vec = __msa_fill_w(rnd_val + 1);
478 for (loop_cnt = 8; loop_cnt--;) {
480 LD_SB4(src0_ptr + 16, src_stride,
src2, src3, src6, src7);
481 src0_ptr += (4 * src_stride);
482 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
483 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
484 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
485 src1_ptr += (4 * src2_stride);
493 SLLI_4V(dst0, dst1, dst2, dst3, 6);
494 SLLI_4V(dst4, dst5, dst6, dst7, 6);
495 SLLI_4V(dst8, dst9, dst10, dst11, 6);
497 weight_vec, rnd_vec, offset_vec, dst0, dst1,
500 weight_vec, rnd_vec, offset_vec, dst4, dst5,
503 in11, weight_vec, rnd_vec, offset_vec,
504 dst8, dst9, dst10, dst11);
505 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
506 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
507 ST_UB4(out0, out1, out3, out4,
dst, dst_stride);
508 ST_D4(out2, out5, 0, 1, 0, 1,
dst + 16, dst_stride);
509 dst += (4 * dst_stride);
515 const int16_t *src1_ptr,
528 v16u8 out0, out1, out2, out3;
531 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
532 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
533 v4i32 offset_vec, weight_vec, rnd_vec;
535 offset = (offset0 + offset1) << rnd_val;
536 weight0 = weight0 & 0x0000FFFF;
537 weight = weight0 | (weight1 << 16);
539 offset_vec = __msa_fill_w(
offset);
540 weight_vec = __msa_fill_w(
weight);
541 rnd_vec = __msa_fill_w(rnd_val + 1);
543 for (loop_cnt = (
height >> 1); loop_cnt--;) {
545 src0_ptr += src_stride;
547 src0_ptr += src_stride;
548 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
549 src1_ptr += src2_stride;
550 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
551 src1_ptr += src2_stride;
557 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
558 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
560 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
563 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
576 const int16_t *src1_ptr,
589 v16u8 out0, out1, out2;
592 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
593 v4i32 offset_vec, weight_vec, rnd_vec;
595 offset = (offset0 + offset1) << rnd_val;
596 weight0 = weight0 & 0x0000FFFF;
597 weight = weight0 | (weight1 << 16);
599 offset_vec = __msa_fill_w(
offset);
600 weight_vec = __msa_fill_w(
weight);
601 rnd_vec = __msa_fill_w(rnd_val + 1);
603 for (loop_cnt = 64; loop_cnt--;) {
605 src0_ptr += src_stride;
606 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
607 src1_ptr += src2_stride;
612 SLLI_4V(dst0, dst1, dst2, dst3, 6);
615 weight_vec, rnd_vec, offset_vec, dst0, dst1,
618 offset_vec, dst4, dst5);
619 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
628 const int16_t *src1_ptr,
641 v16u8 out0, out1, out2, out3;
644 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
645 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
646 v4i32 offset_vec, weight_vec, rnd_vec;
648 offset = (offset0 + offset1) << rnd_val;
649 weight0 = weight0 & 0x0000FFFF;
650 weight = weight0 | (weight1 << 16);
652 offset_vec = __msa_fill_w(
offset);
653 weight_vec = __msa_fill_w(
weight);
654 rnd_vec = __msa_fill_w(rnd_val + 1);
656 for (loop_cnt =
height; loop_cnt--;) {
658 src0_ptr += src_stride;
659 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
660 src1_ptr += src2_stride;
662 ILVR_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp0, tmp1,
664 ILVL_B4_SH(
zero,
src0,
zero,
src1,
zero,
src2,
zero, src3, tmp4, tmp5,
666 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
667 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
669 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
672 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
683 const int16_t *src1_ptr,
697 v8i16 filt0, filt1, filt2, filt3;
699 v16i8 mask1, mask2, mask3;
700 v16i8 vec0, vec1, vec2, vec3;
702 v8i16 in0, in1, in2, in3;
703 v8i16 filter_vec, out0, out1;
704 v4i32 weight_vec, offset_vec, rnd_vec;
709 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
715 offset = (offset0 + offset1) << rnd_val;
716 weight0 = weight0 & 0x0000FFFF;
717 weight = weight0 | (weight1 << 16);
718 constant = 128 * weight1;
722 offset_vec = __msa_fill_w(
offset);
723 weight_vec = __msa_fill_w(
weight);
724 rnd_vec = __msa_fill_w(rnd_val + 1);
726 for (loop_cnt = (
height >> 2); loop_cnt--;) {
728 src0_ptr += (4 * src_stride);
729 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
730 src1_ptr += (4 * src2_stride);
735 vec0, vec1, vec2, vec3);
739 vec0, vec1, vec2, vec3);
744 weight_vec, rnd_vec, offset_vec,
747 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
748 ST_W4(out0, 0, 1, 2, 3,
dst, dst_stride);
749 dst += (4 * dst_stride);
755 const int16_t *src1_ptr,
769 v8i16 filt0, filt1, filt2, filt3;
771 v16i8 mask1, mask2, mask3;
772 v16i8 vec0, vec1, vec2, vec3;
773 v8i16 dst0, dst1, dst2, dst3;
774 v8i16 in0, in1, in2, in3;
775 v8i16 filter_vec, out0, out1, out2, out3;
776 v4i32 weight_vec, offset_vec, rnd_vec;
780 offset = (offset0 + offset1) << rnd_val;
781 weight0 = weight0 & 0x0000FFFF;
782 weight = weight0 | (weight1 << 16);
783 constant = 128 * weight1;
787 offset_vec = __msa_fill_w(
offset);
788 weight_vec = __msa_fill_w(
weight);
789 rnd_vec = __msa_fill_w(rnd_val + 1);
792 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
798 for (loop_cnt = (
height >> 2); loop_cnt--;) {
800 src0_ptr += (4 * src_stride);
801 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
802 src1_ptr += (4 * src2_stride);
806 vec0, vec1, vec2, vec3);
810 vec0, vec1, vec2, vec3);
814 vec0, vec1, vec2, vec3);
817 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
818 vec0, vec1, vec2, vec3);
824 weight_vec, rnd_vec, offset_vec,
825 out0, out1, out2, out3);
828 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
829 dst += (4 * dst_stride);
835 const int16_t *src1_ptr,
850 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
851 v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
852 v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
853 v4i32 weight_vec, offset_vec, rnd_vec;
857 weight0 = weight0 & 0x0000FFFF;
858 weight = weight0 | (weight1 << 16);
859 constant = 128 * weight1;
861 offset = (offset0 + offset1) << rnd_val;
864 offset_vec = __msa_fill_w(
offset);
865 weight_vec = __msa_fill_w(
weight);
866 rnd_vec = __msa_fill_w(rnd_val + 1);
869 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
880 for (loop_cnt = 4; loop_cnt--;) {
882 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
896 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
901 weight_vec, rnd_vec, offset_vec, out0, out1, out2,
904 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
907 src0_ptr += (4 * src_stride);
908 LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
909 src1_ptr += (4 * src2_stride);
916 VSHF_B4_SB(
src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
921 offset_vec, out0, out1);
922 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
923 ST_W4(out0, 0, 1, 2, 3,
dst + 8, dst_stride);
924 dst += (4 * dst_stride);
930 const int16_t *src1_ptr,
945 v8i16 in0, in1, in2, in3;
946 v8i16 filt0, filt1, filt2, filt3;
947 v16i8 mask1, mask2, mask3;
948 v8i16 filter_vec, out0, out1, out2, out3;
949 v16i8 vec0, vec1, vec2, vec3;
950 v8i16 dst0, dst1, dst2, dst3;
951 v4i32 weight_vec, offset_vec, rnd_vec;
952 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
955 offset = (offset0 + offset1) << rnd_val;
956 weight0 = weight0 & 0x0000FFFF;
957 weight = weight0 | (weight1 << 16);
958 constant = 128 * weight1;
962 offset_vec = __msa_fill_w(
offset);
963 weight_vec = __msa_fill_w(
weight);
964 rnd_vec = __msa_fill_w(rnd_val + 1);
967 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
973 for (loop_cnt = (
height >> 1); loop_cnt--;) {
975 src0_ptr += src_stride;
977 src0_ptr += src_stride;
978 LD_SH2(src1_ptr, 8, in0, in1);
979 src1_ptr += src2_stride;
980 LD_SH2(src1_ptr, 8, in2, in3);
981 src1_ptr += src2_stride;
985 vec0, vec1, vec2, vec3);
989 vec0, vec1, vec2, vec3);
993 vec0, vec1, vec2, vec3);
996 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
997 vec0, vec1, vec2, vec3);
1003 weight_vec, rnd_vec, offset_vec,
1004 out0, out1, out2, out3);
1008 dst += (2 * dst_stride);
1014 const int16_t *src1_ptr,
1030 v8i16 in0, in1, in2;
1031 v8i16 filt0, filt1, filt2, filt3;
1032 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1033 v16i8 vec0, vec1, vec2, vec3;
1034 v8i16 dst0, dst1, dst2;
1035 v4i32 dst2_r, dst2_l;
1036 v8i16 filter_vec, out0, out1, out2;
1037 v4i32 weight_vec, offset_vec, rnd_vec;
1040 src0_ptr = src0_ptr - 3;
1041 offset = (offset0 + offset1) << rnd_val;
1042 weight0 = weight0 & 0x0000FFFF;
1043 weight = weight0 | (weight1 << 16);
1044 constant = 128 * weight1;
1048 offset_vec = __msa_fill_w(
offset);
1049 weight_vec = __msa_fill_w(
weight);
1050 rnd_vec = __msa_fill_w(rnd_val + 1);
1053 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1064 src0_ptr += src_stride;
1065 LD_SH2(src1_ptr, 8, in0, in1);
1066 in2 =
LD_SH(src1_ptr + 16);
1067 src1_ptr += src2_stride;
1070 for (loop_cnt = 31; loop_cnt--;) {
1072 vec0, vec1, vec2, vec3);
1076 vec0, vec1, vec2, vec3);
1080 vec0, vec1, vec2, vec3);
1085 weight_vec, rnd_vec, offset_vec,
1089 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1090 (v8i16) weight_vec);
1091 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1092 (v8i16) weight_vec);
1095 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1098 src0_ptr += src_stride;
1099 LD_SH2(src1_ptr, 8, in0, in1);
1100 in2 =
LD_SH(src1_ptr + 16);
1101 src1_ptr += src2_stride;
1104 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1106 SD(dst_val0,
dst + 16);
1122 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
1123 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
1126 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1128 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1130 SD(dst_val0,
dst + 16);
1136 const int16_t *src1_ptr,
1151 v8i16 in0, in1, in2, in3;
1152 v8i16 filt0, filt1, filt2, filt3;
1154 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1155 v16i8 vec0, vec1, vec2, vec3;
1156 v8i16 dst0, dst1, dst2, dst3;
1157 v8i16 filter_vec, out0, out1, out2, out3;
1158 v4i32 weight_vec, offset_vec, rnd_vec;
1161 offset = (offset0 + offset1) << rnd_val;
1162 weight0 = weight0 & 0x0000FFFF;
1163 weight = weight0 | (weight1 << 16);
1164 constant = 128 * weight1;
1168 offset_vec = __msa_fill_w(
offset);
1169 weight_vec = __msa_fill_w(
weight);
1170 rnd_vec = __msa_fill_w(rnd_val + 1);
1173 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1183 for (loop_cnt =
height; loop_cnt--;) {
1186 src0_ptr += src_stride;
1187 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1188 src1_ptr += src2_stride;
1193 vec0, vec1, vec2, vec3);
1197 vec0, vec1, vec2, vec3);
1201 vec0, vec1, vec2, vec3);
1205 vec0, vec1, vec2, vec3);
1211 weight_vec, rnd_vec, offset_vec,
1212 out0, out1, out2, out3);
1222 const int16_t *src1_ptr,
1237 v8i16 in0, in1, in2, in3;
1238 v8i16 filt0, filt1, filt2, filt3;
1240 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1241 v16i8 vec0, vec1, vec2, vec3;
1242 v8i16 dst0, dst1, dst2, dst3;
1243 v8i16 filter_vec, out0, out1, out2, out3;
1244 v4i32 weight_vec, offset_vec, rnd_vec;
1247 offset = (offset0 + offset1) << rnd_val;
1248 weight0 = weight0 & 0x0000FFFF;
1249 weight = weight0 | (weight1 << 16);
1250 constant = 128 * weight1;
1254 offset_vec = __msa_fill_w(
offset);
1255 weight_vec = __msa_fill_w(
weight);
1256 rnd_vec = __msa_fill_w(rnd_val + 1);
1259 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1269 for (loop_cnt = 64; loop_cnt--;) {
1272 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1274 LD_SB2(src0_ptr + 32, 8, src3, src4);
1275 src0_ptr += src_stride;
1279 vec0, vec1, vec2, vec3);
1283 vec0, vec1, vec2, vec3);
1287 vec0, vec1, vec2, vec3);
1291 vec0, vec1, vec2, vec3);
1296 weight_vec, rnd_vec, offset_vec,
1297 out0, out1, out2, out3);
1302 LD_SH2(src1_ptr + 32, 8, in2, in3);
1303 src1_ptr += src2_stride;
1305 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1306 vec0, vec1, vec2, vec3);
1309 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1310 vec0, vec1, vec2, vec3);
1315 weight_vec, rnd_vec, offset_vec,
1318 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1326 const int16_t *src1_ptr,
1338 const uint8_t *src0_ptr_tmp;
1340 const int16_t *src1_ptr_tmp;
1341 uint32_t loop_cnt, cnt;
1344 v8i16 in0, in1, in2, in3;
1345 v8i16 filt0, filt1, filt2, filt3;
1347 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1348 v16i8 vec0, vec1, vec2, vec3;
1349 v8i16 dst0, dst1, dst2, dst3;
1350 v8i16 filter_vec, out0, out1, out2, out3;
1351 v4i32 weight_vec, offset_vec, rnd_vec;
1354 offset = (offset0 + offset1) << rnd_val;
1355 weight0 = weight0 & 0x0000FFFF;
1356 weight = weight0 | (weight1 << 16);
1357 constant = 128 * weight1;
1361 offset_vec = __msa_fill_w(
offset);
1362 weight_vec = __msa_fill_w(
weight);
1363 rnd_vec = __msa_fill_w(rnd_val + 1);
1366 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1376 for (loop_cnt =
height; loop_cnt--;) {
1377 src0_ptr_tmp = src0_ptr;
1379 src1_ptr_tmp = src1_ptr;
1381 for (cnt = 2; cnt--;) {
1385 LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1390 vec0, vec1, vec2, vec3);
1394 vec0, vec1, vec2, vec3);
1398 vec0, vec1, vec2, vec3);
1402 vec0, vec1, vec2, vec3);
1408 weight_vec, rnd_vec, offset_vec,
1409 out0, out1, out2, out3);
1412 ST_SH2(out0, out1, dst_tmp, 16);
1416 src0_ptr += src_stride;
1417 src1_ptr += src2_stride;
1425 const int16_t *src1_ptr,
1440 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1441 v16i8 src11, src12, src13, src14;
1442 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1443 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1444 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1445 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1446 v16i8 src2110, src4332, src6554, src8776, src10998;
1447 v16i8 src12111110, src14131312;
1448 v8i16 dst10, dst32, dst54, dst76;
1449 v8i16 filt0, filt1, filt2, filt3;
1450 v8i16 filter_vec, out0, out1, out2, out3;
1451 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1453 src0_ptr -= (3 * src_stride);
1454 offset = (offset0 + offset1) << rnd_val;
1455 weight0 = weight0 & 0x0000FFFF;
1456 weight = weight0 | (weight1 << 16);
1458 const_vec = __msa_ldi_w(128);
1460 offset_vec = __msa_fill_w(
offset);
1461 weight_vec = __msa_fill_w(
weight);
1462 rnd_vec = __msa_fill_w(rnd_val + 1);
1463 weight1_vec = __msa_fill_w(weight1);
1464 offset_vec += const_vec * weight1_vec;
1467 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1470 src0_ptr += (7 * src_stride);
1473 src10_r, src32_r, src54_r, src21_r);
1474 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1475 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1476 src2110, src4332, src6554);
1479 for (loop_cnt = (
height >> 3); loop_cnt--;) {
1480 LD_SB8(src0_ptr, src_stride,
1481 src7, src8, src9, src10, src11, src12, src13, src14);
1482 src0_ptr += (8 * src_stride);
1483 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1484 src1_ptr += (8 * src2_stride);
1488 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1489 src76_r, src87_r, src98_r, src109_r);
1490 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1491 src1110_r, src1211_r, src1312_r, src1413_r);
1492 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1493 src1413_r, src1312_r,
1494 src8776, src10998, src12111110, src14131312);
1497 DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1498 filt0, dst10, dst32, dst54, dst76);
1499 DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1500 filt1, dst10, dst32, dst54, dst76);
1501 DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1502 filt2, filt2, dst10, dst32, dst54, dst76);
1503 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1504 filt3, filt3, dst10, dst32, dst54, dst76);
1508 weight_vec, rnd_vec, offset_vec,
1509 out0, out1, out2, out3);
1512 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
1513 dst += (8 * dst_stride);
1516 src4332 = src12111110;
1517 src6554 = src14131312;
1521 LD_SB8(src0_ptr, src_stride,
1522 src7, src8, src9, src10, src11, src12, src13, src14);
1523 src0_ptr += (8 * src_stride);
1524 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1525 src1_ptr += (8 * src2_stride);
1529 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1530 src76_r, src87_r, src98_r, src109_r);
1531 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1532 src1110_r, src1211_r, src1312_r, src1413_r);
1533 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1534 src1413_r, src1312_r,
1535 src8776, src10998, src12111110, src14131312);
1538 DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1539 filt0, dst10, dst32, dst54, dst76);
1540 DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1541 filt1, dst10, dst32, dst54, dst76);
1542 DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1543 filt2, filt2, dst10, dst32, dst54, dst76);
1544 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1545 filt3, filt3, dst10, dst32, dst54, dst76);
1549 weight_vec, rnd_vec, offset_vec,
1550 out0, out1, out2, out3);
1553 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
1556 src4332 = src12111110;
1557 src6554 = src14131312;
1564 const int16_t *src1_ptr,
1579 v16i8 src6, src7, src8, src9, src10;
1580 v8i16 in0, in1, in2, in3;
1581 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1582 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1583 v8i16 tmp0, tmp1, tmp2, tmp3;
1584 v8i16 filt0, filt1, filt2, filt3;
1585 v8i16 filter_vec, out0, out1, out2, out3;
1586 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1588 src0_ptr -= (3 * src_stride);
1589 offset = (offset0 + offset1) << rnd_val;
1590 weight0 = weight0 & 0x0000FFFF;
1591 weight = weight0 | (weight1 << 16);
1593 const_vec = __msa_ldi_w(128);
1595 offset_vec = __msa_fill_w(
offset);
1596 weight_vec = __msa_fill_w(
weight);
1597 rnd_vec = __msa_fill_w(rnd_val + 1);
1598 weight1_vec = __msa_fill_w(weight1);
1599 offset_vec += const_vec * weight1_vec;
1602 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1605 src0_ptr += (7 * src_stride);
1609 src10_r, src32_r, src54_r, src21_r);
1610 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1612 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1613 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1614 src0_ptr += (4 * src_stride);
1615 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1616 src1_ptr += (4 * src2_stride);
1619 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1620 src76_r, src87_r, src98_r, src109_r);
1622 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1623 filt0, tmp0, tmp1, tmp2, tmp3);
1624 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1625 filt1, tmp0, tmp1, tmp2, tmp3);
1626 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1627 filt2, tmp0, tmp1, tmp2, tmp3);
1628 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1629 filt3, tmp0, tmp1, tmp2, tmp3);
1633 weight_vec, rnd_vec, offset_vec,
1634 out0, out1, out2, out3);
1637 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
1638 dst += (4 * dst_stride);
1652 const int16_t *src1_ptr,
1666 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1667 v8i16 in0, in1, in2, in3;
1668 v16i8 src10_r, src32_r, src54_r, src76_r;
1669 v16i8 src21_r, src43_r, src65_r, src87_r;
1670 v8i16 tmp0, tmp1, tmp2;
1671 v16i8 src10_l, src32_l, src54_l, src76_l;
1672 v16i8 src21_l, src43_l, src65_l, src87_l;
1673 v16i8 src2110, src4332, src6554, src8776;
1674 v8i16 filt0, filt1, filt2, filt3;
1675 v8i16 out0, out1, out2, filter_vec;
1676 v4i32 dst2_r, dst2_l;
1677 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1679 src0_ptr -= (3 * src_stride);
1680 offset = (offset0 + offset1) << rnd_val;
1681 weight0 = weight0 & 0x0000FFFF;
1682 weight = weight0 | (weight1 << 16);
1684 const_vec = __msa_ldi_w(128);
1686 offset_vec = __msa_fill_w(
offset);
1687 weight_vec = __msa_fill_w(
weight);
1688 rnd_vec = __msa_fill_w(rnd_val + 1);
1689 weight1_vec = __msa_fill_w(weight1);
1690 offset_vec += const_vec * weight1_vec;
1693 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1696 src0_ptr += (7 * src_stride);
1700 src10_r, src32_r, src54_r, src21_r);
1701 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1703 src10_l, src32_l, src54_l, src21_l);
1704 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1705 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1706 src2110, src4332, src6554);
1708 for (loop_cnt = 8; loop_cnt--;) {
1709 LD_SB2(src0_ptr, src_stride, src7, src8);
1710 src0_ptr += (2 * src_stride);
1711 LD_SH2(src1_ptr, src2_stride, in0, in1);
1712 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1713 src1_ptr += (2 * src2_stride);
1714 in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1717 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1718 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1719 src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1721 DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1723 DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1724 tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1725 DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1726 tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1727 DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1728 tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1731 weight_vec, rnd_vec, offset_vec,
1735 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1736 (v8i16) weight_vec);
1737 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1738 (v8i16) weight_vec);
1741 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1743 ST_D2(out0, 0, 1,
dst, dst_stride);
1744 ST_W2(out2, 0, 1,
dst + 8, dst_stride);
1745 dst += (2 * dst_stride);
1762 const int16_t *src1_ptr,
1775 const uint8_t *src0_ptr_tmp;
1776 const int16_t *src1_ptr_tmp;
1778 uint32_t loop_cnt, cnt;
1780 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1781 v8i16 in0, in1, in2, in3;
1782 v16i8 src10_r, src32_r, src54_r, src76_r;
1783 v16i8 src21_r, src43_r, src65_r, src87_r;
1784 v16i8 src10_l, src32_l, src54_l, src76_l;
1785 v16i8 src21_l, src43_l, src65_l, src87_l;
1786 v8i16 tmp0, tmp1, tmp2, tmp3;
1787 v8i16 filt0, filt1, filt2, filt3;
1789 v8i16 out0, out1, out2, out3;
1790 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1792 src0_ptr -= (3 * src_stride);
1794 offset = (offset0 + offset1) << rnd_val;
1795 weight0 = weight0 & 0x0000FFFF;
1796 weight = weight0 | (weight1 << 16);
1798 const_vec = __msa_ldi_w(128);
1800 offset_vec = __msa_fill_w(
offset);
1801 weight_vec = __msa_fill_w(
weight);
1802 rnd_vec = __msa_fill_w(rnd_val + 1);
1803 weight1_vec = __msa_fill_w(weight1);
1804 offset_vec += const_vec * weight1_vec;
1807 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1809 for (cnt = (
width >> 4); cnt--;) {
1810 src0_ptr_tmp = src0_ptr;
1811 src1_ptr_tmp = src1_ptr;
1814 LD_SB7(src0_ptr_tmp, src_stride,
1816 src0_ptr_tmp += (7 * src_stride);
1820 src10_r, src32_r, src54_r, src21_r);
1821 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1823 src10_l, src32_l, src54_l, src21_l);
1824 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1826 for (loop_cnt = (
height >> 1); loop_cnt--;) {
1827 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1828 src0_ptr_tmp += (2 * src_stride);
1829 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1830 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1831 src1_ptr_tmp += (2 * src2_stride);
1834 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1835 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1837 DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1838 filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1839 DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1840 filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1841 DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1842 filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1843 DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1844 filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1848 weight_vec, rnd_vec, offset_vec,
1849 out0, out1, out2, out3);
1852 ST_SH2(out0, out1, dst_tmp, dst_stride);
1853 dst_tmp += (2 * dst_stride);
1878 const int16_t *src1_ptr,
1891 src1_ptr, src2_stride,
1893 weight0, weight1, offset0, offset1,
1899 const int16_t *src1_ptr,
1912 src1_ptr, src2_stride,
1914 weight0, weight1, offset0, offset1,
1917 src1_ptr + 16, src2_stride,
1919 weight0, weight1, offset0, offset1, rnd_val);
1924 const int16_t *src1_ptr,
1937 src1_ptr, src2_stride,
1939 weight0, weight1, offset0, offset1,
1945 const int16_t *src1_ptr,
1958 src1_ptr, src2_stride,
1960 weight0, weight1, offset0, offset1,
1966 const int16_t *src1_ptr,
1979 src1_ptr, src2_stride,
1981 weight0, weight1, offset0, offset1,
1987 const int16_t *src1_ptr,
1991 const int8_t *filter_x,
1992 const int8_t *filter_y,
2004 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
2005 v8i16 in0 = { 0 }, in1 = { 0 };
2006 v8i16 filt0, filt1, filt2, filt3;
2007 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2008 v16i8 mask1, mask2, mask3;
2009 v8i16 filter_vec, weight_vec;
2010 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2011 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2012 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
2013 v8i16 tmp0, tmp1, tmp2, tmp3;
2014 v8i16 dst10, dst32, dst54, dst76;
2015 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
2016 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2019 src0_ptr -= ((3 * src_stride) + 3);
2021 filter_vec =
LD_SH(filter_x);
2022 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2024 filter_vec =
LD_SH(filter_y);
2027 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2033 offset = (offset0 + offset1) << rnd_val;
2034 weight0 = weight0 & 0x0000FFFF;
2035 weight = weight0 | (weight1 << 16);
2037 const_vec = __msa_fill_w((128 * weight1));
2039 offset_vec = __msa_fill_w(
offset);
2040 rnd_vec = __msa_fill_w(rnd_val + 1);
2041 offset_vec += const_vec;
2042 weight_vec = (v8i16) __msa_fill_w(
weight);
2045 src0_ptr += (7 * src_stride);
2049 VSHF_B4_SB(
src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2050 VSHF_B4_SB(
src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2052 vec8, vec9, vec10, vec11);
2053 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
2054 vec12, vec13, vec14, vec15);
2069 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2071 for (loop_cnt =
height >> 2; loop_cnt--;) {
2072 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2073 src0_ptr += (4 * src_stride);
2076 LD2(src1_ptr, src2_stride, tp0, tp1);
2078 src1_ptr += (2 * src2_stride);
2079 LD2(src1_ptr, src2_stride, tp0, tp1);
2081 src1_ptr += (2 * src2_stride);
2083 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2084 vec0, vec1, vec2, vec3);
2085 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2086 vec4, vec5, vec6, vec7);
2092 dst76 = __msa_ilvr_h(dst97, dst66);
2094 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2095 dst98 = __msa_ilvr_h(dst66, dst108);
2097 dst0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2099 dst1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2101 dst2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2103 dst3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2105 SRA_4V(dst0, dst1, dst2, dst3, 6);
2109 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2110 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2111 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2112 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2116 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2118 dst += (4 * dst_stride);
2126 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2132 const int16_t *src1_ptr,
2136 const int8_t *filter_x,
2137 const int8_t *filter_y,
2146 uint32_t loop_cnt, cnt;
2148 const uint8_t *src0_ptr_tmp;
2149 const int16_t *src1_ptr_tmp;
2152 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
2154 v8i16 filt0, filt1, filt2, filt3;
2155 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2157 v16i8 mask1, mask2, mask3;
2158 v8i16 filter_vec, weight_vec;
2159 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2160 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2161 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2162 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2163 v8i16 tmp0, tmp1, tmp2, tmp3;
2164 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2165 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2166 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2167 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2168 v4i32 offset_vec, rnd_vec, const_vec;
2170 src0_ptr -= ((3 * src_stride) + 3);
2172 offset = (offset0 + offset1) << rnd_val;
2173 weight0 = weight0 & 0x0000FFFF;
2174 weight = weight0 | (weight1 << 16);
2176 const_vec = __msa_fill_w((128 * weight1));
2178 offset_vec = __msa_fill_w(
offset);
2179 rnd_vec = __msa_fill_w(rnd_val + 1);
2180 offset_vec += const_vec;
2181 weight_vec = (v8i16) __msa_fill_w(
weight);
2183 filter_vec =
LD_SH(filter_x);
2184 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2186 filter_vec =
LD_SH(filter_y);
2189 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2195 for (cnt = width8mult; cnt--;) {
2196 src0_ptr_tmp = src0_ptr;
2197 src1_ptr_tmp = src1_ptr;
2200 LD_SB7(src0_ptr_tmp, src_stride,
2202 src0_ptr_tmp += (7 * src_stride);
2208 vec0, vec1, vec2, vec3);
2210 vec4, vec5, vec6, vec7);
2212 vec8, vec9, vec10, vec11);
2213 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2214 vec12, vec13, vec14, vec15);
2226 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2227 vec0, vec1, vec2, vec3);
2228 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2229 vec4, vec5, vec6, vec7);
2230 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2231 vec8, vec9, vec10, vec11);
2240 for (loop_cnt =
height >> 1; loop_cnt--;) {
2241 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2243 src0_ptr_tmp += 2 * src_stride;
2245 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2246 src1_ptr_tmp += (2 * src2_stride);
2248 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2249 dst32_r, dst54_r, dst21_r);
2250 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2251 dst32_l, dst54_l, dst21_l);
2252 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2253 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2255 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2256 vec0, vec1, vec2, vec3);
2262 filt_h0, filt_h1, filt_h2, filt_h3);
2264 filt_h0, filt_h1, filt_h2, filt_h3);
2270 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2271 vec0, vec1, vec2, vec3);
2277 filt_h0, filt_h1, filt_h2, filt_h3);
2279 filt_h0, filt_h1, filt_h2, filt_h3);
2284 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2287 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2288 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2289 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2290 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2291 SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2293 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2294 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2295 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
2296 dst_tmp += (2 * dst_stride);
2315 const int16_t *src1_ptr,
2319 const int8_t *filter_x,
2320 const int8_t *filter_y,
2329 src1_ptr, src2_stride,
2330 dst, dst_stride, filter_x, filter_y,
2331 height, weight0, weight1, offset0,
2332 offset1, rnd_val, 1);
2337 const int16_t *src1_ptr,
2341 const int8_t *filter_x,
2342 const int8_t *filter_y,
2351 const uint8_t *src0_ptr_tmp;
2353 const int16_t *src1_ptr_tmp;
2357 v16i8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
2358 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2359 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2360 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2361 v8i16 in0 = { 0 }, in1 = { 0 };
2362 v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2363 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2364 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2365 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2366 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2367 v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2368 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2369 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2371 src0_ptr -= ((3 * src_stride) + 3);
2373 offset = (offset0 + offset1) << rnd_val;
2374 weight0 = weight0 & 0x0000FFFF;
2375 weight = weight0 | (weight1 << 16);
2377 const_vec = __msa_fill_w((128 * weight1));
2379 offset_vec = __msa_fill_w(
offset);
2380 rnd_vec = __msa_fill_w(rnd_val + 1);
2381 offset_vec += const_vec;
2382 weight_vec = (v8i16) __msa_fill_w(
weight);
2384 filter_vec =
LD_SH(filter_x);
2385 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2387 filter_vec =
LD_SH(filter_y);
2390 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2397 src0_ptr_tmp = src0_ptr;
2398 src1_ptr_tmp = src1_ptr;
2402 src0_ptr_tmp += (7 * src_stride);
2409 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2419 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2420 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2421 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2430 for (loop_cnt = 8; loop_cnt--;) {
2431 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2432 src0_ptr_tmp += (2 * src_stride);
2435 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2436 src1_ptr_tmp += (2 * src2_stride);
2438 ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2439 dst10_r, dst32_r, dst54_r, dst21_r);
2440 ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2441 dst10_l, dst32_l, dst54_l, dst21_l);
2442 ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2443 ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2445 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2451 dst0 =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2452 filt_h1, filt_h2, filt_h3);
2453 dst1 =
HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2454 filt_h1, filt_h2, filt_h3);
2458 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2464 dst2 =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2465 filt_h1, filt_h2, filt_h3);
2466 dst3 =
HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2467 filt_h1, filt_h2, filt_h3);
2474 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2475 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2476 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2477 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2481 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2482 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
2483 dst_tmp += (2 * dst_stride);
2504 src0_ptr += (7 * src_stride);
2507 VSHF_B4_SB(
src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2508 VSHF_B4_SB(
src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2509 VSHF_B4_SB(
src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2511 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2525 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2527 for (loop_cnt = 4; loop_cnt--;) {
2528 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2529 src0_ptr += (4 * src_stride);
2532 LD2(src1_ptr, src2_stride, tp0, tp1);
2534 src1_ptr += (2 * src2_stride);
2535 LD2(src1_ptr, src2_stride, tp0, tp1);
2537 src1_ptr += (2 * src2_stride);
2539 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2541 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2548 dst76 = __msa_ilvr_h(dst97, dst66);
2550 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2551 dst98 = __msa_ilvr_h(dst66, dst108);
2553 dst0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2555 dst1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2557 dst2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2559 dst3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2561 SRA_4V(dst0, dst1, dst2, dst3, 6);
2565 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2566 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2567 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2568 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2572 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2574 dst += (4 * dst_stride);
2582 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2588 const int16_t *src1_ptr,
2592 const int8_t *filter_x,
2593 const int8_t *filter_y,
2602 src1_ptr, src2_stride,
2603 dst, dst_stride, filter_x, filter_y,
2604 height, weight0, weight1, offset0,
2605 offset1, rnd_val, 2);
2610 const int16_t *src1_ptr,
2614 const int8_t *filter_x,
2615 const int8_t *filter_y,
2624 src1_ptr, src2_stride,
2625 dst, dst_stride, filter_x, filter_y,
2626 height, weight0, weight1, offset0,
2627 offset1, rnd_val, 3);
2632 const int16_t *src1_ptr,
2636 const int8_t *filter_x,
2637 const int8_t *filter_y,
2646 src1_ptr, src2_stride,
2647 dst, dst_stride, filter_x, filter_y,
2648 height, weight0, weight1, offset0,
2649 offset1, rnd_val, 4);
2654 const int16_t *src1_ptr,
2658 const int8_t *filter_x,
2659 const int8_t *filter_y,
2668 src1_ptr, src2_stride,
2669 dst, dst_stride, filter_x, filter_y,
2670 height, weight0, weight1, offset0,
2671 offset1, rnd_val, 6);
2676 const int16_t *src1_ptr,
2680 const int8_t *filter_x,
2681 const int8_t *filter_y,
2690 src1_ptr, src2_stride,
2691 dst, dst_stride, filter_x, filter_y,
2692 height, weight0, weight1, offset0,
2693 offset1, rnd_val, 8);
2698 const int16_t *src1_ptr,
2714 v16i8 mask1, vec0, vec1;
2716 v4i32 dst0_r, dst0_l;
2717 v8i16 out0, filter_vec;
2718 v4i32 weight_vec, offset_vec, rnd_vec;
2727 offset = (offset0 + offset1) << rnd_val;
2728 weight0 = weight0 & 0x0000FFFF;
2729 weight = weight0 | (weight1 << 16);
2730 constant = 128 * weight1;
2734 offset_vec = __msa_fill_w(
offset);
2735 weight_vec = __msa_fill_w(
weight);
2736 rnd_vec = __msa_fill_w(rnd_val + 1);
2739 LD_SH2(src1_ptr, src2_stride, in0, in1);
2740 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2747 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2748 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2750 out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2752 out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2753 ST_W2(out0, 0, 1,
dst, dst_stride);
2758 const int16_t *src1_ptr,
2776 v8i16 in0, in1, in2, in3;
2778 v4i32 weight_vec, offset_vec, rnd_vec;
2788 offset = (offset0 + offset1) << rnd_val;
2789 weight0 = weight0 & 0x0000FFFF;
2790 weight = weight0 | (weight1 << 16);
2791 constant = 128 * weight1;
2795 offset_vec = __msa_fill_w(
offset);
2796 weight_vec = __msa_fill_w(
weight);
2797 rnd_vec = __msa_fill_w(rnd_val + 1);
2801 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2809 weight_vec, rnd_vec, offset_vec,
2812 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2813 ST_W4(dst0, 0, 1, 2, 3,
dst, dst_stride);
2818 const int16_t *src1_ptr,
2837 v8i16 dst0, dst1, dst2, dst3;
2838 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2840 v4i32 weight_vec, offset_vec, rnd_vec;
2847 offset = (offset0 + offset1) << rnd_val;
2848 weight0 = weight0 & 0x0000FFFF;
2849 weight = weight0 | (weight1 << 16);
2850 constant = 128 * weight1;
2854 offset_vec = __msa_fill_w(
offset);
2855 weight_vec = __msa_fill_w(
weight);
2856 rnd_vec = __msa_fill_w(rnd_val + 1);
2860 for (loop_cnt = (
height >> 3); loop_cnt--;) {
2861 LD_SB8(src0_ptr, src_stride,
2863 src0_ptr += (8 * src_stride);
2864 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2865 src1_ptr += (4 * src2_stride);
2866 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2867 src1_ptr += (4 * src2_stride);
2876 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2878 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2882 weight_vec, rnd_vec, offset_vec,
2883 dst0, dst1, dst2, dst3);
2886 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3,
dst, dst_stride);
2887 dst += (8 * dst_stride);
2893 const int16_t *src1_ptr,
2908 weight0, weight1, offset0, offset1, rnd_val);
2909 }
else if (4 ==
height) {
2912 weight0, weight1, offset0, offset1, rnd_val);
2913 }
else if (0 == (
height % 8)) {
2915 src1_ptr, src2_stride,
2917 weight0, weight1, offset0, offset1,
2924 const int16_t *src1_ptr,
2943 v8i16 in0, in1, in2, in3;
2944 v8i16 dst0, dst1, dst2, dst3;
2946 v4i32 weight_vec, offset_vec, rnd_vec;
2953 offset = (offset0 + offset1) << rnd_val;
2954 weight0 = weight0 & 0x0000FFFF;
2955 weight = weight0 | (weight1 << 16);
2956 constant = 128 * weight1;
2960 offset_vec = __msa_fill_w(
offset);
2961 weight_vec = __msa_fill_w(
weight);
2962 rnd_vec = __msa_fill_w(rnd_val + 1);
2966 for (loop_cnt = 2; loop_cnt--;) {
2968 src0_ptr += (4 * src_stride);
2969 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2970 src1_ptr += (4 * src2_stride);
2979 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2984 weight_vec, rnd_vec, offset_vec,
2985 dst0, dst1, dst2, dst3);
2988 ST_W2(dst0, 0, 2,
dst, dst_stride);
2989 ST_H2(dst0, 2, 6,
dst + 4, dst_stride);
2990 ST_W2(dst1, 0, 2,
dst + 2 * dst_stride, dst_stride);
2991 ST_H2(dst1, 2, 6,
dst + 2 * dst_stride + 4, dst_stride);
2992 dst += (4 * dst_stride);
2998 const int16_t *src1_ptr,
3014 v16i8 mask1, vec0, vec1;
3017 v4i32 weight_vec, offset_vec, rnd_vec;
3024 offset = (offset0 + offset1) << rnd_val;
3025 weight0 = weight0 & 0x0000FFFF;
3026 weight = weight0 | (weight1 << 16);
3027 constant = 128 * weight1;
3031 offset_vec = __msa_fill_w(
offset);
3032 weight_vec = __msa_fill_w(
weight);
3033 rnd_vec = __msa_fill_w(rnd_val + 1);
3038 LD_SH2(src1_ptr, src2_stride, in0, in1);
3045 weight_vec, rnd_vec, offset_vec,
3048 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3049 ST_D2(dst0, 0, 1,
dst, dst_stride);
3054 const int16_t *src1_ptr,
3068 v8i16 in0, in1, in2, in3, in4, in5;
3072 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3074 v4i32 weight_vec, offset_vec, rnd_vec;
3081 offset = (offset0 + offset1) << rnd_val;
3082 weight0 = weight0 & 0x0000FFFF;
3083 weight = weight0 | (weight1 << 16);
3084 constant = 128 * weight1;
3088 offset_vec = __msa_fill_w(
offset);
3089 weight_vec = __msa_fill_w(
weight);
3090 rnd_vec = __msa_fill_w(rnd_val + 1);
3096 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3097 src1_ptr += (4 * src2_stride);
3098 LD_SH2(src1_ptr, src2_stride, in4, in5);
3106 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3108 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3110 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3114 weight_vec, rnd_vec, offset_vec,
3115 dst0, dst1, dst2, dst3);
3117 weight_vec, rnd_vec, offset_vec,
3121 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3122 ST_D4(dst0, dst1, 0, 1, 0, 1,
dst, dst_stride);
3123 ST_D2(dst3, 0, 1,
dst + 4 * dst_stride, dst_stride);
3128 const int16_t *src1_ptr,
3147 v8i16 in0, in1, in2, in3;
3148 v8i16 dst0, dst1, dst2, dst3;
3150 v4i32 weight_vec, offset_vec, rnd_vec;
3157 offset = (offset0 + offset1) << rnd_val;
3158 weight0 = weight0 & 0x0000FFFF;
3159 weight = weight0 | (weight1 << 16);
3160 constant = 128 * weight1;
3164 offset_vec = __msa_fill_w(
offset);
3165 weight_vec = __msa_fill_w(
weight);
3166 rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

                              const int16_t *src1_ptr,
                              weight0, weight1, offset0, offset1, rnd_val);
    } else if (6 == height) {
                              weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 4)) {
                                  src1_ptr, src2_stride,
                                  weight0, weight1, offset0, offset1,

                              const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 4; loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);
                           weight_vec, rnd_vec, offset_vec,
        dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

                              const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);
        dst += (2 * dst_stride);
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);
        dst += (2 * dst_stride);

                              const int16_t *src1_ptr,
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src0_ptr + 16, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
        dst += (2 * dst_stride);

                              const int16_t *src1_ptr,
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

                              const int16_t *src1_ptr,
    v8i16 in0, in1, dst10;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v4i32 dst10_r, dst10_l;
    v8i16 filter_vec, out;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
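    /* XOR with 128 recenters the unsigned pixel bytes as signed values for
     * the signed dot-product filter; the bias this leaves in the filtered
     * result is presumably what the constant = 128 * weight1 term above,
     * scaled by the 64x tap gain, folds back into the rounding offset. */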
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    src1_ptr += (2 * src2_stride);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
    out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
    out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);

                              const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    src0_ptr += (4 * src_stride);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
                       weight_vec, rnd_vec, offset_vec,
    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

                              const int16_t *src1_ptr,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
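        /* For 4-pixel-wide columns, the ILVR_D* packs stack two adjacent
         * interleaved rows into one 128-bit register, so each filter call
         * below produces two output rows at once. */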
        src0_ptr += (2 * src_stride);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
                           weight_vec, rnd_vec, offset_vec,
                           dst10, dst32, dst54, dst76);
        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
        ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

                              const int16_t *src1_ptr,
                              weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
                              weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 8)) {
                                  src1_ptr, src2_stride,
                                  weight0, weight1, offset0, offset1,

                              const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        src0_ptr += (2 * src_stride);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        ST_W2(tmp0, 0, 2, dst, dst_stride);
        ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
        ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    src0_ptr += (2 * src_stride);
                       weight_vec, rnd_vec, offset_vec,
                       tmp0, tmp1, tmp2, tmp3);
    ST_W2(tmp0, 0, 2, dst, dst_stride);
    ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
    ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);

                              const int16_t *src1_ptr,
    v8i16 in0, in1, tmp0, tmp1;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
                       weight_vec, rnd_vec, offset_vec,
    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(tmp0, 0, 1, dst, dst_stride);

                              const int16_t *src1_ptr,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);

    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
                       weight_vec, rnd_vec, offset_vec,
                       tmp0, tmp1, tmp2, tmp3);
                       weight_vec, rnd_vec, offset_vec,
    tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);

                              const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        src0_ptr += (2 * src_stride);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

                              const int16_t *src1_ptr,
                              weight0, weight1, offset0, offset1, rnd_val);
    } else if (6 == height) {
                              weight0, weight1, offset0, offset1, rnd_val);
                                  src1_ptr, src2_stride,
                                  weight0, weight1, offset0, offset1,

                              const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= (1 * src_stride);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        src0_ptr += (2 * src_stride);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
                           weight_vec, rnd_vec, offset_vec,
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

                              const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    src0_ptr += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        dst += (2 * dst_stride);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        dst += (2 * dst_stride);

                              const int16_t *src1_ptr,
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
                           weight_vec, rnd_vec, offset_vec,
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
                           weight_vec, rnd_vec, offset_vec,
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);

                              const int16_t *src1_ptr,
    uint8_t *dst_tmp = dst + 16;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3, tmp6, tmp7);
        ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, tmp, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
    v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    LD2(src1_ptr, src2_stride, tp0, tp1);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 offset_vec, rnd_vec, const_vec;
    v4i32 dst0, dst1, dst2, dst3;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
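    /* The vertical stage of the HV path runs on the 16-bit horizontal
     * results, so its 32-bit sums carry an extra 6 bits of filter gain;
     * the >> 6 here appears to bring them back to the same 14-bit
     * intermediate scale as the src1_ptr samples before the weighted
     * blend and the final rounding shift. */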
    LD2(src1_ptr, src2_stride, tp0, tp1);
    src1_ptr += (2 * src2_stride);
    LD2(src1_ptr, src2_stride, tp0, tp1);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt_h0, filt_h1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, weight_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    src0_ptr += (3 * src_stride);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
        dst32_r = __msa_ilvr_h(dst73, dst22);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
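        /* dst106 holds the horizontal results of the two newest rows; its
         * upper half is carried over in dst22 so the next 8-row iteration
         * can extend the vertical filter chain without recomputing it. */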
                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              dst, dst_stride, filter_x, filter_y,
                              weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
                              dst, dst_stride, filter_x, filter_y,
                              weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 8)) {
                                  src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, weight0, weight1,
                                  offset0, offset1, rnd_val);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    uint32_t tpw0, tpw1, tpw2, tpw3;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 in4 = { 0 }, in5 = { 0 };
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    src0_ptr += (3 * src_stride);

    LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
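    /* Width 6: the _r lanes cover output columns 0..3 for all eight rows,
     * while the _l halves (columns 4..7) are packed here two rows per
     * register; after filtering, only columns 4..5 of that half appear to
     * be kept, stored with ST_H8 at dst + 4 below. */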
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
    LD2(src1_ptr, src2_stride, tp0, tp1);
    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    src1_ptr += (4 * src2_stride);
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SH2(src1_ptr, src2_stride, in0, in1);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    for (cnt = width8mult; cnt--;) {
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                dst0, dst1, dst2, dst3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    const uint8_t *src0_ptr_tmp;
    const int16_t *src1_ptr_tmp;
    v8i16 in0, in1, in2, in3;
    v8i16 filt_h0, filt_h1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        src0_ptr_tmp += (3 * src_stride);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, dst0, dst1, dst2, dst3);
            dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
            dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
            dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
            dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
            dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
            dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
            dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
            dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
            PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                        tmp0, tmp1, tmp2, tmp3);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
                                 src2_stride, dst, dst_stride, filter_x,
                                 filter_y, weight0, weight1, offset0,
                                 offset1, rnd_val, 1);
    } else if (6 == height) {
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 4)) {
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     weight1, offset0, offset1, rnd_val, 8);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
    const uint8_t *src0_ptr_tmp;
    const int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    filter_vec = LD_SH(filter_y);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    src0_ptr_tmp = src0_ptr;
    src1_ptr_tmp = src1_ptr;
    src0_ptr_tmp += (3 * src_stride);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);
        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

    src0_ptr += (3 * src_stride);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
        src0_ptr += (8 * src_stride);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
        dst32_r = __msa_ilvr_h(dst73, dst22);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                                 src2_stride, dst, dst_stride, filter_x,
                                 filter_y, weight0, weight1, offset0,
                                 offset1, rnd_val, 2);
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 16);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                                     src1_ptr, src2_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 24);

                              const int16_t *src1_ptr,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                                     src1_ptr, src2_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 32);
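
/* Scalar reference for the blend all of the kernels above implement, kept
 * under #if 0 purely as documentation.  This is a sketch assuming 8-bit
 * depth; ref_biwgt_pel() and its plain-C loop are illustrative only and are
 * not part of FFmpeg's API. */
#if 0
static void ref_biwgt_pel(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          const int16_t *src_16bit, ptrdiff_t src2_stride,
                          int height, int width, int denom,
                          int weight0, int weight1, int offset0, int offset1)
{
    /* log2Wd as derived in the BI_W_MC* wrappers below: denom + 14 - 8. */
    int log2Wd = denom + 14 - 8;

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            /* src is lifted by 6 bits so it shares the 14-bit scale of the
             * src_16bit intermediates, mirroring the << 6 in the copy
             * kernels (the filter kernels get the same gain from their
             * taps, which sum to 64). */
            int val = (src[x] << 6) * weight1 + src_16bit[x] * weight0 +
                      ((offset0 + offset1 + 1) << log2Wd);

            val >>= (log2Wd + 1);
            dst[x] = val < 0 ? 0 : (val > 255 ? 255 : val);
        }
        dst       += dst_stride;
        src       += src_stride;
        src_16bit += src2_stride;
    }
}
#endif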
#define BI_W_MC_COPY(WIDTH)                                                    \
void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,             \
                                                     ptrdiff_t dst_stride,     \
                                                     const uint8_t *src,       \
                                                     ptrdiff_t src_stride,     \
                                                     const int16_t *src_16bit, \
    int shift = 14 + 1 - 8;                                                    \
    int log2Wd = denom + shift - 1;                                            \
    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,    \
                                   dst, dst_stride, height,                    \
                                   weight0, weight1, offset0,                  \

#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                        const uint8_t *src,   \
                                                        const int16_t *src_16bit, \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR];                  \
    int log2Wd = denom + 14 - 8;                                               \
    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,    \
                                                MAX_PB_SIZE, dst, dst_stride,  \
                                                filter, height, weight0,       \
                                                weight1, offset0, offset1,     \

#define BI_W_MC_HV(PEL, WIDTH, TAP)                                            \
void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,             \
                                                     ptrdiff_t dst_stride,     \
                                                     const uint8_t *src,       \
                                                     ptrdiff_t src_stride,     \
                                                     const int16_t *src_16bit, \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx];                      \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my];                      \
    int log2Wd = denom + 14 - 8;                                               \
    hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,          \
                                          MAX_PB_SIZE, dst, dst_stride,        \
                                          filter_x, filter_y, height,          \
                                          weight0, weight1, offset0,           \