26     0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
 
   27     1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
 
   28     2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
 
   31     0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
 
   32     1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
 
   33     2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
 
   /*
    * AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0 .. vec5, ...)
    *
    * NOTE(review): only part of this macro is visible in this listing; the
    * tail of the parameter list and the enclosing braces are not shown.
    *
    * Visible steps, all accumulating into out1/out2:
    *   1. interleave (vec5, vec0) and feed the pair to HADD_SB2_SH,
    *   2. interleave (vec4, vec1) and feed it to DPADD_SB2_SH with the
    *      byte constant -5,
    *   3. interleave (vec3, vec2) and feed it to DPADD_SB2_SH with the
    *      byte constant 20.
    * The outer/middle/inner pairing with 1 / -5 / 20 looks like a symmetric
    * 6-tap (1, -5, 20, 20, -5, 1) filter -- TODO confirm against the
    * definitions of ILVRL_B2_SB, HADD_SB2_SH and DPADD_SB2_SH, which live
    * outside this view.
    */
   36 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \ 
   39     v16i8 tmp0_m, tmp1_m;                                                    \ 
   40     v16i8 minus5b_m = __msa_ldi_b(-5);                                       \ 
   41     v16i8 plus20b_m = __msa_ldi_b(20);                                       \ 
   43     ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \ 
   44     HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);                                 \ 
   45     ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \ 
   46     DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \ 
   47     ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \ 
   48     DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);          \ 
   /*
    * AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)
    *
    * NOTE(review): only part of this macro is visible in this listing; the
    * declarations of out0_m / tmp0_m and the line that yields the result
    * are not shown.
    *
    * Visible steps: the bytes of (in1, in0) are shuffled three times, once
    * per mask, and each shuffled vector is folded into out0_m:
    *   mask0 -> __msa_hadd_s_h of the shuffled vector with itself
    *            (starts the accumulator),
    *   mask1 -> __msa_dpadd_s_h against a vector of -5 bytes,
    *   mask2 -> __msa_dpadd_s_h against a vector of 20 bytes.
    * Presumably each mask gathers the two taps that share a coefficient of
    * a (1, -5, 20, 20, -5, 1) filter -- TODO confirm against the mask
    * tables passed by the callers.
    */
   51 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \ 
   55     v16i8 minus5b = __msa_ldi_b(-5);                       \ 
   56     v16i8 plus20b = __msa_ldi_b(20);                       \ 
   58     tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0);        \ 
   59     out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);               \ 
   61     tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0);        \ 
   62     out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);     \ 
   64     tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0);        \ 
   65     out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m);     \ 
   /*
    * AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
    *
    * NOTE(review): only part of this macro is visible in this listing; the
    * declaration of out0_m and the line that yields the result are not
    * shown.
    *
    * Visible steps: a three-term signed dot-product chain on operands cast
    * to v16i8. out0_m is initialised with __msa_dotp_s_h(in0, coeff0) and
    * then accumulated with __msa_dpadd_s_h for (in1, coeff1) and
    * (in2, coeff2). No rounding or saturation is applied in the visible
    * lines.
    */
   70 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \ 
   74     out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \ 
   75     out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \ 
   76     out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \ 
   /*
    * AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)
    *
    * NOTE(review): only part of this macro is visible in this listing; the
    * declaration of out0_m and the line that yields the result are not
    * shown.
    *
    * Visible steps: a three-term signed dot-product chain on operands cast
    * to v8i16. out0_m is initialised with __msa_dotp_s_w(in0, coeff0) and
    * then accumulated with __msa_dpadd_s_w for (in1, coeff1) and
    * (in2, coeff2). Unlike the halfword variant, the sum is then passed
    * through __msa_srari_w with a shift of 10 and __msa_sat_s_w with an
    * immediate of 7.
    * Presumably the shift of 10 undoes the gain of two cascaded filter
    * passes and the saturate clamps to the signed 8-bit range -- TODO
    * confirm against the MSA SRARI / SAT_S instruction definitions.
    */
   81 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)       \ 
   85     out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0);           \ 
   86     out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1);  \ 
   87     out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2);  \ 
   88     out0_m = __msa_srari_w(out0_m, 10);                             \ 
   89     out0_m = __msa_sat_s_w(out0_m, 7);                              \ 
   96     const int16_t filt_const0 = 0xfb01;
 
   97     const int16_t filt_const1 = 0x1414;
 
   98     const int16_t filt_const2 = 0x1fb;
 
  100     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
 
  101     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
 
  102     v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
 
  103     v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
 
  104     v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
 
  106     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
  107     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
  108     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
  112     LD_SB5(src_y, 
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
 
  115     src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
 
  116     src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
 
  117     src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
 
  118     src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
 
  122     LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  130     LD_SB4(src_y, 
stride, src_vt5, src_vt6, src_vt7, src_vt8);
 
  132     src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
 
  133     src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
 
  134     src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
 
  135     src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
 
  138     ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
 
  139     ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
 
  140     vt_out0 = 
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
 
  142     vt_out1 = 
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
 
  147     out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  148     out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  158     const int16_t filt_const0 = 0xfb01;
 
  159     const int16_t filt_const1 = 0x1414;
 
  160     const int16_t filt_const2 = 0x1fb;
 
  162     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
 
  163     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
 
  164     v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
 
  165     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
 
  166     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
 
  167     v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
 
  168     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
 
  169     v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
 
  171     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
  172     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
  173     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
  176     LD_SB5(src_y, 
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
 
  181     LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  190     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
 
  191     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
 
  193     LD_SB4(src_y, 
stride, src_vt5, src_vt6, src_vt7, src_vt8);
 
  197     ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
 
  198                src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
 
  199     ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
 
  200                src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
 
  201     vt_out0 = 
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
 
  203     vt_out1 = 
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
 
  205     vt_out2 = 
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
 
  207     vt_out3 = 
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
 
  209     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
 
  210     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
 
  212     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  213     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  214     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
 
  215     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
 
  217     LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  226     LD_SB4(src_y, 
stride, src_vt9, src_vt10, src_vt11, src_vt12);
 
  234     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
 
  235     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
 
  237     ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
 
  238                src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
 
  240     vt_out0 = 
AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
 
  242     vt_out1 = 
AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
 
  244     vt_out2 = 
AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
 
  246     vt_out3 = 
AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
 
  248     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
 
  249     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
 
  251     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  252     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  253     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
 
  254     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
 
  263                                       const uint8_t *src_y, uint8_t *dst,
 
  266     const int16_t filt_const0 = 0xfb01;
 
  267     const int16_t filt_const1 = 0x1414;
 
  268     const int16_t filt_const2 = 0x1fb;
 
  269     const uint8_t *src_x_tmp = src_x;
 
  270     const uint8_t *src_y_tmp = src_y;
 
  271     uint8_t *dst_tmp = dst;
 
  272     uint32_t multiple8_cnt, loop_cnt;
 
  274     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
 
  275     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
 
  276     v16i8 src_vt7, src_vt8;
 
  277     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
 
  278     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
 
  279     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
 
  280     v8i16 vt_out3, out0, out1, out2, out3;
 
  282     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
  283     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
  284     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
  288     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
  293         LD_SB5(src_y, 
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
 
  298         for (loop_cnt = 4; loop_cnt--;) {
 
  299             LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  307             SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
 
  308             SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
 
  310             LD_SB4(src_y, 
stride, src_vt5, src_vt6, src_vt7, src_vt8);
 
  314             ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
 
  315                        src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
 
  317             ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
 
  318                        src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
 
  320             vt_out0 = 
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
 
  322             vt_out1 = 
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
 
  324             vt_out2 = 
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
 
  326             vt_out3 = 
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
 
  328             SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
 
  329             SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
 
  331             out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  332             out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  333             out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
 
  334             out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
 
  356                                                  const uint8_t *src_y,
 
  360     uint32_t tp0, tp1, tp2, tp3;
 
  361     const int16_t filt_const0 = 0xfb01;
 
  362     const int16_t filt_const1 = 0x1414;
 
  363     const int16_t filt_const2 = 0x1fb;
 
  364     v16u8 res, dst0 = { 0 };
 
  365     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
 
  366     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
 
  367     v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
 
  368     v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
 
  369     v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
 
  371     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
  372     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
  373     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
  377     LD_SB5(src_y, 
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
 
  380     src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
 
  381     src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
 
  382     src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
 
  383     src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
 
  387     LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  395     LD_SB4(src_y, 
stride, src_vt5, src_vt6, src_vt7, src_vt8);
 
  397     src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
 
  398     src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
 
  399     src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
 
  400     src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
 
  403     ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
 
  404     ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
 
  405     vt_out0 = 
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
 
  407     vt_out1 = 
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
 
  414     res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  415     res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  419     dst0 = __msa_aver_u_b(res, dst0);
 
  425                                                  const uint8_t *src_y,
 
  429     const int16_t filt_const0 = 0xfb01;
 
  430     const int16_t filt_const1 = 0x1414;
 
  431     const int16_t filt_const2 = 0x1fb;
 
  432     uint64_t tp0, tp1, tp2, tp3;
 
  433     v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
 
  434     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
 
  435     v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
 
  436     v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
 
  437     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
 
  438     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
 
  439     v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
 
  440     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
 
  441     v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
 
  443     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
  444     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
  445     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
  448     LD_SB5(src_y, 
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
 
  453     LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  462     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
 
  463     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
 
  465     LD_SB4(src_y, 
stride, src_vt5, src_vt6, src_vt7, src_vt8);
 
  469     ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
 
  470                src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
 
  471     ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
 
  472                src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
 
  473     vt_out0 = 
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
 
  475     vt_out1 = 
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
 
  477     vt_out2 = 
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
 
  479     vt_out3 = 
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
 
  481     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
 
  482     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
 
  484     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  485     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  486     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
 
  487     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
 
  489     LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  503     LD_SB4(src_y, 
stride, src_vt9, src_vt10, src_vt11, src_vt12);
 
  511     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
 
  512     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
 
  514     ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
 
  515                src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
 
  517     vt_out0 = 
AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
 
  519     vt_out1 = 
AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
 
  521     vt_out2 = 
AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
 
  523     vt_out3 = 
AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
 
  525     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
 
  526     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
 
  528     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  529     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  530     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
 
  531     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
 
  545                                                    const uint8_t *src_y,
 
  549     const int16_t filt_const0 = 0xfb01;
 
  550     const int16_t filt_const1 = 0x1414;
 
  551     const int16_t filt_const2 = 0x1fb;
 
  552     const uint8_t *src_x_tmp = src_x;
 
  553     const uint8_t *src_y_tmp = src_y;
 
  554     uint8_t *dst_tmp = dst;
 
  555     uint32_t multiple8_cnt, loop_cnt;
 
  556     uint64_t tp0, tp1, tp2, tp3;
 
  557     v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
 
  558     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
 
  559     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
 
  560     v16i8 src_vt7, src_vt8;
 
  561     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
 
  562     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
 
  563     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
 
  564     v8i16 vt_out3, out0, out1, out2, out3;
 
  566     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
  567     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
  568     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
  572     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
  577         LD_SB5(src_y, 
stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
 
  582         for (loop_cnt = 4; loop_cnt--;) {
 
  583             LD_SB4(src_x, 
stride, src_hz0, src_hz1, src_hz2, src_hz3);
 
  591             SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
 
  592             SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
 
  594             LD_SB4(src_y, 
stride, src_vt5, src_vt6, src_vt7, src_vt8);
 
  598             ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
 
  599                        src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
 
  601             ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
 
  602                        src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
 
  604             vt_out0 = 
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
 
  606             vt_out1 = 
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
 
  608             vt_out2 = 
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
 
  610             vt_out3 = 
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
 
  612             SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
 
  613             SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
 
  615             out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
 
  616             out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
 
  617             out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
 
  618             out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
 
  648     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
 
  652     LD_UB8(
src, 
stride, src8, src9, src10, src11, src12, src13, src14, src15);
 
  656     ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, 
stride);
 
  662     uint64_t 
src0, 
src1, 
src2, src3, src4, src5, src6, src7;
 
  669     SD4(src4, src5, src6, src7, dst, 
stride);
 
  676     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
  680     LD_UB8(dst, 
stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
  682     AVER_UB4_UB(
src0, dst0, 
src1, dst1, 
src2, dst2, src3, dst3, dst0, dst1,
 
  684     AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
 
  686     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, 
stride);
 
  690     LD_UB8(dst, 
stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
  692     AVER_UB4_UB(
src0, dst0, 
src1, dst1, 
src2, dst2, src3, dst3, dst0, dst1,
 
  694     AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
 
  696     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, 
stride);
 
  702     uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
 
  703     v16u8 
src0 = { 0 }, 
src1 = { 0 }, 
src2 = { 0 }, src3 = { 0 };
 
  704     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
  721     AVER_UB4_UB(
src0, dst0, 
src1, dst1, 
src2, dst2, src3, dst3, dst0, dst1,
 
  724     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
  730     uint32_t tp0, tp1, tp2, tp3;
 
  731     v16u8 
src0 = { 0 }, dst0 = { 0 };
 
  738     dst0 = __msa_aver_u_b(
src0, dst0);
 
  747     v16i8 dst0, dst1, dst2, dst3, 
src0, 
src1, 
src2, src3, src4, src5, src6;
 
  748     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
 
  749     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
  750     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
  751     v16i8 minus5b = __msa_ldi_b(-5);
 
  752     v16i8 plus20b = __msa_ldi_b(20);
 
  760     for (loop_cnt = 4; loop_cnt--;) {
 
  777         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
 
  778         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
  779                      minus5b, res0, res1, res2, res3);
 
  780         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
  781                      plus20b, res0, res1, res2, res3);
 
  782         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
 
  783         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
 
  784         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
 
  785         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
 
  786         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
 
  787         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
 
  788         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
 
  789         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
  790                      minus5b, res4, res5, res6, res7);
 
  791         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
  792                      plus20b, res4, res5, res6, res7);
 
  801         dst0 = __msa_aver_s_b(dst0, 
src0);
 
  802         dst1 = __msa_aver_s_b(dst1, 
src2);
 
  803         dst2 = __msa_aver_s_b(dst2, src4);
 
  804         dst3 = __msa_aver_s_b(dst3, src6);
 
  815     v16i8 dst0, dst1, dst2, dst3, 
src0, 
src1, 
src2, src3, src4, src5, src6;
 
  816     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
 
  817     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
  818     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
  819     v16i8 minus5b = __msa_ldi_b(-5);
 
  820     v16i8 plus20b = __msa_ldi_b(20);
 
  828     for (loop_cnt = 4; loop_cnt--;) {
 
  845         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
 
  846         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
  847                      minus5b, res0, res1, res2, res3);
 
  848         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
  849                      plus20b, res0, res1, res2, res3);
 
  850         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
 
  851         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
 
  852         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
 
  853         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
 
  854         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
 
  855         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
 
  856         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
 
  857         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
  858                      minus5b, res4, res5, res6, res7);
 
  859         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
  860                      plus20b, res4, res5, res6, res7);
 
  869         dst0 = __msa_aver_s_b(dst0, 
src0);
 
  870         dst1 = __msa_aver_s_b(dst1, 
src2);
 
  871         dst2 = __msa_aver_s_b(dst2, src4);
 
  872         dst3 = __msa_aver_s_b(dst3, src6);
 
  882     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
  883     v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
 
  884     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
  885     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
  886     v16i8 minus5b = __msa_ldi_b(-5);
 
  887     v16i8 plus20b = __msa_ldi_b(20);
 
  894     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
 
  897     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
  898                  res0, res1, res2, res3);
 
  901     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
  902                  res0, res1, res2, res3);
 
  903     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
 
  904     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
 
  905     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
 
  906     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
 
  907     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
 
  908     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
  909                  res4, res5, res6, res7);
 
  910     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
 
  911     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 
  912     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
  913                  res4, res5, res6, res7);
 
  916     SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
 
  917                src4, src5, src6, src7);
 
  926     tmp0 = __msa_aver_s_b(tmp0, 
src0);
 
  927     tmp1 = __msa_aver_s_b(tmp1, 
src1);
 
  928     tmp2 = __msa_aver_s_b(tmp2, src4);
 
  929     tmp3 = __msa_aver_s_b(tmp3, src5);
 
  931     ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
  937     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
  938     v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
 
  939     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
  940     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
  941     v16i8 minus5b = __msa_ldi_b(-5);
 
  942     v16i8 plus20b = __msa_ldi_b(20);
 
  949     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
 
  952     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
  953                  res0, res1, res2, res3);
 
  956     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
  957                  res0, res1, res2, res3);
 
  958     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
 
  959     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
 
  960     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
 
  961     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
 
  962     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
 
  963     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
  964                  res4, res5, res6, res7);
 
  965     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
 
  966     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 
  967     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
  968                  res4, res5, res6, res7);
 
  971     SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
 
  972                src4, src5, src6, src7);
 
  981     tmp0 = __msa_aver_s_b(tmp0, 
src0);
 
  982     tmp1 = __msa_aver_s_b(tmp1, 
src1);
 
  983     tmp2 = __msa_aver_s_b(tmp2, src4);
 
  984     tmp3 = __msa_aver_s_b(tmp3, src5);
 
  986     ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
  993     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
  995     v16i8 minus5b = __msa_ldi_b(-5);
 
  996     v16i8 plus20b = __msa_ldi_b(20);
 
 1004     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
 
 1006     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
 
 1009     res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 1012     src0 = (v16i8) __msa_insve_w((v4i32) 
src0, 1, (v4i32) 
src1);
 
 1013     src1 = (v16i8) __msa_insve_w((v4i32) 
src2, 1, (v4i32) src3);
 
 1014     src0 = (v16i8) __msa_insve_d((v2i64) 
src0, 1, (v2i64) 
src1);
 
 1015     res = __msa_aver_s_b(res, 
src0);
 
 1016     res = (v16i8) __msa_xori_b((v16u8) res, 128);
 
 1023     v16i8 
src0, 
src1, 
src2, src3, res, mask0, mask1, mask2;
 
 1024     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 1026     v16i8 minus5b = __msa_ldi_b(-5);
 
 1027     v16i8 plus20b = __msa_ldi_b(20);
 
 1035     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
 
 1037     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
 
 1040     res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 1043     src0 = (v16i8) __msa_insve_w((v4i32) 
src0, 1, (v4i32) 
src1);
 
 1044     src1 = (v16i8) __msa_insve_w((v4i32) 
src2, 1, (v4i32) src3);
 
 1045     src0 = (v16i8) __msa_insve_d((v2i64) 
src0, 1, (v2i64) 
src1);
 
 1046     res = __msa_aver_s_b(res, 
src0);
 
 1047     res = (v16i8) __msa_xori_b((v16u8) res, 128);
 
 1055     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
 1056     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 1058     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 1059     v16i8 minus5b = __msa_ldi_b(-5);
 
 1060     v16i8 plus20b = __msa_ldi_b(20);
 
 1065     for (loop_cnt = 4; loop_cnt--;) {
 
 1082         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
 
 1083         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 1084                      minus5b, res0, res1, res2, res3);
 
 1085         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 1086                      plus20b, res0, res1, res2, res3);
 
 1087         VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
 
 1088         VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
 
 1089         VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
 
 1090         VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
 
 1091         VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
 
 1092         VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
 
 1093         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
 
 1094         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 1095                      minus5b, res4, res5, res6, res7);
 
 1096         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 1097                      plus20b, res4, res5, res6, res7);
 
 1102         PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
 
 1113     v16u8 out0, out1, out2, out3;
 
 1114     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
 1115     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 1117     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 1118     v16i8 minus5b = __msa_ldi_b(-5);
 
 1119     v16i8 plus20b = __msa_ldi_b(20);
 
 1126     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
 
 1129     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 1130                  res0, res1, res2, res3);
 
 1133     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
 
 1134                  plus20b, res0, res1, res2, res3);
 
 1135     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
 
 1136     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
 
 1137     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
 
 1138     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
 
 1139     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
 
 1140     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 1141                  res4, res5, res6, res7);
 
 1142     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
 
 1143     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 
 1144     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
 
 1145                  plus20b, res4, res5, res6, res7);
 
 1154     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 1162     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 1164     v16i8 minus5b = __msa_ldi_b(-5);
 
 1165     v16i8 plus20b = __msa_ldi_b(20);
 
 1173     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
 
 1175     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
 
 1186     int16_t filt_const0 = 0xfb01;
 
 1187     int16_t filt_const1 = 0x1414;
 
 1188     int16_t filt_const2 = 0x1fb;
 
 1189     v16u8 res0, res1, res2, res3;
 
 1190     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 1191     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 1192     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
 
 1193     v16i8 src65_l, src87_l, filt0, filt1, filt2;
 
 1194     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 1196     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 1197     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 1198     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 1206     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 1208     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 1211     for (loop_cnt = 4; loop_cnt--;) {
 
 1216         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
 
 1217                    src65_r, src76_r, src87_r);
 
 1218         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
 
 1219                    src65_l, src76_l, src87_l);
 
 1220         out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 1221         out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 1222         out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 1223         out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 1224         out0_l = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 1225         out1_l = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 1226         out2_l = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 1227         out3_l = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 1229         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1231         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 1232         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 1233                     out3_r, res0, res1, res2, res3);
 
 1234         res0 = (v16u8) __msa_aver_s_b((v16i8) res0, 
src2);
 
 1235         res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
 
 1236         res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
 
 1237         res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
 
 1260     int16_t filt_const0 = 0xfb01;
 
 1261     int16_t filt_const1 = 0x1414;
 
 1262     int16_t filt_const2 = 0x1fb;
 
 1263     v16u8 res0, res1, res2, res3;
 
 1264     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 1265     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 1266     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
 
 1267     v16i8 src65_l, src87_l, filt0, filt1, filt2;
 
 1268     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 1270     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 1271     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 1272     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 1280     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 1282     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 1285     for (loop_cnt = 4; loop_cnt--;) {
 
 1290         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
 
 1291                    src65_r, src76_r, src87_r);
 
 1292         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
 
 1293                    src65_l, src76_l, src87_l);
 
 1294         out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 1295         out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 1296         out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 1297         out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 1298         out0_l = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 1299         out1_l = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 1300         out2_l = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 1301         out3_l = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 1303         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1305         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 1306         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 1307                     out3_r, res0, res1, res2, res3);
 
 1308         res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
 
 1309         res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
 
 1310         res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
 
 1311         res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
 
 1332     const int16_t filt_const0 = 0xfb01;
 
 1333     const int16_t filt_const1 = 0x1414;
 
 1334     const int16_t filt_const2 = 0x1fb;
 
 1335     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1336     v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
 
 1337     v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
 
 1338     v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
 
 1339     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
 
 1341     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 1342     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 1343     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 1349     LD_SB8(
src, 
stride, src5, src6, src7, src8, src9, src10, src11, src12);
 
 1350     XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
 
 1352     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 1354     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 1356     ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
 
 1357                src109_r, src1110_r, src1211_r);
 
 1358     out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 1359     out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 1360     out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 1361     out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 1362     out4_r = 
AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
 
 1363     out5_r = 
AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
 
 1364     out6_r = 
AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
 
 1365     out7_r = 
AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
 
 1370     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1371     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 
 1372     PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
 
 1373     PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 
 1374     out0 = __msa_aver_s_b(out0, tmp0);
 
 1375     out1 = __msa_aver_s_b(out1, tmp1);
 
 1376     out2 = __msa_aver_s_b(out2, tmp2);
 
 1377     out3 = __msa_aver_s_b(out3, tmp3);
 
 1379     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 1385     const int16_t filt_const0 = 0xfb01;
 
 1386     const int16_t filt_const1 = 0x1414;
 
 1387     const int16_t filt_const2 = 0x1fb;
 
 1388     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1389     v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
 
 1390     v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
 
 1391     v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
 
 1392     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
 
 1394     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 1395     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 1396     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 1402     LD_SB8(
src, 
stride, src5, src6, src7, src8, src9, src10, src11, src12);
 
 1404     XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
 
 1405     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 1407     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 1409     ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
 
 1410                src109_r, src1110_r, src1211_r);
 
 1411     out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 1412     out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 1413     out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 1414     out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 1415     out4_r = 
AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
 
 1416     out5_r = 
AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
 
 1417     out6_r = 
AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
 
 1418     out7_r = 
AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
 
 1423     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1424     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 
 1425     PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
 
 1426     PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 
 1427     out0 = __msa_aver_s_b(out0, tmp0);
 
 1428     out1 = __msa_aver_s_b(out1, tmp1);
 
 1429     out2 = __msa_aver_s_b(out2, tmp2);
 
 1430     out3 = __msa_aver_s_b(out3, tmp3);
 
 1432     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 1438     int16_t filt_const0 = 0xfb01;
 
 1439     int16_t filt_const1 = 0x1414;
 
 1440     int16_t filt_const2 = 0x1fb;
 
 1442     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 1443     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 1444     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
 
 1447     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 1448     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 1449     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 1455     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 1457     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 
 1460     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 1462     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
 
 1464     out10 = 
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
 
 1465     out32 = 
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 
 1469     src32_r = (v16i8) __msa_insve_w((v4i32) 
src2, 1, (v4i32) src3);
 
 1470     src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
 
 1471     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
 
 1472     out = __msa_aver_u_b(
out, (v16u8) src32_r);
 
 1479     int16_t filt_const0 = 0xfb01;
 
 1480     int16_t filt_const1 = 0x1414;
 
 1481     int16_t filt_const2 = 0x1fb;
 
 1483     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 1484     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 1485     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
 
 1488     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 1489     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 1490     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 1496     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 1498     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 
 1501     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 1503     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
 
 1505     out10 = 
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
 
 1506     out32 = 
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 
 1510     src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
 
 1511     src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
 
 1512     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
 
 1513     out = __msa_aver_u_b(
out, (v16u8) src32_r);
 
 1597     uint8_t *dst_tmp = dst;
 
 1598     const uint8_t *src_tmp = 
src - (2 * 
stride) - 2;
 
 1599     uint32_t multiple8_cnt, loop_cnt;
 
 1600     const int32_t filt_const0 = 0xfffb0001;
 
 1601     const int32_t filt_const1 = 0x140014;
 
 1602     const int32_t filt_const2 = 0x1fffb;
 
 1604     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
 
 1606     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 1607     v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 1608     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 1609     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
 
 1610     v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
 
 1611     v8i16 hz_out87_l, filt0, filt1, filt2;
 
 1614     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 1615     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 1616     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 1620     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 1634         for (loop_cnt = 4; loop_cnt--;) {
 
 1645             ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 1646                        hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
 
 1648             ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 1649                        hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
 
 1651             ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 1652                        hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
 
 1654             ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 1655                        hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
 
 1662             dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1667             dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1672             dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1677             dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1679             dst1 = __msa_srari_h(hz_out2, 5);
 
 1680             dst3 = __msa_srari_h(hz_out3, 5);
 
 1681             dst5 = __msa_srari_h(hz_out4, 5);
 
 1682             dst7 = __msa_srari_h(hz_out5, 5);
 
 1685             dst0 = __msa_aver_s_h(dst0, dst1);
 
 1686             dst1 = __msa_aver_s_h(dst2, dst3);
 
 1687             dst2 = __msa_aver_s_h(dst4, dst5);
 
 1688             dst3 = __msa_aver_s_h(dst6, dst7);
 
 1710     uint8_t *dst_tmp = dst;
 
 1711     const uint8_t *src_tmp = 
src - (2 * 
stride) - 2;
 
 1712     uint32_t multiple8_cnt, loop_cnt;
 
 1713     const int32_t filt_const0 = 0xfffb0001;
 
 1714     const int32_t filt_const1 = 0x140014;
 
 1715     const int32_t filt_const2 = 0x1fffb;
 
 1717     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
 
 1719     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 1720     v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 1721     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 1722     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
 
 1723     v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
 
 1724     v8i16 hz_out87_l, filt0, filt1, filt2;
 
 1727     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 1728     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 1729     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 1733     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 1747         for (loop_cnt = 4; loop_cnt--;) {
 
 1758             ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 1759                        hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
 
 1761             ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 1762                        hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
 
 1764             ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 1765                        hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
 
 1767             ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 1768                        hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
 
 1775             dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1780             dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1785             dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1790             dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1792             dst1 = __msa_srari_h(hz_out3, 5);
 
 1793             dst3 = __msa_srari_h(hz_out4, 5);
 
 1794             dst5 = __msa_srari_h(hz_out5, 5);
 
 1795             dst7 = __msa_srari_h(hz_out6, 5);
 
 1798             dst0 = __msa_aver_s_h(dst0, dst1);
 
 1799             dst1 = __msa_aver_s_h(dst2, dst3);
 
 1800             dst2 = __msa_aver_s_h(dst4, dst5);
 
 1801             dst3 = __msa_aver_s_h(dst6, dst7);
 
 1823     const int32_t filt_const0 = 0xfffb0001;
 
 1824     const int32_t filt_const1 = 0x140014;
 
 1825     const int32_t filt_const2 = 0x1fffb;
 
 1827     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1828     v16i8 src11, src12, mask0, mask1, mask2;
 
 1829     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 1830     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
 
 1831     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 1832     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
 
 1833     v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
 
 1834     v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
 
 1835     v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
 
 1836     v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
 
 1841     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 1842     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 1843     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 1866     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 1867                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 1868     ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 1869                hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
 
 1870     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 1871                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 1872     ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 1873                hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
 
 1875     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 1877     tmp1 = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
 
 1879     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1880     tmp0 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 1882     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
 
 1884     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1885     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 1887     tmp1 = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
 
 1889     dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1890     tmp0 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 1892     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
 
 1894     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1896     SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
 
 1897     SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
 
 1899     dst0 = __msa_aver_s_h(dst0, hz_out2);
 
 1900     dst1 = __msa_aver_s_h(dst1, hz_out3);
 
 1901     dst2 = __msa_aver_s_h(dst2, hz_out4);
 
 1902     dst3 = __msa_aver_s_h(dst3, hz_out5);
 
 1915     ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 1916                hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
 
 1918     ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 1919                hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
 
 1921     tmp0 = 
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
 
 1923     tmp1 = 
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
 
 1925     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1926     tmp0 = 
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
 
 1928     tmp1 = 
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
 
 1930     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1931     tmp0 = 
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
 
 1933     tmp1 = 
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
 
 1935     dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1936     tmp0 = 
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
 
 1938     tmp1 = 
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
 
 1940     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 1942     SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
 
 1943     SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
 
 1945     dst0 = __msa_aver_s_h(dst0, hz_out6);
 
 1946     dst1 = __msa_aver_s_h(dst1, hz_out7);
 
 1947     dst2 = __msa_aver_s_h(dst2, hz_out8);
 
 1948     dst3 = __msa_aver_s_h(dst3, hz_out9);
 
 1958     const int32_t filt_const0 = 0xfffb0001;
 
 1959     const int32_t filt_const1 = 0x140014;
 
 1960     const int32_t filt_const2 = 0x1fffb;
 
 1962     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1963     v16i8 src11, src12, mask0, mask1, mask2;
 
 1964     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 1965     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
 
 1966     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 1967     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
 
 1968     v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
 
 1969     v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
 
 1970     v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
 
 1971     v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
 
 1976     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 1977     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 1978     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 2001     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 2002                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 2003     ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 2004                hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
 
 2005     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 2006                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 2007     ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 2008                hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
 
 2010     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 2012     tmp1 = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
 
 2014     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2015     tmp0 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 2017     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
 
 2019     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2020     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 2022     tmp1 = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
 
 2024     dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2025     tmp0 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 2027     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
 
 2029     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2031     SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
 
 2032     SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
 
 2034     dst0 = __msa_aver_s_h(dst0, hz_out3);
 
 2035     dst1 = __msa_aver_s_h(dst1, hz_out4);
 
 2036     dst2 = __msa_aver_s_h(dst2, hz_out5);
 
 2037     dst3 = __msa_aver_s_h(dst3, hz_out6);
 
 2050     ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 2051                hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
 
 2053     ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 2054                hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
 
 2056     tmp0 = 
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
 
 2058     tmp1 = 
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
 
 2060     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2061     tmp0 = 
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
 
 2063     tmp1 = 
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
 
 2065     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2066     tmp0 = 
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
 
 2068     tmp1 = 
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
 
 2070     dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2071     tmp0 = 
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
 
 2073     tmp1 = 
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
 
 2075     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2077     SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
 
 2078     SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
 
 2080     dst0 = __msa_aver_s_h(dst0, hz_out7);
 
 2081     dst1 = __msa_aver_s_h(dst1, hz_out8);
 
 2082     dst2 = __msa_aver_s_h(dst2, hz_out9);
 
 2083     dst3 = __msa_aver_s_h(dst3, hz_out10);
 
 2093     const int32_t filt_const0 = 0xfffb0001;
 
 2094     const int32_t filt_const1 = 0x140014;
 
 2095     const int32_t filt_const2 = 0x1fffb;
 
 2097     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 2098     v16i8 mask0, mask1, mask2;
 
 2099     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 2100     v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
 
 2101     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 2102     v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
 
 2107     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 2108     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 2109     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 2125     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
 
 2126     PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
 
 2128     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 2129                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 2130     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 2131                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 2133     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 2135     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 2137     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2138     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 2140     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 2142     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2147     dst0 = __msa_aver_s_h(dst0, hz_out2);
 
 2148     dst1 = __msa_aver_s_h(dst1, hz_out4);
 
 2157     const int32_t filt_const0 = 0xfffb0001;
 
 2158     const int32_t filt_const1 = 0x140014;
 
 2159     const int32_t filt_const2 = 0x1fffb;
 
 2161     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 2162     v16i8 mask0, mask1, mask2;
 
 2163     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 2164     v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
 
 2165     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 2166     v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
 
 2171     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 2172     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 2173     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 2189     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
 
 2190     PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
 
 2192     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 2193                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 2194     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 2195                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 2197     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 2199     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 2201     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2202     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 2204     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 2206     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2208     PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
 
 2212     dst0 = __msa_aver_s_h(dst0, hz_out0);
 
 2213     dst1 = __msa_aver_s_h(dst1, hz_out1);
 
 2223     int16_t filt_const0 = 0xfb01;
 
 2224     int16_t filt_const1 = 0x1414;
 
 2225     int16_t filt_const2 = 0x1fb;
 
 2226     v16u8 res0, res1, res2, res3;
 
 2227     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 2228     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 2229     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
 
 2230     v16i8 src65_l, src87_l, filt0, filt1, filt2;
 
 2231     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 2233     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 2234     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 2235     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 2242     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 2244     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 2247     for (loop_cnt = 4; loop_cnt--;) {
 
 2252         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
 
 2253                    src65_r, src76_r, src87_r);
 
 2254         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
 
 2255                    src65_l, src76_l, src87_l);
 
 2256         out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 2257         out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 2258         out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 2259         out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 2260         out0_l = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 2261         out1_l = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 2262         out2_l = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 2263         out3_l = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 2265         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 2267         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 2268         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 2269                     out3_r, res0, res1, res2, res3);
 
 2289     const int16_t filt_const0 = 0xfb01;
 
 2290     const int16_t filt_const1 = 0x1414;
 
 2291     const int16_t filt_const2 = 0x1fb;
 
 2292     v16u8 out0, out1, out2, out3;
 
 2293     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 2294     v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
 
 2295     v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
 
 2296     v16i8 filt0, filt1, filt2;
 
 2297     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
 
 2299     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 2300     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 2301     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 2308     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 2310     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
 
 2312     ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
 
 2313                src910_r, src1110_r, src1211_r);
 
 2317     out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
 
 2318     out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
 
 2319     out2_r = 
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
 
 2320     out3_r = 
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
 
 2321     out4_r = 
AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
 
 2322     out5_r = 
AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
 
 2323     out6_r = 
AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
 
 2324     out7_r = 
AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
 
 2327     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 2328     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 
 2333     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 2339     const int16_t filt_const0 = 0xfb01;
 
 2340     const int16_t filt_const1 = 0x1414;
 
 2341     const int16_t filt_const2 = 0x1fb;
 
 2343     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 2344     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 2345     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
 
 2348     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 2349     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 2350     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 2358     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 2360     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 2362     ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
 
 2363                src76_r, src2110, src4332, src6554, src8776);
 
 2365     out10 = 
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
 
 2366     out32 = 
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 
 2378     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 2380     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
 
 2381     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 2382     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
 
 2383     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 2384     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 2385     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 2386     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 2387     v8i16 minus5h = __msa_ldi_h(-5);
 
 2388     v8i16 plus20h = __msa_ldi_h(20);
 
 2402     for (row = 16; row--;) {
 
 2411         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2412                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 2413         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2414                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 2415         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 2416                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 2417         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 2418                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 2419         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2420         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2421         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 2422         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 2423         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 2424         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 2425         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 2426         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 2427         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 2428         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 2429         dst0 = __msa_srari_h(shf_vec2, 5);
 
 2430         dst1 = __msa_srari_h(shf_vec5, 5);
 
 2431         dst2 = __msa_srari_h(shf_vec8, 5);
 
 2432         dst3 = __msa_srari_h(shf_vec11, 5);
 
 2435         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 
 2436         dst0 = __msa_aver_s_h(dst2, dst0);
 
 2437         dst1 = __msa_aver_s_h(dst3, dst1);
 
 2460     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 2462     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
 
 2463     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 2464     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
 
 2465     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 2466     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 2467     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 2468     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 2469     v8i16 minus5h = __msa_ldi_h(-5);
 
 2470     v8i16 plus20h = __msa_ldi_h(20);
 
 2484     for (row = 16; row--;) {
 
 2493         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2494                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 2495         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2496                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 2497         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 2498                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 2499         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 2500                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 2501         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2502         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2503         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 2504         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 2505         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 2506         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 2507         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 2508         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 2509         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 2510         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 2511         dst0 = __msa_srari_h(shf_vec2, 5);
 
 2512         dst1 = __msa_srari_h(shf_vec5, 5);
 
 2513         dst2 = __msa_srari_h(shf_vec8, 5);
 
 2514         dst3 = __msa_srari_h(shf_vec11, 5);
 
 2516         dst0 = __msa_pckod_h(dst2, dst0);
 
 2517         dst1 = __msa_pckod_h(dst3, dst1);
 
 2518         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 
 2519         dst0 = __msa_aver_s_h(dst2, dst0);
 
 2520         dst1 = __msa_aver_s_h(dst3, dst1);
 
 2544     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
 
 2545     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 2546     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
 
 2547     v8i16 mask3, mask4, mask5;
 
 2548     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 2549     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 2550     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 2551     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 2552     v8i16 minus5h = __msa_ldi_h(-5);
 
 2553     v8i16 plus20h = __msa_ldi_h(20);
 
 2565     for (row = 4; row--;) {
 
 2574         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2575                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 2576         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2577                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 2578         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 2579                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 2580         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 2581                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 2582         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2583         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2584         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 2585         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 2586         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 2587         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 2588         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 2589         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 2590         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 2591         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 2592         dst0 = __msa_srari_h(shf_vec2, 5);
 
 2593         dst1 = __msa_srari_h(shf_vec5, 5);
 
 2594         dst2 = __msa_srari_h(shf_vec8, 5);
 
 2595         dst3 = __msa_srari_h(shf_vec11, 5);
 
 2598         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 
 2599         dst0 = __msa_aver_s_h(dst2, dst0);
 
 2600         dst1 = __msa_aver_s_h(dst3, dst1);
 
 2619     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
 
 2620     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 2621     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
 
 2622     v8i16 mask3, mask4, mask5;
 
 2623     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 2624     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 2625     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 2626     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 2627     v8i16 minus5h = __msa_ldi_h(-5);
 
 2628     v8i16 plus20h = __msa_ldi_h(20);
 
 2640     for (row = 4; row--;) {
 
 2649         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2650                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 2651         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2652                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 2653         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 2654                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 2655         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 2656                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 2657         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2658         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2659         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 2660         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 2661         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 2662         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 2663         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 2664         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 2665         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 2666         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 2667         dst0 = __msa_srari_h(shf_vec2, 5);
 
 2668         dst1 = __msa_srari_h(shf_vec5, 5);
 
 2669         dst2 = __msa_srari_h(shf_vec8, 5);
 
 2670         dst3 = __msa_srari_h(shf_vec11, 5);
 
 2672         dst0 = __msa_pckod_h(dst2, dst0);
 
 2673         dst1 = __msa_pckod_h(dst3, dst1);
 
 2674         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 
 2675         dst0 = __msa_aver_s_h(dst2, dst0);
 
 2676         dst1 = __msa_aver_s_h(dst3, dst1);
 
 2692     const int16_t filt_const0 = 0xfb01;
 
 2693     const int16_t filt_const1 = 0x1414;
 
 2694     const int16_t filt_const2 = 0x1fb;
 
 2696     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 2697     v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
 
 2698     v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
 
 2699     v16i8 src76_l, src87_l, filt0, filt1, filt2;
 
 2700     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
 
 2701     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 2702     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 2703     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 2704     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 2705     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 2706     v8i16 minus5h = __msa_ldi_h(-5);
 
 2707     v8i16 plus20h = __msa_ldi_h(20);
 
 2708     v8i16 zeros = { 0 };
 
 2710     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 2711     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 2712     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 2722     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 2724     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 2726     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 2728     ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
 
 2730     vt_res0 = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 2731     vt_res1 = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 2732     vt_res2 = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 2733     vt_res3 = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 2734     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2735                mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 2736     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2737                mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 2738     hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2739     DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 2740     hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2741     DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 2743     vt_res0 = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 2744     vt_res1 = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 2745     vt_res2 = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 2746     vt_res3 = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 2747     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2748                mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
 
 2749     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2750                mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
 
 2751     hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2752     DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
 
 2753     hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2754     DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
 
 2761     dst0 = __msa_srari_h(shf_vec2, 5);
 
 2762     dst1 = __msa_srari_h(shf_vec5, 5);
 
 2763     dst2 = __msa_srari_h(shf_vec6, 5);
 
 2764     dst3 = __msa_srari_h(shf_vec7, 5);
 
 2768     ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
 
 2769     ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
 
 2771     hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
 
 2772     hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
 
 2773     hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
 
 2774     hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
 
 2776     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
 
 2784     const int16_t filt_const0 = 0xfb01;
 
 2785     const int16_t filt_const1 = 0x1414;
 
 2786     const int16_t filt_const2 = 0x1fb;
 
 2788     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 2789     v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
 
 2790     v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
 
 2791     v16i8 src76_l, src87_l, filt0, filt1, filt2;
 
 2792     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
 
 2793     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 2794     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 2795     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 2796     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 2797     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 2798     v8i16 minus5h = __msa_ldi_h(-5);
 
 2799     v8i16 plus20h = __msa_ldi_h(20);
 
 2800     v8i16 zeros = { 0 };
 
 2802     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 2803     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 2804     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 2814     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 2816     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 2818     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 2820     ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
 
 2823     vt_res0 = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 2824     vt_res1 = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 2825     vt_res2 = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 2826     vt_res3 = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 2827     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2828                mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 2829     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2830                mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 2831     hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2832     DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 2833     hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2834     DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 2836     vt_res0 = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 2837     vt_res1 = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 2838     vt_res2 = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 2839     vt_res3 = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 2840     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 2841                mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
 
 2842     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 2843                mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
 
 2844     hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 2845     DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
 
 2846     hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 2847     DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
 
 2854     dst0 = __msa_srari_h(shf_vec2, 5);
 
 2855     dst1 = __msa_srari_h(shf_vec5, 5);
 
 2856     dst2 = __msa_srari_h(shf_vec6, 5);
 
 2857     dst3 = __msa_srari_h(shf_vec7, 5);
 
 2862     dst0 = __msa_ilvod_h(zeros, dst0);
 
 2863     dst1 = __msa_ilvod_h(zeros, dst1);
 
 2864     dst2 = __msa_ilvod_h(zeros, dst2);
 
 2865     dst3 = __msa_ilvod_h(zeros, dst3);
 
 2867     hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
 
 2868     hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
 
 2869     hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
 
 2870     hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
 
 2872     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
 
 2880     const int32_t filt_const0 = 0xfffb0001;
 
 2881     const int32_t filt_const1 = 0x140014;
 
 2882     const int32_t filt_const2 = 0x1fffb;
 
 2883     const uint8_t *src_tmp = 
src - (2 * 
stride) - 2;
 
 2884     uint8_t *dst_tmp = dst;
 
 2885     uint32_t multiple8_cnt, loop_cnt;
 
 2887     v16i8 
src0, 
src1, 
src2, src3, src4, mask0, mask1, mask2;
 
 2888     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 2889     v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
 
 2890     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 2891     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
 
 2892     v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
 
 2893     v8i16 hz_out87_l, filt0, filt1, filt2;
 
 2896     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 2897     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 2898     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 2902     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 2916         for (loop_cnt = 4; loop_cnt--;) {
 
 2926             ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 2927                        hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
 
 2929             ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 2930                        hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
 
 2932             ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 2933                        hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
 
 2935             ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 2936                        hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
 
 2943             dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2948             dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2953             dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2958             dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 2980     const int32_t filt_const0 = 0xfffb0001;
 
 2981     const int32_t filt_const1 = 0x140014;
 
 2982     const int32_t filt_const2 = 0x1fffb;
 
 2984     v16i8 
src0, 
src1, 
src2, src3, src4, mask0, mask1, mask2;
 
 2985     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 2986     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
 
 2987     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 2988     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
 
 2989     v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
 
 2990     v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
 
 2991     v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
 
 2992     v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
 
 2995     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 2996     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 2997     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 3019     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 3020                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 3021     ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 3022                hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
 
 3023     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 3024                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 3025     ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 3026                hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
 
 3028     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 3030     tmp1 = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
 
 3032     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3033     tmp0 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 3035     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
 
 3037     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3038     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 3040     tmp1 = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
 
 3042     dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3043     tmp0 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 3045     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
 
 3047     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3059     ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 3060                hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
 
 3062     ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 3063                hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
 
 3065     tmp0 = 
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
 
 3067     tmp1 = 
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
 
 3069     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3070     tmp0 = 
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
 
 3072     tmp1 = 
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
 
 3074     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3075     tmp0 = 
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
 
 3077     tmp1 = 
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
 
 3079     dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3080     tmp0 = 
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
 
 3082     tmp1 = 
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
 
 3084     dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3093     const int32_t filt_const0 = 0xfffb0001;
 
 3094     const int32_t filt_const1 = 0x140014;
 
 3095     const int32_t filt_const2 = 0x1fffb;
 
 3097     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 3098     v16i8 mask0, mask1, mask2;
 
 3099     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 3100     v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
 
 3101     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 3102     v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
 
 3107     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 3108     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 3109     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 3124     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
 
 3125     PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
 
 3126     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 3127                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 3128     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 3129                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 3131     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 3133     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 3135     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3136     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 3138     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 3140     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 3149     v16u8 dst0, dst1, dst2, dst3;
 
 3150     v16i8 out0, out1, out2, out3, 
src0, 
src1, 
src2, src3, src4, src5, src6;
 
 3151     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
 
 3152     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 3153     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 3154     v16i8 minus5b = __msa_ldi_b(-5);
 
 3155     v16i8 plus20b = __msa_ldi_b(20);
 
 3163     for (loop_cnt = 4; loop_cnt--;) {
 
 3181         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
 
 3182         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 3183                      minus5b, res0, res1, res2, res3);
 
 3184         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 3185                      plus20b, res0, res1, res2, res3);
 
 3186         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
 
 3187         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
 
 3188         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
 
 3189         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
 
 3190         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
 
 3191         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
 
 3192         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
 
 3193         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 3194                      minus5b, res4, res5, res6, res7);
 
 3195         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 3196                      plus20b, res4, res5, res6, res7);
 
 3205         out0 = __msa_aver_s_b(out0, 
src0);
 
 3206         out1 = __msa_aver_s_b(out1, 
src2);
 
 3207         out2 = __msa_aver_s_b(out2, src4);
 
 3208         out3 = __msa_aver_s_b(out3, src6);
 
 3221     v16u8 dst0, dst1, dst2, dst3;
 
 3222     v16i8 out0, out1, out2, out3, 
src0, 
src1, 
src2, src3, src4, src5, src6;
 
 3223     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
 
 3224     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 3225     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 3226     v16i8 minus5b = __msa_ldi_b(-5);
 
 3227     v16i8 plus20b = __msa_ldi_b(20);
 
 3235     for (loop_cnt = 4; loop_cnt--;) {
 
 3253         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
 
 3254         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 3255                      minus5b, res0, res1, res2, res3);
 
 3256         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 3257                      plus20b, res0, res1, res2, res3);
 
 3258         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
 
 3259         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
 
 3260         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
 
 3261         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
 
 3262         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
 
 3263         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
 
 3264         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
 
 3265         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 3266                      minus5b, res4, res5, res6, res7);
 
 3267         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 3268                      plus20b, res4, res5, res6, res7);
 
 3277         out0 = __msa_aver_s_b(out0, 
src0);
 
 3278         out1 = __msa_aver_s_b(out1, 
src2);
 
 3279         out2 = __msa_aver_s_b(out2, src4);
 
 3280         out3 = __msa_aver_s_b(out3, src6);
 
 3292     uint64_t tp0, tp1, tp2, tp3;
 
 3293     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 3294     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
 3295     v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
 
 3296     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 3297     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 3298     v16i8 minus5b = __msa_ldi_b(-5);
 
 3299     v16i8 plus20b = __msa_ldi_b(20);
 
 3306     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
 
 3309     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 3310                  res0, res1, res2, res3);
 
 3313     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
 3314                  res0, res1, res2, res3);
 
 3315     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
 
 3316     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
 
 3317     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
 
 3318     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
 
 3319     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
 
 3320     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 3321                  res4, res5, res6, res7);
 
 3322     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
 
 3323     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 
 3324     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
 3325                  res4, res5, res6, res7);
 
 3328     SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
 
 3329                src4, src5, src6, src7);
 
 3338     tmp0 = __msa_aver_s_b(tmp0, 
src0);
 
 3339     tmp1 = __msa_aver_s_b(tmp1, 
src1);
 
 3340     tmp2 = __msa_aver_s_b(tmp2, src4);
 
 3341     tmp3 = __msa_aver_s_b(tmp3, src5);
 
 3351     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 3357     uint64_t tp0, tp1, tp2, tp3;
 
 3358     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 3359     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
 3360     v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
 
 3361     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 3362     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 3363     v16i8 minus5b = __msa_ldi_b(-5);
 
 3364     v16i8 plus20b = __msa_ldi_b(20);
 
 3371     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
 
 3374     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 3375                  res0, res1, res2, res3);
 
 3378     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
 3379                  res0, res1, res2, res3);
 
 3380     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
 
 3381     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
 
 3382     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
 
 3383     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
 
 3384     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
 
 3385     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 3386                  res4, res5, res6, res7);
 
 3387     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
 
 3388     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 
 3389     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
 3390                  res4, res5, res6, res7);
 
 3393     SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
 
 3394                src4, src5, src6, src7);
 
 3403     tmp0 = __msa_aver_s_b(tmp0, 
src0);
 
 3404     tmp1 = __msa_aver_s_b(tmp1, 
src1);
 
 3405     tmp2 = __msa_aver_s_b(tmp2, src4);
 
 3406     tmp3 = __msa_aver_s_b(tmp3, src5);
 
 3416     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 3422     uint32_t tp0, tp1, tp2, tp3;
 
 3424     v16i8 
src0, 
src1, 
src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
 
 3425     v16i8 mask0, mask1, mask2;
 
 3427     v16i8 minus5b = __msa_ldi_b(-5);
 
 3428     v16i8 plus20b = __msa_ldi_b(20);
 
 3436     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
 
 3438     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
 
 3441     res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
 
 3444     src0 = (v16i8) __msa_insve_w((v4i32) 
src0, 1, (v4i32) 
src1);
 
 3445     src1 = (v16i8) __msa_insve_w((v4i32) 
src2, 1, (v4i32) src3);
 
 3446     src0 = (v16i8) __msa_insve_d((v2i64) 
src0, 1, (v2i64) 
src1);
 
 3447     res = __msa_aver_s_b(res, 
src0);
 
 3448     res = (v16i8) __msa_xori_b((v16u8) res, 128);
 
 3451     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 3458     uint32_t tp0, tp1, tp2, tp3;
 
 3460     v16i8 
src0, 
src1, 
src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
 
 3461     v16i8 mask0, mask1, mask2;
 
 3463     v16i8 minus5b = __msa_ldi_b(-5);
 
 3464     v16i8 plus20b = __msa_ldi_b(20);
 
 3472     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
 
 3474     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
 
 3477     res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
 
 3480     src0 = (v16i8) __msa_insve_w((v4i32) 
src0, 1, (v4i32) 
src1);
 
 3481     src1 = (v16i8) __msa_insve_w((v4i32) 
src2, 1, (v4i32) src3);
 
 3482     src0 = (v16i8) __msa_insve_d((v2i64) 
src0, 1, (v2i64) 
src1);
 
 3483     res = __msa_aver_s_b(res, 
src0);
 
 3484     res = (v16i8) __msa_xori_b((v16u8) res, 128);
 
 3487     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 3495     v16u8 dst0, dst1, dst2, dst3;
 
 3496     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
 3497     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 3499     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 3500     v16i8 minus5b = __msa_ldi_b(-5);
 
 3501     v16i8 plus20b = __msa_ldi_b(20);
 
 3506     for (loop_cnt = 4; loop_cnt--;) {
 
 3524         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
 
 3525         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 3526                      minus5b, res0, res1, res2, res3);
 
 3527         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 3528                      plus20b, res0, res1, res2, res3);
 
 3529         VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
 
 3530         VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
 
 3531         VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
 
 3532         VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
 
 3533         VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
 
 3534         VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
 
 3535         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
 
 3536         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
 
 3537                      minus5b, res4, res5, res6, res7);
 
 3538         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
 
 3539                      plus20b, res4, res5, res6, res7);
 
 3544         PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
 
 3557     uint64_t tp0, tp1, tp2, tp3;
 
 3558     v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
 
 3559     v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
 
 3560     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
 
 3561     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
 3563     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 3564     v16i8 minus5b = __msa_ldi_b(-5);
 
 3565     v16i8 plus20b = __msa_ldi_b(20);
 
 3573     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
 
 3576     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 3577                  res0, res1, res2, res3);
 
 3580     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
 3581                  res0, res1, res2, res3);
 
 3582     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
 
 3583     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
 
 3584     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
 
 3585     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
 
 3586     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
 
 3587     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
 
 3588                  res4, res5, res6, res7);
 
 3589     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
 
 3590     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
 
 3591     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
 
 3592                  res4, res5, res6, res7);
 
 3609     ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 3615     uint32_t tp0, tp1, tp2, tp3;
 
 3616     v16u8 res, dst0 = { 0 };
 
 3617     v16i8 
src0, 
src1, 
src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
 
 3618     v16i8 mask0, mask1, mask2;
 
 3620     v16i8 minus5b = __msa_ldi_b(-5);
 
 3621     v16i8 plus20b = __msa_ldi_b(20);
 
 3629     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
 
 3631     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
 
 3637     res = __msa_aver_u_b(res, dst0);
 
 3645     int16_t filt_const0 = 0xfb01;
 
 3646     int16_t filt_const1 = 0x1414;
 
 3647     int16_t filt_const2 = 0x1fb;
 
 3648     v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
 
 3649     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 3650     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 3651     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
 
 3652     v16i8 src65_l, src87_l, filt0, filt1, filt2;
 
 3653     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 3655     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 3656     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 3657     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 3665     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 3667     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 3670     for (loop_cnt = 4; loop_cnt--;) {
 
 3675         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
 
 3676                    src65_r, src76_r, src87_r);
 
 3677         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
 
 3678                    src65_l, src76_l, src87_l);
 
 3679         out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 3680         out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 3681         out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 3682         out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 3683         out0_l = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 3684         out1_l = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 3685         out2_l = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 3686         out3_l = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 3688         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 3690         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 3691         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 3692                     out3_r, res0, res1, res2, res3);
 
 3693         res0 = (v16u8) __msa_aver_s_b((v16i8) res0, 
src2);
 
 3694         res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
 
 3695         res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
 
 3696         res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
 
 3722     int16_t filt_const0 = 0xfb01;
 
 3723     int16_t filt_const1 = 0x1414;
 
 3724     int16_t filt_const2 = 0x1fb;
 
 3725     v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
 
 3726     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 3727     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 3728     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
 
 3729     v16i8 src65_l, src87_l, filt0, filt1, filt2;
 
 3730     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 3732     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 3733     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 3734     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 3742     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 3744     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 3747     for (loop_cnt = 4; loop_cnt--;) {
 
 3752         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
 
 3753                    src65_r, src76_r, src87_r);
 
 3754         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
 
 3755                    src65_l, src76_l, src87_l);
 
 3756         out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 3757         out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 3758         out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 3759         out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 3760         out0_l = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 3761         out1_l = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 3762         out2_l = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 3763         out3_l = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 3765         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 3767         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 3768         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 3769                     out3_r, res0, res1, res2, res3);
 
 3770         res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
 
 3771         res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
 
 3772         res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
 
 3773         res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
 
 3797     uint64_t tp0, tp1, tp2, tp3;
 
 3798     const int16_t filt_const0 = 0xfb01;
 
 3799     const int16_t filt_const1 = 0x1414;
 
 3800     const int16_t filt_const2 = 0x1fb;
 
 3801     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 3802     v16i8 
src0, 
src1, 
src2, src3, src4, src7, src8, src9, src10, src11, src12;
 
 3803     v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
 
 3804     v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
 
 3805     v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
 
 3806     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
 
 3808     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 3809     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 3810     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 3818     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 3820     LD_SB8(
src, 
stride, src7, src8, src9, src10, src11, src12, src13, src14);
 
 3821     XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
 
 3822     ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
 
 3823                src87_r, src98_r, src109_r);
 
 3824     out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
 
 3825     out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
 
 3826     out2_r = 
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
 
 3827     out3_r = 
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
 
 3829     ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
 
 3830                src21_r, src32_r, src43_r);
 
 3831     out4_r = 
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
 
 3832     out5_r = 
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
 
 3833     out6_r = 
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
 
 3834     out7_r = 
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
 
 3835     PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
 
 3838     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 3839     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 
 3848     PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
 
 3849     PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 
 3850     out0 = __msa_aver_s_b(out0, tmp0);
 
 3851     out1 = __msa_aver_s_b(out1, tmp1);
 
 3852     out2 = __msa_aver_s_b(out2, tmp2);
 
 3853     out3 = __msa_aver_s_b(out3, tmp3);
 
 3855     AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
 
 3857     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 3863     uint64_t tp0, tp1, tp2, tp3;
 
 3864     const int16_t filt_const0 = 0xfb01;
 
 3865     const int16_t filt_const1 = 0x1414;
 
 3866     const int16_t filt_const2 = 0x1fb;
 
 3867     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 3868     v16i8 
src0, 
src1, 
src2, src3, src4, src7, src8, src9, src10, src11, src12;
 
 3869     v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
 
 3870     v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
 
 3871     v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
 
 3872     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
 
 3874     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 3875     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 3876     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 3884     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 3886     LD_SB8(
src, 
stride, src7, src8, src9, src10, src11, src12, src13, src14);
 
 3887     XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
 
 3888     ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
 
 3889                src87_r, src98_r, src109_r);
 
 3890     out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
 
 3891     out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
 
 3892     out2_r = 
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
 
 3893     out3_r = 
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
 
 3895     ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
 
 3896                src21_r, src32_r, src43_r);
 
 3897     out4_r = 
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
 
 3898     out5_r = 
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
 
 3899     out6_r = 
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
 
 3900     out7_r = 
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
 
 3901     PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
 
 3904     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 3905     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 
 3914     PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
 
 3915     PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 
 3916     out0 = __msa_aver_s_b(out0, tmp0);
 
 3917     out1 = __msa_aver_s_b(out1, tmp1);
 
 3918     out2 = __msa_aver_s_b(out2, tmp2);
 
 3919     out3 = __msa_aver_s_b(out3, tmp3);
 
 3921     AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
 
 3923     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 3929     uint32_t tp0, tp1, tp2, tp3;
 
 3930     int16_t filt_const0 = 0xfb01;
 
 3931     int16_t filt_const1 = 0x1414;
 
 3932     int16_t filt_const2 = 0x1fb;
 
 3933     v16u8 res, dst0 = { 0 };
 
 3934     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 3935     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 3936     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
 
 3939     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 3940     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 3941     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 3947     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 3949     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 
 3952     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 3954     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
 
 3956     src32_r = (v16i8) __msa_insve_w((v4i32) 
src2, 1, (v4i32) src3);
 
 3957     src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
 
 3958     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
 
 3959     out10 = 
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
 
 3960     out32 = 
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 
 3966     res = __msa_aver_u_b(res, (v16u8) src32_r);
 
 3967     dst0 = __msa_aver_u_b(res, dst0);
 
 3974     uint32_t tp0, tp1, tp2, tp3;
 
 3975     int16_t filt_const0 = 0xfb01;
 
 3976     int16_t filt_const1 = 0x1414;
 
 3977     int16_t filt_const2 = 0x1fb;
 
 3978     v16u8 res, dst0 = { 0 };
 
 3979     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 3980     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 3981     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
 
 3984     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 3985     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 3986     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 3993     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 3995     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 
 3998     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 4000     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
 
 4002     out10 = 
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
 
 4003     out32 = 
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 
 4009     src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
 
 4010     src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
 
 4011     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
 
 4012     res = __msa_aver_u_b(res, (v16u8) src32_r);
 
 4013     dst0 = __msa_aver_u_b(res, dst0);
 
 4064                                          sizeof(uint8_t), dst, 
stride);
 
 4080                                          sizeof(uint8_t), dst, 
stride);
 
 4097                                          sizeof(uint8_t), dst, 
stride);
 
 4113                                          sizeof(uint8_t), dst, 
stride);
 
 4119     uint64_t tp0, tp1, tp2, tp3;
 
 4120     uint8_t *dst_tmp = dst;
 
 4121     const uint8_t *src_tmp = 
src - (2 * 
stride) - 2;
 
 4122     uint32_t multiple8_cnt, loop_cnt;
 
 4123     const int32_t filt_const0 = 0xfffb0001;
 
 4124     const int32_t filt_const1 = 0x140014;
 
 4125     const int32_t filt_const2 = 0x1fffb;
 
 4126     v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
 
 4127     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
 
 4129     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 4130     v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 4131     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 4132     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
 
 4133     v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
 
 4134     v8i16 hz_out87_l, filt0, filt1, filt2;
 
 4135     v4i32 tmp0_w, tmp1_w;
 
 4137     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 4138     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 4139     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 4143     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 4157         for (loop_cnt = 4; loop_cnt--;) {
 
 4164             ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 4165                        hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
 
 4167             ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 4168                        hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
 
 4170             ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
 
 4172             ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
 
 4174             tmp0_w = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
 
 4176             tmp1_w = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
 
 4178             tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4179             tmp0_w = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
 
 4181             tmp1_w = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
 
 4183             tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4185             tmp1 = __msa_srari_h(hz_out2, 5);
 
 4186             tmp3 = __msa_srari_h(hz_out3, 5);
 
 4189             tmp0 = __msa_aver_s_h(tmp0, tmp1);
 
 4190             tmp1 = __msa_aver_s_h(tmp2, tmp3);
 
 4196             dst0 = __msa_aver_u_b(out0, dst0);
 
 4206             ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
 
 4208             ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
 
 4210             tmp0_w = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
 
 4212             tmp1_w = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
 
 4214             tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4215             tmp0_w = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
 
 4217             tmp1_w = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
 
 4219             tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4221             tmp5 = __msa_srari_h(hz_out4, 5);
 
 4222             tmp7 = __msa_srari_h(hz_out5, 5);
 
 4225             tmp2 = __msa_aver_s_h(tmp4, tmp5);
 
 4226             tmp3 = __msa_aver_s_h(tmp6, tmp7);
 
 4232             dst1 = __msa_aver_u_b(out1, dst1);
 
 4251     uint64_t tp0, tp1, tp2, tp3;
 
 4252     uint8_t *dst_tmp = dst;
 
 4253     const uint8_t *src_tmp = 
src - (2 * 
stride) - 2;
 
 4254     uint32_t multiple8_cnt, loop_cnt;
 
 4255     const int32_t filt_const0 = 0xfffb0001;
 
 4256     const int32_t filt_const1 = 0x140014;
 
 4257     const int32_t filt_const2 = 0x1fffb;
 
 4258     v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
 
 4259     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
 
 4261     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 4262     v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 4263     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 4264     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
 
 4265     v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
 
 4266     v8i16 hz_out87_l, filt0, filt1, filt2;
 
 4267     v4i32 tmp0_w, tmp1_w;
 
 4269     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 4270     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 4271     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 4275     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 4289         for (loop_cnt = 4; loop_cnt--;) {
 
 4296             ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 4297                        hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
 
 4299             ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 4300                        hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
 
 4302             ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
 
 4303             ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
 
 4305             tmp0_w = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
 
 4307             tmp1_w = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
 
 4309             tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4310             tmp0_w = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
 
 4312             tmp1_w = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
 
 4314             tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4316             tmp1 = __msa_srari_h(hz_out3, 5);
 
 4317             tmp3 = __msa_srari_h(hz_out4, 5);
 
 4320             tmp0 = __msa_aver_s_h(tmp0, tmp1);
 
 4321             tmp1 = __msa_aver_s_h(tmp2, tmp3);
 
 4326             dst0 = __msa_aver_u_b(out0, dst0);
 
 4336             ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
 
 4338             ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
 
 4340             tmp0_w = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
 
 4342             tmp1_w = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
 
 4344             tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4345             tmp0_w = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
 
 4347             tmp1_w = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
 
 4349             tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4351             tmp5 = __msa_srari_h(hz_out5, 5);
 
 4352             tmp7 = __msa_srari_h(hz_out6, 5);
 
 4355             tmp2 = __msa_aver_s_h(tmp4, tmp5);
 
 4356             tmp3 = __msa_aver_s_h(tmp6, tmp7);
 
 4361             dst1 = __msa_aver_u_b(out1, dst1);
 
 4380     const int32_t filt_const0 = 0xfffb0001;
 
 4381     const int32_t filt_const1 = 0x140014;
 
 4382     const int32_t filt_const2 = 0x1fffb;
 
 4383     uint64_t tp0, tp1, tp2, tp3;
 
 4384     v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
 
 4385     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 4386     v16i8 src11, src12, mask0, mask1, mask2;
 
 4387     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 4388     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
 
 4389     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 4390     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
 
 4391     v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
 
 4392     v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
 
 4393     v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
 
 4394     v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
 
 4395     v4i32 tmp0_w, tmp1_w;
 
 4399     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 4400     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 4401     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 4424     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 4425                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 4426     ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 4427                hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
 
 4428     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 4429                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 4430     ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 4431                hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
 
 4433     tmp0_w = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 4435     tmp1_w = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
 
 4437     tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4438     tmp0_w = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 4440     tmp1_w = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
 
 4442     tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4443     tmp0_w = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 4445     tmp1_w = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
 
 4447     tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4448     tmp0_w = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 4450     tmp1_w = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
 
 4452     tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4454     SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
 
 4455     SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
 
 4461     tmp0 = __msa_aver_s_h(tmp0, hz_out2);
 
 4462     tmp1 = __msa_aver_s_h(tmp1, hz_out3);
 
 4463     tmp2 = __msa_aver_s_h(tmp2, hz_out4);
 
 4464     tmp3 = __msa_aver_s_h(tmp3, hz_out5);
 
 4478     ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 4479                hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
 
 4481     ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 4482                hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
 
 4484     tmp0_w = 
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
 
 4486     tmp1_w = 
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
 
 4488     tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4489     tmp0_w = 
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
 
 4491     tmp1_w = 
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
 
 4493     tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4494     tmp0_w = 
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
 
 4496     tmp1_w = 
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
 
 4498     tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4499     tmp0_w = 
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
 
 4501     tmp1_w = 
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
 
 4503     tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4505     SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
 
 4506     SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
 
 4512     tmp0 = __msa_aver_s_h(tmp0, hz_out6);
 
 4513     tmp1 = __msa_aver_s_h(tmp1, hz_out7);
 
 4514     tmp2 = __msa_aver_s_h(tmp2, hz_out8);
 
 4515     tmp3 = __msa_aver_s_h(tmp3, hz_out9);
 
 4526     const int32_t filt_const0 = 0xfffb0001;
 
 4527     const int32_t filt_const1 = 0x140014;
 
 4528     const int32_t filt_const2 = 0x1fffb;
 
 4529     uint64_t tp0, tp1, tp2, tp3;
 
 4530     v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
 
 4531     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 4532     v16i8 src11, src12, mask0, mask1, mask2;
 
 4533     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 4534     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
 
 4535     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 4536     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
 
 4537     v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
 
 4538     v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
 
 4539     v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
 
 4540     v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
 
 4541     v4i32 tmp0_w, tmp1_w;
 
 4545     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 4546     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 4547     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 4570     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 4571                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 4572     ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 4573                hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
 
 4574     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 4575                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 4576     ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 4577                hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
 
 4579     tmp0_w = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 4581     tmp1_w = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
 
 4583     tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4584     tmp0_w = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 4586     tmp1_w = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
 
 4588     tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4589     tmp0_w = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 4591     tmp1_w = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
 
 4593     tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4594     tmp0_w = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 4596     tmp1_w = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
 
 4598     tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4600     SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
 
 4601     SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
 
 4607     tmp0 = __msa_aver_s_h(tmp0, hz_out3);
 
 4608     tmp1 = __msa_aver_s_h(tmp1, hz_out4);
 
 4609     tmp2 = __msa_aver_s_h(tmp2, hz_out5);
 
 4610     tmp3 = __msa_aver_s_h(tmp3, hz_out6);
 
 4624     ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 4625                hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
 
 4627     ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 4628                hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
 
 4630     tmp0_w = 
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
 
 4632     tmp1_w = 
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
 
 4634     tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4635     tmp0_w = 
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
 
 4637     tmp1_w = 
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
 
 4639     tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4640     tmp0_w = 
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
 
 4642     tmp1_w = 
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
 
 4644     tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4645     tmp0_w = 
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
 
 4647     tmp1_w = 
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
 
 4649     tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
 
 4651     SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
 
 4652     SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
 
 4658     tmp0 = __msa_aver_s_h(tmp0, hz_out7);
 
 4659     tmp1 = __msa_aver_s_h(tmp1, hz_out8);
 
 4660     tmp2 = __msa_aver_s_h(tmp2, hz_out9);
 
 4661     tmp3 = __msa_aver_s_h(tmp3, hz_out10);
 
 4672     uint32_t tp0, tp1, tp2, tp3;
 
 4673     const int32_t filt_const0 = 0xfffb0001;
 
 4674     const int32_t filt_const1 = 0x140014;
 
 4675     const int32_t filt_const2 = 0x1fffb;
 
 4676     v16u8 res, 
out = { 0 };
 
 4677     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 4678     v16i8 mask0, mask1, mask2;
 
 4679     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 4680     v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
 
 4681     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 4682     v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
 
 4687     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 4688     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 4689     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 4705     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
 
 4706     PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
 
 4708     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 4709                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 4710     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 4711                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 4713     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 4715     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 4717     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 4718     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 4720     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 4722     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 4727     dst0 = __msa_aver_s_h(dst0, hz_out2);
 
 4728     dst1 = __msa_aver_s_h(dst1, hz_out4);
 
 4732     res = __msa_aver_u_b(res, 
out);
 
 4739     const int32_t filt_const0 = 0xfffb0001;
 
 4740     const int32_t filt_const1 = 0x140014;
 
 4741     const int32_t filt_const2 = 0x1fffb;
 
 4742     uint32_t tp0, tp1, tp2, tp3;
 
 4743     v16u8 res, 
out = { 0 };
 
 4744     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 4745     v16i8 mask0, mask1, mask2;
 
 4746     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 4747     v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
 
 4748     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 4749     v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
 
 4754     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 4755     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 4756     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 4772     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
 
 4773     PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
 
 4775     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 4776                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 4777     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 4778                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 4780     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 4782     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 4784     dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 4785     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 4787     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 4789     dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 4791     PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
 
 4795     dst0 = __msa_aver_s_h(dst0, hz_out0);
 
 4796     dst1 = __msa_aver_s_h(dst1, hz_out1);
 
 4800     res = __msa_aver_u_b(res, 
out);
 
 4808     int16_t filt_const0 = 0xfb01;
 
 4809     int16_t filt_const1 = 0x1414;
 
 4810     int16_t filt_const2 = 0x1fb;
 
 4811     v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
 
 4812     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 4813     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 4814     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
 
 4815     v16i8 src65_l, src87_l, filt0, filt1, filt2;
 
 4816     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 4818     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 4819     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 4820     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 4827     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 4829     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 4832     for (loop_cnt = 4; loop_cnt--;) {
 
 4837         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
 
 4838                    src65_r, src76_r, src87_r);
 
 4839         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
 
 4840                    src65_l, src76_l, src87_l);
 
 4841         out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 4842         out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 4843         out2_r = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 4844         out3_r = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 4845         out0_l = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 4846         out1_l = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 4847         out2_l = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 4848         out3_l = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 4850         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 4852         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 4854         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 4855                     out3_r, res0, res1, res2, res3);
 
 4877     uint64_t tp0, tp1, tp2, tp3;
 
 4878     const int16_t filt_const0 = 0xfb01;
 
 4879     const int16_t filt_const1 = 0x1414;
 
 4880     const int16_t filt_const2 = 0x1fb;
 
 4881     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 4882     v16u8 out0, out1, out2, out3;
 
 4883     v16i8 
src0, 
src1, 
src2, src3, src4, src7, src8, src9, src10, src109_r;
 
 4884     v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
 
 4885     v16i8 filt0, filt1, filt2;
 
 4886     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
 
 4888     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 4889     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 4890     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 4898     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 4904     ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
 
 4905                src87_r, src98_r, src109_r);
 
 4906     out0_r = 
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
 
 4907     out1_r = 
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
 
 4908     out2_r = 
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
 
 4909     out3_r = 
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
 
 4914                src21_r, src32_r, src43_r);
 
 4915     out4_r = 
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
 
 4916     out5_r = 
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
 
 4917     out6_r = 
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
 
 4918     out7_r = 
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
 
 4929     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 4930     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 
 4935     AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
 
 4937     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 4943     uint32_t tp0, tp1, tp2, tp3;
 
 4944     int16_t filt_const0 = 0xfb01;
 
 4945     int16_t filt_const1 = 0x1414;
 
 4946     int16_t filt_const2 = 0x1fb;
 
 4947     v16u8 res, dst0 = { 0 };
 
 4948     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 4949     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 4950     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
 
 4953     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 4954     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 4955     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 4961     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 4963     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 
 4966     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 4968     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
 
 4970     out10 = 
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
 
 4971     out32 = 
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 
 4977     dst0 = __msa_aver_u_b(res, dst0);
 
 4986     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 4988     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
 
 4989     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 4990     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
 
 4991     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 4992     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 4993     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 4994     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 4995     v8i16 minus5h = __msa_ldi_h(-5);
 
 4996     v8i16 plus20h = __msa_ldi_h(20);
 
 5010     for (row = 16; row--;) {
 
 5020         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5021                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 5022         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5023                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 5024         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 5025                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 5026         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 5027                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 5028         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5029         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5030         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 5031         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 5032         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 5033         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 5034         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 5035         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 5036         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 5037         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 5038         tmp0 = __msa_srari_h(shf_vec2, 5);
 
 5039         tmp1 = __msa_srari_h(shf_vec5, 5);
 
 5040         tmp2 = __msa_srari_h(shf_vec8, 5);
 
 5041         tmp3 = __msa_srari_h(shf_vec11, 5);
 
 5044         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
 
 5045         tmp0 = __msa_aver_s_h(tmp2, tmp0);
 
 5046         tmp1 = __msa_aver_s_h(tmp3, tmp1);
 
 5048         out = __msa_aver_u_b(
out, dst0);
 
 5070     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 5072     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
 
 5073     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 5074     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
 
 5075     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 5076     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 5077     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 5078     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 5079     v8i16 minus5h = __msa_ldi_h(-5);
 
 5080     v8i16 plus20h = __msa_ldi_h(20);
 
 5094     for (row = 16; row--;) {
 
 5104         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5105                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 5106         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5107                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 5108         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 5109                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 5110         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 5111                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 5112         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5113         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5114         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 5115         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 5116         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 5117         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 5118         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 5119         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 5120         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 5121         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 5122         tmp0 = __msa_srari_h(shf_vec2, 5);
 
 5123         tmp1 = __msa_srari_h(shf_vec5, 5);
 
 5124         tmp2 = __msa_srari_h(shf_vec8, 5);
 
 5125         tmp3 = __msa_srari_h(shf_vec11, 5);
 
 5127         tmp0 = __msa_pckod_h(tmp2, tmp0);
 
 5128         tmp1 = __msa_pckod_h(tmp3, tmp1);
 
 5129         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
 
 5130         tmp0 = __msa_aver_s_h(tmp2, tmp0);
 
 5131         tmp1 = __msa_aver_s_h(tmp3, tmp1);
 
 5133         out = __msa_aver_u_b(
out, dst0);
 
 5155     v16u8 
out, dst0 = { 0 };
 
 5157     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
 
 5158     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 5159     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
 
 5160     v8i16 mask3, mask4, mask5;
 
 5161     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 5162     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 5163     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 5164     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 5165     v8i16 minus5h = __msa_ldi_h(-5);
 
 5166     v8i16 plus20h = __msa_ldi_h(20);
 
 5178     for (row = 4; row--;) {
 
 5187         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5188                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 5189         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5190                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 5191         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 5192                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 5193         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 5194                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 5195         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5196         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5197         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 5198         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 5199         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 5200         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 5201         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 5202         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 5203         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 5204         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 5205         tmp0 = __msa_srari_h(shf_vec2, 5);
 
 5206         tmp1 = __msa_srari_h(shf_vec5, 5);
 
 5207         tmp2 = __msa_srari_h(shf_vec8, 5);
 
 5208         tmp3 = __msa_srari_h(shf_vec11, 5);
 
 5213         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
 
 5214         tmp0 = __msa_aver_s_h(tmp2, tmp0);
 
 5215         tmp1 = __msa_aver_s_h(tmp3, tmp1);
 
 5217         out = __msa_aver_u_b(
out, dst0);
 
 5234     v16u8 
out, dst0 = { 0 };
 
 5236     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
 
 5237     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 5238     v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
 
 5239     v8i16 mask3, mask4, mask5;
 
 5240     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 5241     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 5242     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 5243     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 5244     v8i16 minus5h = __msa_ldi_h(-5);
 
 5245     v8i16 plus20h = __msa_ldi_h(20);
 
 5257     for (row = 4; row--;) {
 
 5266         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5267                    mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 5268         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5269                    mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 5270         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
 
 5271                    mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
 
 5272         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
 
 5273                    mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 
 5274         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5275         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5276         hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
 
 5277         hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 
 5278         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 5279         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 5280         DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
 
 5281         DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 
 5282         SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
 
 5283         SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 
 5284         tmp0 = __msa_srari_h(shf_vec2, 5);
 
 5285         tmp1 = __msa_srari_h(shf_vec5, 5);
 
 5286         tmp2 = __msa_srari_h(shf_vec8, 5);
 
 5287         tmp3 = __msa_srari_h(shf_vec11, 5);
 
 5291         tmp0 = __msa_pckod_h(tmp2, tmp0);
 
 5292         tmp1 = __msa_pckod_h(tmp3, tmp1);
 
 5293         PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
 
 5294         tmp0 = __msa_aver_s_h(tmp2, tmp0);
 
 5295         tmp1 = __msa_aver_s_h(tmp3, tmp1);
 
 5297         out = __msa_aver_u_b(
out, dst0);
 
 5312     uint32_t tp0, tp1, tp2, tp3;
 
 5313     const int16_t filt_const0 = 0xfb01;
 
 5314     const int16_t filt_const1 = 0x1414;
 
 5315     const int16_t filt_const2 = 0x1fb;
 
 5316     v16u8 
out, dstv = { 0 };
 
 5317     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 5318     v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
 
 5319     v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
 
 5320     v16i8 src76_l, src87_l, filt0, filt1, filt2;
 
 5321     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
 
 5322     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 5323     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 5324     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 5325     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 5326     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 5327     v8i16 minus5h = __msa_ldi_h(-5);
 
 5328     v8i16 plus20h = __msa_ldi_h(20);
 
 5329     v8i16 zeros = { 0 };
 
 5331     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 5332     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 5333     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 5343     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 5345     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 5347     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 5349     ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
 
 5351     vt_res0 = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 5352     vt_res1 = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 5353     vt_res2 = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 5354     vt_res3 = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 5355     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5356                mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 5357     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5358                mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 5359     hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5360     DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 5361     hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5362     DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 5364     vt_res0 = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 5365     vt_res1 = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 5366     vt_res2 = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 5367     vt_res3 = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 5368     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5369                mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
 
 5370     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5371                mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
 
 5372     hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5373     DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
 
 5374     hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5375     DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
 
 5382     dst0 = __msa_srari_h(shf_vec2, 5);
 
 5383     dst1 = __msa_srari_h(shf_vec5, 5);
 
 5384     dst2 = __msa_srari_h(shf_vec6, 5);
 
 5385     dst3 = __msa_srari_h(shf_vec7, 5);
 
 5389     ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
 
 5390     ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
 
 5392     hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
 
 5393     hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
 
 5394     hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
 
 5395     hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
 
 5399     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
 
 5401     out = __msa_aver_u_b(
out, dstv);
 
 5408     uint32_t tp0, tp1, tp2, tp3;
 
 5409     const int16_t filt_const0 = 0xfb01;
 
 5410     const int16_t filt_const1 = 0x1414;
 
 5411     const int16_t filt_const2 = 0x1fb;
 
 5412     v16u8 
out, dstv = { 0 };
 
 5413     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 5414     v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
 
 5415     v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
 
 5416     v16i8 src76_l, src87_l, filt0, filt1, filt2;
 
 5417     v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
 
 5418     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
 
 5419     v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 
 5420     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
 
 5421     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
 
 5422     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
 
 5423     v8i16 minus5h = __msa_ldi_h(-5);
 
 5424     v8i16 plus20h = __msa_ldi_h(20);
 
 5425     v8i16 zeros = { 0 };
 
 5427     filt0 = (v16i8) __msa_fill_h(filt_const0);
 
 5428     filt1 = (v16i8) __msa_fill_h(filt_const1);
 
 5429     filt2 = (v16i8) __msa_fill_h(filt_const2);
 
 5439     ILVR_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_r, src21_r,
 
 5441     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 5443     ILVL_B4_SB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, src10_l, src21_l,
 
 5445     ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
 
 5447     vt_res0 = 
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
 
 5448     vt_res1 = 
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
 
 5449     vt_res2 = 
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
 
 5450     vt_res3 = 
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 
 5451     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5452                mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
 
 5453     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5454                mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 
 5455     hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5456     DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
 
 5457     hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5458     DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
 
 5460     vt_res0 = 
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
 
 5461     vt_res1 = 
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
 
 5462     vt_res2 = 
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
 
 5463     vt_res3 = 
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 
 5464     VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
 
 5465                mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
 
 5466     VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
 
 5467                mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
 
 5468     hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
 
 5469     DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
 
 5470     hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
 
 5471     DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
 
 5478     dst0 = __msa_srari_h(shf_vec2, 5);
 
 5479     dst1 = __msa_srari_h(shf_vec5, 5);
 
 5480     dst2 = __msa_srari_h(shf_vec6, 5);
 
 5481     dst3 = __msa_srari_h(shf_vec7, 5);
 
 5486     dst0 = __msa_ilvod_h(zeros, dst0);
 
 5487     dst1 = __msa_ilvod_h(zeros, dst1);
 
 5488     dst2 = __msa_ilvod_h(zeros, dst2);
 
 5489     dst3 = __msa_ilvod_h(zeros, dst3);
 
 5491     hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
 
 5492     hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
 
 5493     hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
 
 5494     hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
 
 5498     PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
 
 5500     out = __msa_aver_u_b(
out, dstv);
 
 5507     const int32_t filt_const0 = 0xfffb0001;
 
 5508     const int32_t filt_const1 = 0x140014;
 
 5509     const int32_t filt_const2 = 0x1fffb;
 
 5510     const uint8_t *src_tmp = 
src - (2 * 
stride) - 2;
 
 5511     uint8_t *dst_tmp = dst;
 
 5512     uint64_t tp0, tp1, tp2, tp3;
 
 5513     uint32_t multiple8_cnt, loop_cnt;
 
 5514     v16u8 dst0, dst1, out0, out1;
 
 5515     v16i8 
src0, 
src1, 
src2, src3, src4, mask0, mask1, mask2;
 
 5516     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 5517     v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
 
 5518     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 5519     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
 
 5520     v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
 
 5521     v8i16 hz_out87_l, filt0, filt1, filt2;
 
 5524     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 5525     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 5526     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 5530     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 5544         for (loop_cnt = 4; loop_cnt--;) {
 
 5553             ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 5554                        hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
 
 5556             ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
 
 5557                        hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
 
 5559             ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 5560                        hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
 
 5562             ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
 
 5563                        hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
 
 5570             res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5575             res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5580             res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5585             res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5611     const int32_t filt_const0 = 0xfffb0001;
 
 5612     const int32_t filt_const1 = 0x140014;
 
 5613     const int32_t filt_const2 = 0x1fffb;
 
 5614     uint64_t tp0, tp1, tp2, tp3;
 
 5615     v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
 
 5616     v16i8 
src0, 
src1, 
src2, src3, src4, mask0, mask1, mask2;
 
 5617     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 5618     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
 
 5619     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 5620     v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
 
 5621     v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
 
 5622     v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
 
 5623     v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
 
 5624     v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
 
 5627     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 5628     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 5629     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 5651     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 5652                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 5653     ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 5654                hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
 
 5655     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 5656                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 5657     ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 5658                hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
 
 5660     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 5662     tmp1 = 
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
 
 5664     res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5665     tmp0 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 5667     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
 
 5669     res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5670     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 5672     tmp1 = 
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
 
 5674     res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5675     tmp0 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 5677     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
 
 5679     res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5695     ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 5696                hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
 
 5698     ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
 
 5699                hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
 
 5701     tmp0 = 
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
 
 5703     tmp1 = 
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
 
 5705     res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5706     tmp0 = 
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
 
 5708     tmp1 = 
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
 
 5710     res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5711     tmp0 = 
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
 
 5713     tmp1 = 
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
 
 5715     res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5716     tmp0 = 
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
 
 5718     tmp1 = 
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
 
 5720     res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5733     const int32_t filt_const0 = 0xfffb0001;
 
 5734     const int32_t filt_const1 = 0x140014;
 
 5735     const int32_t filt_const2 = 0x1fffb;
 
 5736     uint32_t tp0, tp1, tp2, tp3;
 
 5737     v16u8 res, dst0 = { 0 };
 
 5738     v16i8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 5739     v16i8 mask0, mask1, mask2;
 
 5740     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 5741     v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
 
 5742     v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
 
 5743     v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
 
 5748     filt0 = (v8i16) __msa_fill_w(filt_const0);
 
 5749     filt1 = (v8i16) __msa_fill_w(filt_const1);
 
 5750     filt2 = (v8i16) __msa_fill_w(filt_const2);
 
 5765     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
 
 5766     PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
 
 5767     ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
 
 5768                hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
 
 5769     ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
 
 5770                hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
 
 5772     tmp0 = 
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
 
 5774     tmp1 = 
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
 
 5776     res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5777     tmp0 = 
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
 
 5779     tmp1 = 
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
 
 5781     res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 
 5785     res = __msa_aver_u_b(res, dst0);