25     0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
 
   26     0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
 
   27     0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
 
   28     0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
 
   29     0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
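
These rows of chroma_mask_arr are the __msa_vshf_b shuffle controls: each (i, i + 1) pair selects a source byte and its right-hand neighbour, and values of 16 and up pick bytes from the second source vector so the 4- and 8-wide kernels can pack two rows into one register. Everything in this file is a SIMD form of the H.264 chroma bilinear filter; as a reference point, a scalar sketch (illustrative only, not part of the MSA source):

    /* Scalar reference for H.264 chroma MC (spec 8.4.2.2.2).  The hz
     * kernels below are the y == 0 case, the vt kernels the x == 0 case. */
    static inline uint8_t h264_chroma_pixel(const uint8_t *src, int stride,
                                            int x, int y)
    {
        return ((8 - x) * (8 - y) * src[0] +
                x       * (8 - y) * src[1] +
                (8 - x) * y       * src[stride] +
                x       * y       * src[stride + 1] + 32) >> 6;
    }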
 
   34                                   uint32_t coeff0, uint32_t coeff1)
 
   41     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
   42     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
   43     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
   47     LD_SB2(src, src_stride, src0, src1);
 
   49     src0 = __msa_vshf_b(mask, src1, src0);
 
   50     res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
 
   52     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
   53     res_r = __msa_sat_u_h(res_r, 7);
 
   54     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
   56     out0 = __msa_copy_u_h(res, 0);
 
   57     out1 = __msa_copy_u_h(res, 2);
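
The 2x2 kernel above shows the idiom every horizontal variant repeats: __msa_ilvr_b interleaves the two weight bytes so a single __msa_dotp_u_h multiplies each (src[i], src[i + 1]) pair and sums into a 16-bit lane; __msa_srari_h is a rounding right shift, __msa_sat_u_h(.., 7) clamps to 8 bits, and __msa_pckev_b packs the even (low) bytes back together. With coeff0 = x and coeff1 = 8 - x, as the dispatchers at the end of the file pass them, one row reduces to the scalar model below (a sketch; the vector code reaches the same value through a pre-scale by 8, the SLLI_4V step seen in the wider kernels, before the shared round-by-6):

    /* Scalar model of one horizontal row (illustrative sketch). */
    static void chroma_hz_row_ref(const uint8_t *src, uint8_t *dst, int width,
                                  uint32_t coeff0, uint32_t coeff1)
    {
        for (int i = 0; i < width; i++) {
            unsigned dot = coeff1 * src[i] + coeff0 * src[i + 1]; /* dotp_u_h */
            unsigned out = (8 * dot + 32) >> 6;     /* == (dot + 4) >> 3 */
            dst[i] = out > 255 ? 255 : out;         /* sat_u_h(.., 7) */
        }
    }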
 
   66                                   uint32_t coeff0, uint32_t coeff1)
 
   72     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
   73     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
   74     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
   78     LD_UB4(src, src_stride, src0, src1, src2, src3);
 
   80     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
 
   82     src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
 
   84     res_r = __msa_dotp_u_h(src0, coeff_vec);
 
   86     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
   87     res_r = __msa_sat_u_h(res_r, 7);
 
   88     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
   95                                   uint32_t coeff0, uint32_t coeff1)
 
   97     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
  101     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  102     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  103     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  107     LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
  109     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
 
  110     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
 
  112     ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
 
  114     res_r = __msa_dotp_u_h(src0, coeff_vec);
 
  116     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  117     res_r = __msa_sat_u_h(res_r, 7);
 
  118     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  121     dst += (4 * dst_stride);
 
  123     res_r = __msa_dotp_u_h(src4, coeff_vec);
 
  125     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  126     res_r = __msa_sat_u_h(res_r, 7);
 
  127     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  134                                  uint32_t coeff0, uint32_t coeff1,
 
  139     } else if (4 == height) {
 
  141     } else if (8 == height) {
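
Reconstructed, the width-2 horizontal dispatcher around these fragments simply selects a fixed-height kernel (a sketch consistent with the 2x2/2x4/2x8 functions listed at the end of this page):

    static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     uint32_t coeff0, uint32_t coeff1,
                                     int32_t height)
    {
        if (2 == height) {
            avc_chroma_hz_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
        } else if (4 == height) {
            avc_chroma_hz_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
        } else if (8 == height) {
            avc_chroma_hz_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
        }
    }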
 
  148                                   uint32_t coeff0, uint32_t coeff1)
 
  154     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  155     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  156     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  160     LD_SB2(src, src_stride, src0, src1);
 
  162     src0 = __msa_vshf_b(mask, src1, src0);
 
  163     res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
 
  165     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  166     res_r = __msa_sat_u_h(res_r, 7);
 
  167     res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  174                                           uint32_t coeff0, uint32_t coeff1,
 
  179     v8u16 res0_r, res1_r;
 
  182     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  183     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  184     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  188     for (row = (height >> 2); row--;) {
 
  189         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  190         src += (4 * src_stride);
 
  192         VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
 
  193         DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
 
  200         PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
 
  202         ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
  203         dst += (4 * dst_stride);
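
Each iteration of this loop filters four 4-pixel rows at once: VSHF_B2_UB packs two rows of byte pairs per vector, one DOTP_UB2_UH covers all four rows, and ST4x4_UB scatters the packed bytes back as four 32-bit stores. Going by its parameter list (see the macro summary at the end of this page), a scalar model of the store (sketch):

    /* ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) (sketch):
     *     SW(__msa_copy_u_w((v4i32) in0, idx0), pdst + 0 * stride);
     *     SW(__msa_copy_u_w((v4i32) in0, idx1), pdst + 1 * stride);
     *     SW(__msa_copy_u_w((v4i32) in1, idx2), pdst + 2 * stride);
     *     SW(__msa_copy_u_w((v4i32) in1, idx3), pdst + 3 * stride);
     */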
 
  209                                  uint32_t coeff0, uint32_t coeff1,
 
  222                                  uint32_t coeff0, uint32_t coeff1,
 
  226     v16u8 src0, src1, src2, src3, out0, out1;
 
  227     v8u16 res0, res1, res2, res3;
 
  229     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  230     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  231     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  235     for (row = height >> 2; row--;) {
 
  236         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  237         src += (4 * src_stride);
 
  239         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
 
  240         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
 
  241         DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
 
  242                     coeff_vec, res0, res1, res2, res3);
 
  243         SLLI_4V(res0, res1, res2, res3, 3);
 
  247         ST8x4_UB(out0, out1, dst, dst_stride);
 
  248         dst += (4 * dst_stride);
 
  251     if (0 != (height % 4)) {
 
  252         for (row = (height % 4); row--;) {
 
  256             src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
 
  258             res0 = __msa_dotp_u_h(src0, coeff_vec);
 
  260             res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
 
  261             res0 = __msa_sat_u_h(res0, 7);
 
  262             res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
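
Two details of this 8-wide kernel are worth spelling out. Heights not divisible by four fall through to the per-row tail above, which shuffles a single source vector against itself. And the SLLI_4V(res, 3) pre-scale exists so the one-dimensional kernels can reuse the shared (+32) >> 6 rounding while still producing the spec's two-tap result:

    /* Why SLLI_4V(res, 3) precedes the SRARI round-by-6 (worked out):
     *     ((res << 3) + 32) >> 6  ==  (8 * res + 32) >> 6  ==  (res + 4) >> 3
     */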
 
  272                                   uint32_t coeff0, uint32_t coeff1)
 
  279     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  280     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  281     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  283     LD_SB3(src, src_stride, src0, src1, src2);
 
  285     ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
 
  287     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
  289     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  291     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  292     res_r = __msa_sat_u_h(res_r, 7);
 
  293     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  295     out0 = __msa_copy_u_h(res, 0);
 
  296     out1 = __msa_copy_u_h(res, 2);
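
The vertical kernels swap the in-row shuffle for ILVR_B* row interleaving: consecutive rows are zipped byte by byte, so the identical dot-product and rounding pipeline now blends vertically adjacent pixels. With coeff0 = y and coeff1 = 8 - y, a scalar model of one output (sketch):

    /* One vertical chroma output (illustrative sketch):
     *     dot = coeff1 * above[i] + coeff0 * below[i];  // ILVR_B + dotp_u_h
     *     dst[i] = (8 * dot + 32) >> 6;                 // pre-scale + round
     */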
 
  305                                   uint32_t coeff0, uint32_t coeff1)
 
  308     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  311     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  312     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  313     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  315     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
  316     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
  317                tmp0, tmp1, tmp2, tmp3);
 
  318     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
  320     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
  322     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  324     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  325     res_r = __msa_sat_u_h(res_r, 7);
 
  327     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  334                                   uint32_t coeff0, uint32_t coeff1)
 
  336     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
  337     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  340     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  341     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  342     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  344     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
  345     src += (5 * src_stride);
 
  346     LD_UB4(src, src_stride, src5, src6, src7, src8);
 
  348     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
  349                tmp0, tmp1, tmp2, tmp3);
 
  350     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
  352     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
  354     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  356     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  357     res_r = __msa_sat_u_h(res_r, 7);
 
  359     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  362     dst += (4 * dst_stride);
 
  364     ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
 
  365                tmp0, tmp1, tmp2, tmp3);
 
  366     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
  368     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
  370     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  372     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  373     res_r = __msa_sat_u_h(res_r, 7);
 
  375     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  378     dst += (4 * dst_stride);
 
  383                                  uint32_t coeff0, uint32_t coeff1,
 
  388     } else if (4 == height) {
 
  390     } else if (8 == height) {
 
  397                                   uint32_t coeff0, uint32_t coeff1)
 
  403     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  404     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  405     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  407     LD_UB3(src, src_stride, src0, src1, src2);
 
  408     ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
 
  410     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
  411     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  413     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  414     res_r = __msa_sat_u_h(res_r, 7);
 
  415     res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  422                                           uint32_t coeff0, uint32_t coeff1,
 
  427     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  428     v8u16 res0_r, res1_r;
 
  430     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  431     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  432     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  437     for (row = (height >> 2); row--;) {
 
  438         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
  439         src += (4 * src_stride);
 
  441         ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
  442                    tmp0, tmp1, tmp2, tmp3);
 
  443         ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
  444         DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
 
  451         PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
 
  453         ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
  454         dst += (4 * dst_stride);
 
  461                                  uint32_t coeff0, uint32_t coeff1,
 
  474                                  uint32_t coeff0, uint32_t coeff1,
 
  478     v16u8 src0, src1, src2, src3, src4, out0, out1;
 
  479     v8u16 res0, res1, res2, res3;
 
  480     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  481     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  482     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  487     for (row = height >> 2; row--;) {
 
  488         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
  489         src += (4 * src_stride);
 
  491         ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
  492                    src0, src1, src2, src3);
 
  493         DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
 
  494                     coeff_vec, res0, res1, res2, res3);
 
  495         SLLI_4V(res0, res1, res2, res3, 3);
 
  500         ST8x4_UB(out0, out1, dst, dst_stride);
 
  502         dst += (4 * dst_stride);
 
  509                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  510                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  514     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
  517     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  518     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  519     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  520     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  521     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  525     LD_UB3(src, src_stride, src0, src1, src2);
 
  526     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 
  527     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
  528     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  531     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  532     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  533     res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  535     out0 = __msa_copy_u_h(res_vert, 0);
 
  536     out1 = __msa_copy_u_h(res_vert, 1);
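
The hv kernels chain both stages: the horizontal dot product widens to 16 bits, MUL2 applies the vertical weights to two adjacent filtered rows, and the sum is rounded once. With coef_ver0 = y and coef_ver1 = 8 - y, a scalar model (sketch; no pre-scale is needed here, because the two 8-sum weight pairs already form the 64-sum bilinear kernel):

    /* Two-stage scalar model of the hv kernels (sketch):
     *     hz[r][i]  = (8 - x) * s[r][i] + x * s[r][i + 1];   // dotp_u_h
     *     out[r][i] = ((8 - y) * hz[r][i]                    // coeff_vt_vec1
     *                  + y * hz[r + 1][i] + 32) >> 6;        // coeff_vt_vec0
     */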
 
  545                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  546                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  549     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  550     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
  553     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  554     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  555     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  556     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  557     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  561     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
  563     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
 
  564     VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
 
  565     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
 
  566     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
  567     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  570     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  571     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  572     res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  579                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  580                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  582     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
  583     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  584     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
  587     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  588     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  589     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  590     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  591     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  595     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
  596     src += (5 * src_stride);
 
  597     LD_UB4(src, src_stride, src5, src6, src7, src8);
 
  599     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
 
  600     VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
 
  601     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
 
  602     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
 
  603     VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
 
  604     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
 
  605     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
  606     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  609     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  610     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  612     res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  615     dst += (4 * dst_stride);
 
  617     DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
  618     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  621     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  622     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  624     res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  631                                  uint32_t coef_hor0, uint32_t coef_hor1,
 
  632                                  uint32_t coef_ver0, uint32_t coef_ver1,
 
  637                               coef_hor1, coef_ver0, coef_ver1);
 
  638     } else if (4 == height) {
 
  640                               coef_hor1, coef_ver0, coef_ver1);
 
  641     } else if (8 == height) {
 
  643                               coef_hor1, coef_ver0, coef_ver1);
 
  649                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  650                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  653     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
  656     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  657     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  658     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  659     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  660     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  663     LD_UB3(src, src_stride, src0, src1, src2);
 
  664     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 
  665     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
  666     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  669     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  670     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  671     res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  686     v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
 
  687     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
  689     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  690     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  691     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  692     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  693     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  701     for (row = (height >> 2); row--;) {
 
  702         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
  703         src += (4 * src_stride);
 
  705         VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 
  706         VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
 
  707         DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
 
  708                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
 
  710         MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
 
  711              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
 
  713         ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
 
  716         PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
 
  718         ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
  719         dst += (4 * dst_stride);
 
  726                                  uint32_t coef_hor0, uint32_t coef_hor1,
 
  727                                  uint32_t coef_ver0, uint32_t coef_ver1,
 
  732                               coef_hor1, coef_ver0, coef_ver1);
 
  735                                       coef_hor0, coef_hor1, coef_ver0,
 
  742                                  uint32_t coef_hor0, uint32_t coef_hor1,
 
  743                                  uint32_t coef_ver0, uint32_t coef_ver1,
 
  747     v16u8 src0, src1, src2, src3, src4, out0, out1;
 
  748     v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
 
  749     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
  751     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  752     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  753     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  754     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  755     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  762     src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
 
  763     res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
 
  765     for (row = (height >> 2); row--;) {
 
  766         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
  767         src += (4 * src_stride);
 
  769         VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
 
  770         VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
 
  771         DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
 
  772                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
 
  774         MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
 
  775              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
 
  778         res_vt0 += (res_hz0 * coeff_vt_vec1);
 
  779         res_vt1 += (res_hz1 * coeff_vt_vec1);
 
  780         res_vt2 += (res_hz2 * coeff_vt_vec1);
 
  781         res_vt3 += (res_hz3 * coeff_vt_vec1);
 
  783         SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
  784         SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
  785         PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
 
  786         ST8x4_UB(out0, out1, dst, dst_stride);
 
  788         dst += (4 * dst_stride);
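
Note the rolling structure of this 8-wide hv loop: res_hz0 enters each iteration holding the previous row's horizontal result, so every source row is filtered horizontally exactly once. The full source carries the last horizontal result over at the bottom of the loop (that line is elided from this excerpt). Schematically:

    /* Vertical recurrence of the loop body (sketch):
     *     res_vt[r]  = hz[r + 1] * y;         // MUL4 by coeff_vt_vec0
     *     res_vt[r] += hz[r] * (8 - y);       // the four += lines above
     * hz[0] comes from the priming row filtered before the loop, or from
     * the previous iteration.
     */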
 
  796                                                uint32_t coeff0, uint32_t coeff1)
 
  799     uint32_t load0, load1;
 
  801     v16u8 dst_data = { 0 };
 
  805     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  806     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  807     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  811     LD_SB2(src, src_stride, src0, src1);
 
  814     load1 = LW(dst + dst_stride);
 
  818     src0 = __msa_vshf_b(mask, src1, src0);
 
  820     res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
 
  822     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  823     res_r = __msa_sat_u_h(res_r, 7);
 
  825     res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  826     dst_data = __msa_aver_u_b(res, dst_data);
 
  828     out0 = __msa_copy_u_h((v8i16) dst_data, 0);
 
  829     out1 = __msa_copy_u_h((v8i16) dst_data, 2);
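
The *_and_aver_dst_* variants differ from the put kernels only in this epilogue: the existing destination pixels are loaded and combined with the filtered result by __msa_aver_u_b, the rounded unsigned byte average required by the ff_avg_* entry points. Its scalar meaning matches the fallback at the very end of the file:

    /* __msa_aver_u_b, per byte (sketch):
     *     dst[i] = (dst[i] + res[i] + 1) >> 1;
     */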
 
  838                                                uint32_t coeff0, uint32_t coeff1)
 
  841     v16u8 dst0, dst1, dst2, dst3;
 
  844     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  845     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  846     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  850     LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  851     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
  853     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
 
  854     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
 
  855     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
 
  857     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
 
  859     src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
 
  861     res_r = __msa_dotp_u_h(src0, coeff_vec);
 
  863     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  864     res_r = __msa_sat_u_h(res_r, 7);
 
  866     res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  867     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
  874                                                uint32_t coeff0, uint32_t coeff1)
 
  876     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
  877     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
  878     v8u16 res0_r, res1_r;
 
  879     v16u8 res0, res1, mask;
 
  880     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  881     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  882     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  886     LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
  887     LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
  889     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
 
  890     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
 
  891     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
 
  893     dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
 
  894     dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
 
  895     dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
 
  897     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
 
  898     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
 
  899     ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
 
  900     DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);
 
  907     PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
 
  911     dst += (4 * dst_stride);
 
  917                                               uint32_t coeff0, uint32_t coeff1,
 
  923     } else if (4 == height) {
 
  926     } else if (8 == height) {
 
  934                                                uint32_t coeff0, uint32_t coeff1)
 
  936     uint32_t load0, load1;
 
  938     v16u8 dst_data = { 0 };
 
  941     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  942     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  943     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  947     LD_SB2(src, src_stride, src0, src1);
 
  950     load1 = LW(dst + dst_stride);
 
  954     src0 = __msa_vshf_b(mask, src1, src0);
 
  956     res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
 
  958     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  959     res_r = __msa_sat_u_h(res_r, 7);
 
  960     res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  961     dst_data = __msa_aver_u_b((v16u8) res, dst_data);
 
  963     ST4x2_UB(dst_data, dst, dst_stride);
 
  974     uint32_t load0, load1;
 
  979     v8u16 res0_r, res1_r;
 
  980     v16u8 res0, res1, mask;
 
  981     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  982     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  983     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  987     for (row = (height >> 2); row--;) {
 
  988         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  989         src += (4 * src_stride);
 
  992         load1 = LW(dst + dst_stride);
 
  996         load0 = LW(dst + 2 * dst_stride);
 
  997         load1 = LW(dst + 3 * dst_stride);
 
 1001         VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
 
 1002         DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
 
 1009         PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
 
 1012         ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
 
 1013         dst += (4 * dst_stride);
 
 1019                                               uint32_t coeff0, uint32_t coeff1,
 
 1028                                                    coeff0, coeff1, height);
 
 1034                                               uint32_t coeff0, uint32_t coeff1,
 
 1038     v16u8 src0, src1, src2, src3, out0, out1;
 
 1039     v8u16 res0, res1, res2, res3;
 
 1040     v16u8 dst0, dst1, dst2, dst3;
 
 1042     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1043     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1044     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1048     for (row = height >> 2; row--;) {
 
 1049         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 1050         src += (4 * src_stride);
 
 1051         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1052         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
 
 1053         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
 
 1054         DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
 
 1055                     coeff_vec, res0, res1, res2, res3);
 
 1056         SLLI_4V(res0, res1, res2, res3, 3);
 
 1062         ST8x4_UB(out0, out1, dst, dst_stride);
 
 1063         dst += (4 * dst_stride);
 
 1069                                                uint32_t coeff0, uint32_t coeff1)
 
 1071     uint16_t out0, out1;
 
 1072     uint32_t load0, load1;
 
 1073     v16i8 src0, src1, src2, tmp0, tmp1, res;
 
 1074     v16u8 dst_data = { 0 };
 
 1076     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1077     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1078     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1080     LD_SB3(src, src_stride, src0, src1, src2);
 
 1082     load1 = LW(dst + dst_stride);
 
 1086     ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
 
 1088     tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
 1089     res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
 
 1091     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1092     res_r = __msa_sat_u_h(res_r, 7);
 
 1093     res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1094     dst_data = __msa_aver_u_b((v16u8) res, dst_data);
 
 1095     out0 = __msa_copy_u_h((v8i16) dst_data, 0);
 
 1096     out1 = __msa_copy_u_h((v8i16) dst_data, 2);
 
 1105                                                uint32_t coeff0, uint32_t coeff1)
 
 1107     uint32_t load0, load1;
 
 1108     v16i8 src0, src1, src2, src3, src4;
 
 1109     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1112     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1113     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1114     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1115     v16u8 dst_data = { 0 };
 
 1117     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 1120     load1 = LW(dst + dst_stride);
 
 1122     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
 
 1123     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
 
 1125     load0 = LW(dst + 2 * dst_stride);
 
 1126     load1 = LW(dst + 3 * dst_stride);
 
 1128     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
 
 1129     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
 
 1131     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 1132                tmp0, tmp1, tmp2, tmp3);
 
 1133     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
 1135     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
 1137     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
 1139     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1140     res_r = __msa_sat_u_h(res_r, 7);
 
 1142     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1143     res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
 
 1146     dst += (4 * dst_stride);
 
 1151                                                uint32_t coeff0, uint32_t coeff1)
 
 1153     uint32_t load0, load1, load2, load3;
 
 1154     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 1155     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1158     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1159     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1160     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1161     v16u8 dst_data0 = { 0 };
 
 1162     v16u8 dst_data1 = { 0 };
 
 1164     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 1165     src += (5 * src_stride);
 
 1166     LD_SB4(src, src_stride, src5, src6, src7, src8);
 
 1168     LW4(dst, dst_stride, load0, load1, load2, load3);
 
 1170     dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
 
 1171     dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
 
 1172     dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
 
 1173     dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);
 
 1175     LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);
 
 1177     dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
 
 1178     dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
 
 1179     dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
 
 1180     dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);
 
 1182     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 1183                tmp0, tmp1, tmp2, tmp3);
 
 1185     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
 1187     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
 1189     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
 1191     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1192     res_r = __msa_sat_u_h(res_r, 7);
 
 1194     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1195     res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);
 
 1198     dst += (4 * dst_stride);
 
 1200     ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
 
 1201                tmp0, tmp1, tmp2, tmp3);
 
 1203     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
 1205     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
 1207     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
 1209     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1210     res_r = __msa_sat_u_h(res_r, 7);
 
 1212     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1213     res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);
 
 1220                                               uint32_t coeff0, uint32_t coeff1,
 
 1226     } else if (4 == height) {
 
 1229     } else if (8 == height) {
 
 1237                                                uint32_t coeff0, uint32_t coeff1)
 
 1239     uint32_t load0, load1;
 
 1240     v16i8 src0, src1, src2, tmp0, tmp1;
 
 1241     v16u8 dst_data = { 0 };
 
 1244     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1245     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1246     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1248     LD_SB3(src, src_stride, src0, src1, src2);
 
 1251     load1 = LW(dst + dst_stride);
 
 1254     ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
 
 1256     tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
 1258     res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
 
 1260     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1261     res_r = __msa_sat_u_h(res_r, 7);
 
 1262     res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1263     res = __msa_aver_u_b(res, dst_data);
 
 1276     uint32_t load0, load1, row;
 
 1277     v16i8 src0, src1, src2, src3, src4;
 
 1278     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1281     v8u16 res0_r, res1_r;
 
 1283     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1284     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1285     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1290     for (row = (height >> 2); row--;) {
 
 1291         LD_SB4(src, src_stride, src1, src2, src3, src4);
 
 1292         src += (4 * src_stride);
 
 1295         load1 = LW(dst + dst_stride);
 
 1298         load0 = LW(dst + 2 * dst_stride);
 
 1299         load1 = LW(dst + 3 * dst_stride);
 
 1302         ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 1303                    tmp0, tmp1, tmp2, tmp3);
 
 1304         ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
 1305         DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
 
 1312         PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
 
 1315         ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 1316         dst += (4 * dst_stride);
 
 1323                                               uint32_t coeff0, uint32_t coeff1,
 
 1331                                               coeff0, coeff1, height);
 
 1337                                               uint32_t coeff0, uint32_t coeff1,
 
 1341     v16u8 src0, src1, src2, src3, src4;
 
 1343     v8u16 res0, res1, res2, res3;
 
 1344     v16u8 dst0, dst1, dst2, dst3;
 
 1345     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1346     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1347     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1352     for (row = height >> 2; row--;) {
 
 1353         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
 1354         src += (4 * src_stride);
 
 1355         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1356         ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 1357                    src0, src1, src2, src3);
 
 1358         DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
 
 1359                     coeff_vec, res0, res1, res2, res3);
 
 1360         SLLI_4V(res0, res1, res2, res3, 3);
 
 1366         ST8x4_UB(out0, out1, dst, dst_stride);
 
 1368         dst += (4 * dst_stride);
 
 1380     uint16_t out0, out1;
 
 1383     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
 1385     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1386     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1387     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1388     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1389     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1393     LD_UB3(src, src_stride, src0, src1, src2);
 
 1394     LD_UB2(dst, dst_stride, dst0, dst1);
 
 1395     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 
 1396     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
 1397     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1400     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1401     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1402     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1403     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
 
 1404     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 1405     out0 = __msa_copy_u_h((v8i16) dst0, 0);
 
 1406     out1 = __msa_copy_u_h((v8i16) dst0, 1);
 
 1420     v16u8 src0, src1, src2, src3, src4;
 
 1421     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1422     v16u8 dst0, dst1, dst2, dst3;
 
 1423     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
 1425     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1426     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1427     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1428     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1429     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1433     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
 1434     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1435     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
 
 1436     VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
 
 1437     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
 
 1438     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
 1439     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1442     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1443     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1444     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1446     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
 
 1447     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
 
 1448     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
 
 1449     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 1451     ST2x4_UB(dst0, 0, dst, dst_stride);
 
 1461     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 1462     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 1463     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1464     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
 1466     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1467     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1468     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1469     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1470     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1474     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
 1475     src += (5 * src_stride);
 
 1476     LD_UB4(src, src_stride, src5, src6, src7, src8);
 
 1478     LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 1480     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
 
 1481     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
 
 1482     dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
 
 1484     dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
 
 1485     dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
 
 1486     dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
 
 1488     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
 
 1489     VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
 
 1490     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
 
 1491     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
 
 1492     VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
 
 1493     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
 
 1494     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
 1495     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1498     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1499     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1500     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1501     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 1503     ST2x4_UB(dst0, 0, dst, dst_stride);
 
 1504     dst += (4 * dst_stride);
 
 1506     DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
 1507     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1510     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1511     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1512     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1513     dst4 = __msa_aver_u_b((v16u8) res, dst4);
 
 1515     ST2x4_UB(dst4, 0, dst, dst_stride);
 
 1528                                            coef_hor0, coef_hor1,
 
 1529                                            coef_ver0, coef_ver1);
 
 1530     } else if (4 == height) {
 
 1532                                            coef_hor0, coef_hor1,
 
 1533                                            coef_ver0, coef_ver1);
 
 1534     } else if (8 == height) {
 
 1536                                            coef_hor0, coef_hor1,
 
 1537                                            coef_ver0, coef_ver1);
 
 1550     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
 1552     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1553     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1554     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1555     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1556     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1560     LD_UB3(src, src_stride, src0, src1, src2);
 
 1561     LD_UB2(dst, dst_stride, dst0, dst1);
 
 1562     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 
 1563     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
 
 1564     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1567     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1568     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1569     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1570     dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
 
 1571     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 1587     v16u8 src0, src1, src2, src3, src4;
 
 1588     v16u8 dst0, dst1, dst2, dst3;
 
 1589     v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
 
 1590     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
 1592     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1593     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1594     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1595     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1596     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1604     for (row = (height >> 2); row--;) {
 
 1605         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
 1606         src += (4 * src_stride);
 
 1608         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1610         VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
 
 1611         VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
 
 1612         DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
 
 1613                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
 
 1615         MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
 
 1616              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
 
 1618         ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
 
 1621         PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
 
 1623         dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
 
 1624         dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
 
 1628         ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
 
 1629         dst += (4 * dst_stride);
 
 1644                                            coef_hor0, coef_hor1,
 
 1645                                            coef_ver0, coef_ver1);
 
 1648                                               coef_hor0, coef_hor1,
 
 1649                                               coef_ver0, coef_ver1, height);
 
 1662     v16u8 src0, src1, src2, src3, src4, out0, out1;
 
 1663     v8u16 res_hz0, res_hz1, res_hz2;
 
 1664     v8u16 res_hz3, res_hz4;
 
 1665     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
 1666     v16u8 dst0, dst1, dst2, dst3;
 
 1668     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1669     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1670     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1671     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1672     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1679     src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
 
 1680     res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
 
 1682     for (row = (height >> 2); row--;) {
 
 1683         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
 1684         src += (4 * src_stride);
 
 1686         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1687         VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
 
 1688         VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
 
 1689         DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
 
 1690                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
 
 1692         MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
 
 1693              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
 
 1696         res_vt0 += (res_hz0 * coeff_vt_vec1);
 
 1697         res_vt1 += (res_hz1 * coeff_vt_vec1);
 
 1698         res_vt2 += (res_hz2 * coeff_vt_vec1);
 
 1699         res_vt3 += (res_hz3 * coeff_vt_vec1);
 
 1701         SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
 1702         SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
 1704         PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
 
 1707         ST8x4_UB(out0, out1, dst, dst_stride);
 
 1708         dst += (4 * dst_stride);
 
 1719     uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
 
 1720     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 1722     if (0 == height % 12) {
 
 1723         for (cnt = (height / 12); cnt--;) {
 
 1725                    src0, src1, src2, src3, src4, src5, src6, src7);
 
 1726             src += (8 * src_stride);
 
 1728             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 1729             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 1730             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 1731             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 1732             out4 = __msa_copy_u_d((v2i64) src4, 0);
 
 1733             out5 = __msa_copy_u_d((v2i64) src5, 0);
 
 1734             out6 = __msa_copy_u_d((v2i64) src6, 0);
 
 1735             out7 = __msa_copy_u_d((v2i64) src7, 0);
 
 1737             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 1738             dst += (4 * dst_stride);
 
 1739             SD4(out4, out5, out6, out7, dst, dst_stride);
 
 1740             dst += (4 * dst_stride);
 
 1742             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 1743             src += (4 * src_stride);
 
 1745             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 1746             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 1747             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 1748             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 1750             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 1751             dst += (4 * dst_stride);
 
 1753     } else if (0 == height % 8) {
 
 1754         for (cnt = height >> 3; cnt--;) {
 
 1756                    src0, src1, src2, src3, src4, src5, src6, src7);
 
 1757             src += (8 * src_stride);
 
 1759             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 1760             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 1761             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 1762             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 1763             out4 = __msa_copy_u_d((v2i64) src4, 0);
 
 1764             out5 = __msa_copy_u_d((v2i64) src5, 0);
 
 1765             out6 = __msa_copy_u_d((v2i64) src6, 0);
 
 1766             out7 = __msa_copy_u_d((v2i64) src7, 0);
 
 1768             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 1769             dst += (4 * dst_stride);
 
 1770             SD4(out4, out5, out6, out7, dst, dst_stride);
 
 1771             dst += (4 * dst_stride);
 
 1773     } else if (0 == height % 4) {
 
 1774         for (cnt = (height / 4); cnt--;) {
 
 1775             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 1776             src += (4 * src_stride);
 
 1777             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 1778             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 1779             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 1780             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 1782             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 1783             dst += (4 * dst_stride);
 
 1785     } else if (0 == height % 2) {
 
 1786         for (cnt = (height / 2); cnt--;) {
 
 1787             LD_UB2(src, src_stride, src0, src1);
 
 1788             src += (2 * src_stride);
 
 1789             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 1790             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 1805     uint32_t out0, out1, out2, out3;
 
 1807     v16u8 dst0, dst1, dst2, dst3;
 
 1809     if (0 == (height % 4)) {
 
 1810         for (cnt = (height / 4); cnt--;) {
 
 1811             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 1812             src += (4 * src_stride);
 
 1814             LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1816             AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
 
 1817                         dst0, dst1, dst2, dst3);
 
 1819             out0 = __msa_copy_u_w((v4i32) dst0, 0);
 
 1820             out1 = __msa_copy_u_w((v4i32) dst1, 0);
 
 1821             out2 = __msa_copy_u_w((v4i32) dst2, 0);
 
 1822             out3 = __msa_copy_u_w((v4i32) dst3, 0);
 
 1823             SW4(out0, out1, out2, out3, dst, dst_stride);
 
 1824             dst += (4 * dst_stride);
 
 1826     } else if (0 == (height % 2)) {
 
 1827         for (cnt = (height / 2); cnt--;) {
 
 1828             LD_UB2(src, src_stride, src0, src1);
 
 1829             src += (2 * src_stride);
 
 1831             LD_UB2(dst, dst_stride, dst0, dst1);
 
 1835             out0 = __msa_copy_u_w((v4i32) dst0, 0);
 
 1836             out1 = __msa_copy_u_w((v4i32) dst1, 0);
 
 1850     uint64_t out0, out1, out2, out3;
 
 1852     v16u8 dst0, dst1, dst2, dst3;
 
 1854     for (cnt = (height / 4); cnt--;) {
 
 1855         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 1856         src += (4 * src_stride);
 
 1857         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1859         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
 
 1860                     dst0, dst1, dst2, dst3);
 
 1862         out0 = __msa_copy_u_d((v2i64) dst0, 0);
 
 1863         out1 = __msa_copy_u_d((v2i64) dst1, 0);
 
 1864         out2 = __msa_copy_u_d((v2i64) dst2, 0);
 
 1865         out3 = __msa_copy_u_d((v2i64) dst3, 0);
 
 1866         SD4(out0, out1, out2, out3, dst, dst_stride);
 
 1867         dst += (4 * dst_stride);
 
 1874     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1878                              stride, x, (8 - x), y, (8 - y), height);
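
Reconstructed around these fragments, each public entry point asserts the fraction range and dispatches on which of x and y are nonzero, turning the (x, y) subpixel position into the (weight, 8 - weight) pairs used throughout. A sketch consistent with the calls shown here:

    void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                    int stride, int height, int x, int y)
    {
        av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

        if (x && y) {
            avc_chroma_hv_8w_msa(src, stride, dst,
                                 stride, x, (8 - x), y, (8 - y), height);
        } else if (x) {
            avc_chroma_hz_8w_msa(src, stride, dst, stride, x, (8 - x), height);
        } else if (y) {
            avc_chroma_vt_8w_msa(src, stride, dst, stride, y, (8 - y), height);
        } else {
            copy_width8_msa(src, stride, dst, stride, height);
        }
    }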
 
 1893     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1897                              stride, x, (8 - x), y, (8 - y), height);
 
 1903         for (cnt = height; cnt--;) {
 
 1904             *((uint32_t *) dst) = *((uint32_t *) src);
 
 1917     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1921                              stride, x, (8 - x), y, (8 - y), height);
 
 1927         for (cnt = height; cnt--;) {
 
 1928             *((uint16_t *) dst) = *((uint16_t *) src);
 
 1939     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1944                                           stride, x, (8 - x), y,
 
 1948                                           stride, x, (8 - x), height);
 
 1951                                           stride, y, (8 - y), height);
 
 1960     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1964                                           stride, x, (8 - x), y,
 
 1968                                           stride, x, (8 - x), height);
 
 1971                                           stride, y, (8 - y), height);
 
 1982     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1986                                           stride, x, (8 - x), y,
 
 1990                                           stride, x, (8 - x), height);
 
 1993                                           stride, y, (8 - y), height);
 
 1995         for (cnt = height; cnt--;) {
 
 1996             dst[0] = (dst[0] + src[0] + 1) >> 1;
 
 1997             dst[1] = (dst[1] + src[1] + 1) >> 1;
 
static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hz_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
static void avc_chroma_hz_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
#define MUL2(in0, in1, in2, in3, out0, out1)
 
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride, int height, int x, int y)
 
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
 
static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride, int height, int x, int y)
 
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code. 
 
static void avc_chroma_vt_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
 
static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
#define INSERT_W2_UB(...)
 
static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
#define SW4(in0, in1, in2, in3, pdst, stride)
 
static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
#define ST2x4_UB(in, stidx, pdst, stride)
 
static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static const uint8_t chroma_mask_arr[16 * 5]
 
static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void copy_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
 
static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
static void avc_chroma_vt_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride, int height, int x, int y)
 
static void avc_chroma_hz_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
#define ADD2(in0, in1, in2, in3, out0, out1)
 
static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
#define SD4(in0, in1, in2, in3, pdst, stride)
 
static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
#define SLLI_4V(in0, in1, in2, in3, shift)
 
static void avc_chroma_vt_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
#define LW4(psrc, stride, out0, out1, out2, out3)
 
#define ST8x4_UB(in0, in1, pdst, stride)
 
static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride, int height, int x, int y)
 
static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride, int height, int x, int y)
 
static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
 
static void avg_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
 
static void avg_width4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
 
static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
 
static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
 
#define ST8x1_UB(in, pdst)
 
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride, int height, int x, int y)
 
#define ST4x2_UB(in, pdst, stride)
 
static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_vt_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
 
static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)