27     0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
 
   29     0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
 
   32 #define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,       \ 
   33                                out0, out1, out2, out3)                     \ 
   35     MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3);  \ 
   36     SRAR_W4_SW(out0, out1, out2, out3, rnd);                               \ 
   37     ADD4(out0, offset, out1, offset, out2, offset, out3, offset,           \ 
   38          out0, out1, out2, out3);                                          \ 
   39     out0 = CLIP_SW_0_255(out0);                                            \ 
   40     out1 = CLIP_SW_0_255(out1);                                            \ 
   41     out2 = CLIP_SW_0_255(out2);                                            \ 
   42     out3 = CLIP_SW_0_255(out3);                                            \ 
   45 #define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,              \ 
   46                             out0_r, out1_r, out0_l, out1_l)          \ 
   48     ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r);                  \ 
   49     ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l);                  \ 
   50     DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt,  \ 
   51                 out0_r, out1_r, out0_l, out1_l);                     \ 
   52     SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \ 
   53     ADD4(out0_r, offset, out1_r, offset,                             \ 
   54          out0_l, offset, out1_l, offset,                             \ 
   55          out0_r, out1_r, out0_l, out1_l);                            \ 
   56     out0_r = CLIP_SW_0_255(out0_r);                                  \ 
   57     out1_r = CLIP_SW_0_255(out1_r);                                  \ 
   58     out0_l = CLIP_SW_0_255(out0_l);                                  \ 
   59     out1_l = CLIP_SW_0_255(out1_l);                                  \ 
   62 #define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,  \ 
   63                             out0_r, out1_r, out2_r, out3_r,        \ 
   64                             out0_l, out1_l, out2_l, out3_l)        \ 
   66     HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,                \ 
   67                         out0_r, out1_r, out0_l, out1_l);           \ 
   68     HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd,                \ 
   69                         out2_r, out3_r, out2_l, out3_l);           \ 
   72 #define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \ 
   75     v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                 \ 
   77     ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                              \ 
   78     ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                              \ 
   79     DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,      \ 
   80                 wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                   \ 
   81     SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \ 
   82     PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \ 
   83     ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \ 
   84     CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);                                  \ 
   87 #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \ 
   88                                        offset_h, rnd_w, out0_h, out1_h,    \ 
   91     HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \ 
   93     HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \ 
  106     uint32_t loop_cnt, tp0, tp1, tp2, tp3;
 
  110     v8i16 dst0, dst1, dst2, dst3, offset_vec;
 
  111     v4i32 weight_vec, rnd_vec;
 
  113     weight = weight & 0x0000FFFF;
 
  114     weight_vec = __msa_fill_w(weight);
 
  115     offset_vec = __msa_fill_h(offset);
 
  116     rnd_vec = __msa_fill_w(rnd_val);
 
  119         v4i32 dst0_r, dst0_l;
 
  121         LW2(src, src_stride, tp0, tp1);
 
  123         dst0 = (v8i16) __msa_ilvr_b(zero, src0);
 
  127         DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
 
  129         dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
 
  132         out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
 
  134     } 
else if (4 == height) {
 
  135         LW4(src, src_stride, tp0, tp1, tp2, tp3);
 
  140                                        rnd_vec, dst0, dst1);
 
  141         out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
  142         ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
 
  143     } 
else if (0 == (height % 8)) {
 
  144         for (loop_cnt = (height >> 3); loop_cnt--;) {
 
  145             LW4(src, src_stride, tp0, tp1, tp2, tp3);
 
  146             src += 4 * src_stride;
 
  148             LW4(src, src_stride, tp0, tp1, tp2, tp3);
 
  149             src += 4 * src_stride;
 
  153             SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  155                                            offset_vec, rnd_vec, dst0, dst1,
 
  158             ST4x8_UB(out0, out1, dst, dst_stride);
 
  159             dst += 8 * dst_stride;
 
  174     uint64_t tp0, tp1, tp2, tp3;
 
  176     v16u8 out0, out1, out2, out3;
 
  178     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
 
  179     v4i32 weight_vec, rnd_vec;
 
  181     weight = weight & 0x0000FFFF;
 
  182     weight_vec = __msa_fill_w(weight);
 
  183     offset_vec = __msa_fill_h(offset);
 
  184     rnd_vec = __msa_fill_w(rnd_val);
 
  186     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
  187         LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
  188         src += (4 * src_stride);
 
  191         LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
  192         src += (4 * src_stride);
 
  201         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  202         SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  205                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  208                                        offset_vec, rnd_vec, dst4, dst5, dst6,
 
  213         ST6x4_UB(out0, out1, dst, dst_stride);
 
  214         dst += (4 * dst_stride);
 
  215         ST6x4_UB(out2, out3, dst, dst_stride);
 
  216         dst += (4 * dst_stride);
 
  230     uint64_t tp0, tp1, tp2, tp3;
 
  231     v16i8 
src0 = { 0 }, 
src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
 
  233     v16u8 out0, out1, out2, out3;
 
  234     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
 
  235     v4i32 weight_vec, rnd_vec;
 
  237     weight = weight & 0x0000FFFF;
 
  238     weight_vec = __msa_fill_w(weight);
 
  239     offset_vec = __msa_fill_h(offset);
 
  240     rnd_vec = __msa_fill_w(rnd_val);
 
  243         LD2(src, src_stride, tp0, tp1);
 
  248                                        rnd_vec, dst0, dst1);
 
  249         out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
  251     } 
else if (4 == height) {
 
  252         LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
  257         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  259                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  262         ST8x4_UB(out0, out1, dst, dst_stride);
 
  263     } 
else if (6 == height) {
 
  264         LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
  265         src += 4 * src_stride;
 
  268         LD2(src, src_stride, tp0, tp1);
 
  273         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  276                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  279                                        rnd_vec, dst4, dst5);
 
  280         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
  281         ST8x4_UB(out0, out1, dst, dst_stride);
 
  282         dst += (4 * dst_stride);
 
  284     } 
else if (0 == height % 8) {
 
  285         for (loop_cnt = (height >> 3); loop_cnt--;) {
 
  286             LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
  287             src += 4 * src_stride;
 
  290             LD4(src, src_stride, tp0, tp1, tp2, tp3);
 
  291             src += 4 * src_stride;
 
  299             SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  300             SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  302                                            offset_vec, rnd_vec, dst0, dst1,
 
  305                                            offset_vec, rnd_vec, dst4, dst5,
 
  309             ST8x4_UB(out0, out1, dst, dst_stride);
 
  310             dst += (4 * dst_stride);
 
  311             ST8x4_UB(out2, out3, dst, dst_stride);
 
  312             dst += (4 * dst_stride);
 
  327     v16u8 out0, out1, out2;
 
  329     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
  332     v4i32 weight_vec, rnd_vec;
 
  334     weight = weight & 0x0000FFFF;
 
  335     weight_vec = __msa_fill_w(weight);
 
  336     offset_vec = __msa_fill_h(offset);
 
  337     rnd_vec = __msa_fill_w(rnd_val);
 
  339     for (loop_cnt = 4; loop_cnt--;) {
 
  340         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  341         src += (4 * src_stride);
 
  342         ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
 
  343                    dst0, dst1, dst2, dst3);
 
  345         ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
 
  346         ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
 
  347         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  350                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  353                                        rnd_vec, dst4, dst5);
 
  355         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
  356         ST12x4_UB(out0, out1, out2, dst, dst_stride);
 
  357         dst += (4 * dst_stride);
 
  371     v16u8 out0, out1, out2, out3;
 
  374     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
 
  375     v4i32 weight_vec, rnd_vec;
 
  377     weight = weight & 0x0000FFFF;
 
  378     weight_vec = __msa_fill_w(weight);
 
  379     offset_vec = __msa_fill_h(offset);
 
  380     rnd_vec = __msa_fill_w(rnd_val);
 
  382     for (loop_cnt = height >> 2; loop_cnt--;) {
 
  383         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  384         src += (4 * src_stride);
 
  389         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  390         SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  392                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  395                                        offset_vec, rnd_vec, dst4, dst5, dst6,
 
  399         ST_UB4(out0, out1, out2, out3, dst, dst_stride);
 
  400         dst += (4 * dst_stride);
 
  414     v16u8 out0, out1, out2, out3, out4, out5;
 
  415     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  417     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
 
  418     v8i16 dst8, dst9, dst10, dst11;
 
  419     v4i32 weight_vec, rnd_vec;
 
  421     weight = weight & 0x0000FFFF;
 
  422     weight_vec = __msa_fill_w(weight);
 
  423     offset_vec = __msa_fill_h(offset);
 
  424     rnd_vec = __msa_fill_w(rnd_val);
 
  426     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  427         LD_SB4(src, src_stride, src0, src1, src4, src5);
 
  428         LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
 
  429         src += (4 * src_stride);
 
  433         ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
 
  436         ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
 
  437         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  438         SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  439         SLLI_4V(dst8, dst9, dst10, dst11, 6);
 
  441                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  444                                        offset_vec, rnd_vec, dst4, dst5, dst6,
 
  447                                        offset_vec, rnd_vec, dst8, dst9, dst10,
 
  449         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
  450         PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
 
  451         ST_UB4(out0, out1, out3, out4, dst, dst_stride);
 
  452         ST8x4_UB(out2, out5, dst + 16, dst_stride);
 
  453         dst += (4 * dst_stride);
 
  467     v16u8 out0, out1, out2, out3;
 
  470     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
 
  471     v4i32 weight_vec, rnd_vec;
 
  473     weight = weight & 0x0000FFFF;
 
  474     weight_vec = __msa_fill_w(weight);
 
  475     offset_vec = __msa_fill_h(offset);
 
  476     rnd_vec = __msa_fill_w(rnd_val);
 
  478     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  479         LD_SB2(src, src_stride, src0, src1);
 
  480         LD_SB2(src + 16, src_stride, src2, src3);
 
  481         src += (2 * src_stride);
 
  487         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  488         SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  490                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  493                                        offset_vec, rnd_vec, dst4, dst5, dst6,
 
  497         ST_UB2(out0, out1, dst, dst_stride);
 
  498         ST_UB2(out2, out3, dst + 16, dst_stride);
 
  499         dst += (2 * dst_stride);
 
  513     v16u8 out0, out1, out2, out3, out4, out5;
 
  514     v16i8 
src0, 
src1, src2, src3, src4, src5;
 
  516     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
 
  517     v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
 
  518     v4i32 weight_vec, rnd_vec;
 
  520     weight = weight & 0x0000FFFF;
 
  521     weight_vec = __msa_fill_w(weight);
 
  522     offset_vec = __msa_fill_h(offset);
 
  523     rnd_vec = __msa_fill_w(rnd_val);
 
  525     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  526         LD_SB3(src, 16, src0, src1, src2);
 
  528         LD_SB3(src, 16, src3, src4, src5);
 
  537         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  538         SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  539         SLLI_4V(dst8, dst9, dst10, dst11, 6);
 
  541                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  544                                        offset_vec, rnd_vec, dst4, dst5, dst6,
 
  547                                        offset_vec, rnd_vec, dst8, dst9, dst10,
 
  549         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
  550         PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
 
  551         ST_UB2(out0, out1, dst, 16);
 
  552         ST_UB(out2, dst + 32);
 
  554         ST_UB2(out3, out4, dst, 16);
 
  555         ST_UB(out5, dst + 32);
 
  570     v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
 
  571     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  573     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
 
  574     v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
 
  575     v4i32 weight_vec, rnd_vec;
 
  577     weight = weight & 0x0000FFFF;
 
  578     weight_vec = __msa_fill_w(weight);
 
  579     offset_vec = __msa_fill_h(offset);
 
  580     rnd_vec = __msa_fill_w(rnd_val);
 
  582     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  583         LD_SB4(src, 16, src0, src1, src2, src3);
 
  585         LD_SB4(src, 16, src4, src5, src6, src7);
 
  596         SLLI_4V(dst0, dst1, dst2, dst3, 6);
 
  597         SLLI_4V(dst4, dst5, dst6, dst7, 6);
 
  598         SLLI_4V(dst8, dst9, dst10, dst11, 6);
 
  599         SLLI_4V(dst12, dst13, dst14, dst15, 6);
 
  601                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  604                                        offset_vec, rnd_vec, dst4, dst5, dst6,
 
  607                                        offset_vec, rnd_vec, dst8, dst9, dst10,
 
  610                                        offset_vec, rnd_vec, dst12, dst13, dst14,
 
  615         PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
 
  616         ST_UB4(out0, out1, out2, out3, dst, 16);
 
  618         ST_UB4(out4, out5, out6, out7, dst, 16);
 
  635     v8i16 filt0, filt1, filt2, filt3;
 
  636     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  637     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
  638     v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
 
  639     v8i16 filter_vec, dst01, dst23, dst45, dst67;
 
  640     v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
 
  641     v4i32 weight_vec, rnd_vec;
 
  644     weight = weight & 0x0000FFFF;
 
  646     weight_vec = __msa_fill_w(weight);
 
  647     rnd_vec = __msa_fill_w(rnd_val);
 
  652     weight_vec_h = __msa_fill_h(weight);
 
  653     offset_vec = __msa_fill_h(offset);
 
  654     denom_vec = __msa_fill_h(rnd_val);
 
  656     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
  657     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
  659     filter_vec = 
LD_SH(filter);
 
  660     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  667     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
  668         LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
  669         src += (8 * src_stride);
 
  672         VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
 
  673                    vec0, vec1, vec2, vec3);
 
  674         VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
 
  675                    vec4, vec5, vec6, vec7);
 
  676         VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
 
  677                    vec8, vec9, vec10, vec11);
 
  678         VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
 
  679                    vec12, vec13, vec14, vec15);
 
  690                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  694         ST4x8_UB(out0, out1, dst, dst_stride);
 
  695         dst += (8 * dst_stride);
 
  712     v8i16 filt0, filt1, filt2, filt3;
 
  713     v16i8 mask0, mask1, mask2, mask3;
 
  715     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
  716     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
  717     v8i16 dst0, dst1, dst2, dst3;
 
  718     v8i16 weight_vec_h, offset_vec, denom_vec;
 
  719     v4i32 weight_vec, rnd_vec;
 
  722     weight = weight & 0x0000FFFF;
 
  724     weight_vec = __msa_fill_w(weight);
 
  725     rnd_vec = __msa_fill_w(rnd_val);
 
  730     weight_vec_h = __msa_fill_h(weight);
 
  731     offset_vec = __msa_fill_h(offset);
 
  732     denom_vec = __msa_fill_h(rnd_val);
 
  734     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
  735     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
  737     filter_vec = 
LD_SH(filter);
 
  738     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  745     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  746         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  747         src += (4 * src_stride);
 
  750         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
  751                    vec0, vec1, vec2, vec3);
 
  752         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
  753                    vec4, vec5, vec6, vec7);
 
  754         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
  755                    vec8, vec9, vec10, vec11);
 
  756         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
  757                    vec12, vec13, vec14, vec15);
 
  768                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  772         ST8x4_UB(out0, out1, dst, dst_stride);
 
  773         dst += (4 * dst_stride);
 
  788     v16u8 out0, out1, out2;
 
  789     v8i16 filt0, filt1, filt2, filt3;
 
  790     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  791     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
  792     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
  793     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
  795     v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
 
  796     v8i16 weight_vec_h, offset_vec, denom_vec;
 
  797     v4i32 weight_vec, rnd_vec;
 
  800     weight = weight & 0x0000FFFF;
 
  802     weight_vec = __msa_fill_w(weight);
 
  803     rnd_vec = __msa_fill_w(rnd_val);
 
  808     weight_vec_h = __msa_fill_h(weight);
 
  809     offset_vec = __msa_fill_h(offset);
 
  810     denom_vec = __msa_fill_h(rnd_val);
 
  812     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
  813     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
  815     filter_vec = 
LD_SH(filter);
 
  816     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  827     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  828         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  829         LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
 
  830         src += (4 * src_stride);
 
  833         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
  834                    vec0, vec1, vec2, vec3);
 
  835         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
  836                    vec4, vec5, vec6, vec7);
 
  837         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
  838                    vec8, vec9, vec10, vec11);
 
  839         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
  840                    vec12, vec13, vec14, vec15);
 
  849         VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
 
  850                    vec0, vec1, vec2, vec3);
 
  851         VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
 
  852                    vec4, vec5, vec6, vec7);
 
  859                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  862                                        rnd_vec, dst4, dst5);
 
  864         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
  865         ST8x4_UB(out0, out1, dst, dst_stride);
 
  866         ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
 
  867         dst += (4 * dst_stride);
 
  884     v8i16 filt0, filt1, filt2, filt3;
 
  885     v16i8 mask0, mask1, mask2, mask3;
 
  887     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
  888     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
  889     v8i16 dst0, dst1, dst2, dst3;
 
  890     v8i16 weight_vec_h, offset_vec, denom_vec;
 
  891     v4i32 weight_vec, rnd_vec;
 
  895     weight_vec = __msa_fill_w(weight);
 
  896     rnd_vec = __msa_fill_w(rnd_val);
 
  901     weight_vec_h = __msa_fill_h(weight);
 
  902     offset_vec = __msa_fill_h(offset);
 
  903     denom_vec = __msa_fill_h(rnd_val);
 
  905     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
  906     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
  908     filter_vec = 
LD_SH(filter);
 
  909     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  916     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  917         LD_SB2(src, src_stride, src0, src2);
 
  918         LD_SB2(src + 8, src_stride, src1, src3);
 
  919         src += (2 * src_stride);
 
  922         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
  923                    vec0, vec1, vec2, vec3);
 
  924         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
  925                    vec4, vec5, vec6, vec7);
 
  926         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
  927                    vec8, vec9, vec10, vec11);
 
  928         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
  929                    vec12, vec13, vec14, vec15);
 
  940                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
  944         ST_UB2(out0, out1, dst, dst_stride);
 
  945         dst += (2 * dst_stride);
 
  960     v16u8 out0, out1, out2;
 
  962     v8i16 filt0, filt1, filt2, filt3;
 
  963     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
  964     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
  965     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
  966     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
  967     v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
 
  968     v4i32 weight_vec, rnd_vec;
 
  972     weight_vec = __msa_fill_w(weight);
 
  973     rnd_vec = __msa_fill_w(rnd_val);
 
  978     weight_vec_h = __msa_fill_h(weight);
 
  979     offset_vec = __msa_fill_h(offset);
 
  980     denom_vec = __msa_fill_h(rnd_val);
 
  982     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
  983     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
  985     filter_vec = 
LD_SH(filter);
 
  986     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  997     for (loop_cnt = 16; loop_cnt--;) {
 
  998         LD_SB2(src, 16, src0, src1);
 
 1000         LD_SB2(src, 16, src2, src3);
 
 1003         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
 1004                    vec0, vec1, vec2, vec3);
 
 1005         VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
 
 1006                    vec4, vec5, vec6, vec7);
 
 1007         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
 1008                    vec8, vec9, vec10, vec11);
 
 1009         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
 1010                    vec12, vec13, vec14, vec15);
 
 1020         VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
 
 1021                    vec0, vec1, vec2, vec3);
 
 1022         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
 1023                    vec4, vec5, vec6, vec7);
 
 1030                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
 1033                                        rnd_vec, dst4, dst5);
 
 1035         PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
 
 1036         ST_UB2(out0, out1, dst, dst_stride);
 
 1037         ST8x2_UB(out2, dst + 16, dst_stride);
 
 1038         dst += (2 * dst_stride);
 
 1053     v16u8 out0, out1, out2, out3;
 
 1054     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
 1055     v8i16 filt0, filt1, filt2, filt3;
 
 1056     v16i8 mask0, mask1, mask2, mask3;
 
 1057     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1058     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1060     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 1061     v8i16 weight_vec_h, offset_vec, denom_vec;
 
 1062     v4i32 weight_vec, rnd_vec;
 
 1066     weight_vec = __msa_fill_w(weight);
 
 1067     rnd_vec = __msa_fill_w(rnd_val);
 
 1072     weight_vec_h = __msa_fill_h(weight);
 
 1073     offset_vec = __msa_fill_h(offset);
 
 1074     denom_vec = __msa_fill_h(rnd_val);
 
 1076     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
 1077     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
 1079     filter_vec = 
LD_SH(filter);
 
 1080     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1087     for (loop_cnt = height >> 1; loop_cnt--;) {
 
 1088         LD_SB4(src, 8, src0, src1, src2, src3);
 
 1090         LD_SB4(src, 8, src4, src5, src6, src7);
 
 1094         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
 1095                    vec0, vec1, vec2, vec3);
 
 1096         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
 1097                    vec4, vec5, vec6, vec7);
 
 1098         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
 1099                    vec8, vec9, vec10, vec11);
 
 1100         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
 1101                    vec12, vec13, vec14, vec15);
 
 1111         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
 
 1112                    vec0, vec1, vec2, vec3);
 
 1113         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
 
 1114                    vec4, vec5, vec6, vec7);
 
 1115         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
 
 1116                    vec8, vec9, vec10, vec11);
 
 1117         VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
 
 1118                    vec12, vec13, vec14, vec15);
 
 1129                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
 1132                                        offset_vec, rnd_vec, dst4, dst5, dst6,
 
 1137         ST_UB2(out0, out1, dst, 16);
 
 1139         ST_UB2(out2, out3, dst, 16);
 
 1155     v16u8 out0, out1, out2;
 
 1157     v8i16 filt0, filt1, filt2, filt3;
 
 1158     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
 1159     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1160     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1161     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 1162     v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
 
 1163     v4i32 weight_vec, rnd_vec;
 
 1167     weight = weight & 0x0000FFFF;
 
 1168     weight_vec = __msa_fill_w(weight);
 
 1169     rnd_vec = __msa_fill_w(rnd_val);
 
 1174     weight_vec_h = __msa_fill_h(weight);
 
 1175     offset_vec = __msa_fill_h(offset);
 
 1176     denom_vec = __msa_fill_h(rnd_val);
 
 1178     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
 1179     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
 1181     filter_vec = 
LD_SH(filter);
 
 1182     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1193     for (loop_cnt = 64; loop_cnt--;) {
 
 1194         LD_SB3(src, 16, src0, src1, src2);
 
 1195         src3 = 
LD_SB(src + 40);
 
 1199         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
 1200                    vec0, vec1, vec2, vec3);
 
 1201         VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
 
 1202                    vec4, vec5, vec6, vec7);
 
 1203         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
 1204                    vec8, vec9, vec10, vec11);
 
 1205         VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
 
 1206                    vec12, vec13, vec14, vec15);
 
 1216         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
 1217                    vec0, vec1, vec2, vec3);
 
 1218         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
 1219                    vec4, vec5, vec6, vec7);
 
 1226                                        offset_vec, rnd_vec, dst0, dst1, dst2,
 
 1229                                        rnd_vec, dst4, dst5);
 
 1231         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
 
 1232         ST_UB2(out0, out1, dst, 16);
 
 1233         ST_UB(out2, dst + 32);
 
 1250     uint32_t loop_cnt, cnt;
 
 1253     v8i16 filt0, filt1, filt2, filt3;
 
 1254     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
 
 1255     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1256     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1257     v8i16 dst0, dst1, dst2, dst3;
 
 1258     v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
 
 1259     v4i32 weight_vec, rnd_vec;
 
 1263     weight_vec = __msa_fill_w(weight);
 
 1264     rnd_vec = __msa_fill_w(rnd_val);
 
 1269     weight_vec_h = __msa_fill_h(weight);
 
 1270     offset_vec = __msa_fill_h(offset);
 
 1271     denom_vec = __msa_fill_h(rnd_val);
 
 1273     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
 
 1274     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
 
 1276     filter_vec = 
LD_SH(filter);
 
 1277     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1288     for (loop_cnt = height; loop_cnt--;) {
 
 1292         for (cnt = 2; cnt--;) {
 
 1293             LD_SB2(src_tmp, 16, src0, src1);
 
 1294             src2 = 
LD_SB(src_tmp + 24);
 
 1298             VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
 1299                        vec0, vec1, vec2, vec3);
 
 1300             VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
 
 1301                        vec4, vec5, vec6, vec7);
 
 1302             VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
 1303                        vec8, vec9, vec10, vec11);
 
 1304             VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
 1305                        vec12, vec13, vec14, vec15);
 
 1316                                            offset_vec, rnd_vec, dst0, dst1,
 
 1320             ST_UB2(out0, out1, dst_tmp, 16);
 
 1340     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8;
 
 1341     v16i8 src9, src10, src11, src12, src13, src14;
 
 1342     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
 
 1343     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
 
 1344     v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
 
 1345     v16i8 src2110, src4332, src6554, src8776, src10998;
 
 1346     v16i8 src12111110, src14131312;
 
 1347     v8i16 dst10, dst32, dst54, dst76;
 
 1348     v8i16 filt0, filt1, filt2, filt3;
 
 1349     v8i16 filter_vec, const_vec;
 
 1350     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 1351     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1353     src -= (3 * src_stride);
 
 1354     const_vec = __msa_ldi_h(128);
 
 1357     weight = weight & 0x0000FFFF;
 
 1358     weight_vec = __msa_fill_w(weight);
 
 1359     offset_vec = __msa_fill_w(offset);
 
 1360     rnd_vec = __msa_fill_w(rnd_val);
 
 1362     filter_vec = 
LD_SH(filter);
 
 1363     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1365     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1366     src += (7 * src_stride);
 
 1368     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
 
 1369                src10_r, src32_r, src54_r, src21_r);
 
 1371     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1374                src32_r, src65_r, src54_r, src2110, src4332, src6554);
 
 1378     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 1380                src7, src8, src9, src10, src11, src12, src13, src14);
 
 1381         src += (8 * src_stride);
 
 1382         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
 
 1383                    src76_r, src87_r, src98_r, src109_r);
 
 1384         ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
 
 1385                    src1110_r, src1211_r, src1312_r, src1413_r);
 
 1386         ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
 
 1387                    src1413_r, src1312_r,
 
 1388                    src8776, src10998, src12111110, src14131312);
 
 1392         DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
 
 1393                      filt2, filt3, dst10, dst10, dst10, dst10);
 
 1396                      filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
 
 1399                      filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
 
 1401         DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
 
 1402                      filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
 
 1405                             weight_vec, offset_vec, rnd_vec,
 
 1406                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 1407                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 1410                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 1411         ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
 
 1412         dst += (8 * dst_stride);
 
 1415         src4332 = src12111110;
 
 1416         src6554 = src14131312;
 
 1432     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1433     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
 
 1434     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
 
 1435     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 1436     v8i16 filt0, filt1, filt2, filt3;
 
 1437     v8i16 filter_vec, const_vec;
 
 1438     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 1439     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1441     src -= (3 * src_stride);
 
 1442     const_vec = __msa_ldi_h(128);
 
 1445     weight = weight & 0x0000FFFF;
 
 1446     weight_vec = __msa_fill_w(weight);
 
 1447     offset_vec = __msa_fill_w(offset);
 
 1448     rnd_vec = __msa_fill_w(rnd_val);
 
 1450     filter_vec = 
LD_SH(filter);
 
 1451     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1453     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1454     src += (7 * src_stride);
 
 1457     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
 
 1458                src10_r, src32_r, src54_r, src21_r);
 
 1459     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1461     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1462         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1463         src += (4 * src_stride);
 
 1465         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
 
 1466                    src76_r, src87_r, src98_r, src109_r);
 
 1470                      filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
 
 1473                      filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
 
 1476                      filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
 
 1479                      filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
 
 1482                             weight_vec, offset_vec, rnd_vec,
 
 1483                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 1484                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 1487                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 1488         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 1489         dst += (4 * dst_stride);
 
 1512     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1513     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
 
 1514     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
 
 1515     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 1516     v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
 
 1517     v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
 
 1518     v16i8 src2110, src4332, src6554, src8776, src10998;
 
 1519     v8i16 filt0, filt1, filt2, filt3;
 
 1520     v8i16 filter_vec, const_vec;
 
 1521     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
 
 1522     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
 
 1523     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1525     src -= (3 * src_stride);
 
 1526     const_vec = __msa_ldi_h(128);
 
 1529     weight = weight & 0x0000FFFF;
 
 1530     weight_vec = __msa_fill_w(weight);
 
 1531     offset_vec = __msa_fill_w(offset);
 
 1532     rnd_vec = __msa_fill_w(rnd_val);
 
 1534     filter_vec = 
LD_SH(filter);
 
 1535     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1537     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1538     src += (7 * src_stride);
 
 1541     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
 
 1542                src10_r, src32_r, src54_r, src21_r);
 
 1543     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1544     ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
 
 1545                src10_l, src32_l, src54_l, src21_l);
 
 1546     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
 1547     ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
 
 1548                src2110, src4332, src6554);
 
 1550     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1551         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1552         src += (4 * src_stride);
 
 1555         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
 
 1556                    src76_r, src87_r, src98_r, src109_r);
 
 1557         ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
 
 1558                    src76_l, src87_l, src98_l, src109_l);
 
 1559         ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
 
 1563                      filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
 
 1566                      filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
 
 1569                      filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
 
 1572                      filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
 
 1575                      filt0, filt1, filt2, filt3, tmp4, tmp4, tmp4, tmp4);
 
 1578                      filt0, filt1, filt2, filt3, tmp5, tmp5, tmp5, tmp5);
 
 1581                             weight_vec, offset_vec, rnd_vec,
 
 1582                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 1583                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 1585                             dst4_r, dst5_r, dst4_l, dst5_l);
 
 1588                          dst2_l, dst2_r, dst3_l, dst3_r,
 
 1589                          dst4_l, dst4_r, dst5_l, dst5_r,
 
 1590                          dst0_r, dst1_r, dst2_r);
 
 1591         ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
 
 1592         dst += (4 * dst_stride);
 
 1621     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8;
 
 1622     v16i8 src10_r, src32_r, src54_r, src76_r;
 
 1623     v16i8 src21_r, src43_r, src65_r, src87_r;
 
 1624     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 1625     v16i8 src10_l, src32_l, src54_l, src76_l;
 
 1626     v16i8 src21_l, src43_l, src65_l, src87_l;
 
 1627     v8i16 filt0, filt1, filt2, filt3;
 
 1628     v8i16 filter_vec, const_vec;
 
 1629     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 1630     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1632     src -= (3 * src_stride);
 
 1633     const_vec = __msa_ldi_h(128);
 
 1636     weight = weight & 0x0000FFFF;
 
 1637     weight_vec = __msa_fill_w(weight);
 
 1638     offset_vec = __msa_fill_w(offset);
 
 1639     rnd_vec = __msa_fill_w(rnd_val);
 
 1641     filter_vec = 
LD_SH(filter);
 
 1642     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1644     for (cnt = (width >> 4); cnt--;) {
 
 1648         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1649         src_tmp += (7 * src_stride);
 
 1651         ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
 
 1652                    src10_r, src32_r, src54_r, src21_r);
 
 1653         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1654         ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
 
 1655                    src10_l, src32_l, src54_l, src21_l);
 
 1656         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
 1658         for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 1659             LD_SB2(src_tmp, src_stride, src7, src8);
 
 1660             src_tmp += (2 * src_stride);
 
 1662             ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 1663             ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
 
 1667                          filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
 
 1670                          filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
 
 1673                          filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
 
 1676                          filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
 
 1679                                 weight_vec, offset_vec, rnd_vec,
 
 1680                                 dst0_r, dst1_r, dst2_r, dst3_r,
 
 1681                                 dst0_l, dst1_l, dst2_l, dst3_l);
 
 1684                             dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 1685             ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
 
 1686             dst_tmp += (2 * dst_stride);
 
 1719                                        filter, height, weight,
 
 1720                                        offset, rnd_val, 16);
 
 1734                                        filter, height, weight,
 
 1735                                        offset, rnd_val, 16);
 
 1738                              filter, height, weight, offset, rnd_val);
 
 1752                                        filter, height, weight,
 
 1753                                        offset, rnd_val, 32);
 
 1767                                        filter, height, weight,
 
 1768                                        offset, rnd_val, 48);
 
 1782                                        filter, height, weight,
 
 1783                                        offset, rnd_val, 64);
 
 1790                                      const int8_t *filter_x,
 
 1791                                      const int8_t *filter_y,
 
 1798     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8;
 
 1799     v8i16 filt0, filt1, filt2, filt3;
 
 1800     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
 
 1801     v16i8 mask1, mask2, mask3;
 
 1802     v8i16 filter_vec, const_vec;
 
 1803     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1804     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1805     v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
 
 1806     v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
 
 1807     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 1808     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 1809     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
 
 1810     v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
 
 1812     src -= ((3 * src_stride) + 3);
 
 1813     filter_vec = 
LD_SH(filter_x);
 
 1814     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1816     filter_vec = 
LD_SH(filter_y);
 
 1817     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 1818     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 1820     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 1826     const_vec = __msa_ldi_h(128);
 
 1829     weight_vec = __msa_fill_w(weight);
 
 1830     offset_vec = __msa_fill_w(offset);
 
 1831     rnd_vec = __msa_fill_w(rnd_val);
 
 1833     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1834     src += (7 * src_stride);
 
 1838     VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
 
 1839     VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
 
 1840     VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
 
 1841                vec8, vec9, vec10, vec11);
 
 1842     VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
 
 1843                vec12, vec13, vec14, vec15);
 
 1845     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1846                  dst30, dst30, dst30, dst30);
 
 1848     DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
 
 1849                  dst41, dst41, dst41, dst41);
 
 1851     DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
 
 1852                  dst52, dst52, dst52, dst52);
 
 1854     DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
 
 1855                  dst63, dst63, dst63, dst63);
 
 1857     ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
 
 1858                dst10_r, dst21_r, dst32_r);
 
 1860     dst43_r = __msa_ilvl_h(dst41, dst30);
 
 1861     dst54_r = __msa_ilvl_h(dst52, dst41);
 
 1862     dst65_r = __msa_ilvl_h(dst63, dst52);
 
 1864     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
 
 1866     for (loop_cnt = height >> 1; loop_cnt--;) {
 
 1867         LD_SB2(src, src_stride, src7, src8);
 
 1868         src += (2 * src_stride);
 
 1871         VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
 
 1872                    vec0, vec1, vec2, vec3);
 
 1874         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1875                      dst87, dst87, dst87, dst87);
 
 1876         dst76_r = __msa_ilvr_h(dst87, dst66);
 
 1878                                 filt_h0, filt_h1, filt_h2, filt_h3);
 
 1879         dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
 
 1881                                 filt_h0, filt_h1, filt_h2, filt_h3);
 
 1885         MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
 
 1887         ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
 
 1893         dst += (2 * dst_stride);
 
 1901         dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
 
 1909                                               const int8_t *filter_x,
 
 1910                                               const int8_t *filter_y,
 
 1917     uint32_t loop_cnt, cnt;
 
 1920     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8;
 
 1921     v8i16 filt0, filt1, filt2, filt3;
 
 1922     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
 
 1923     v16i8 mask1, mask2, mask3;
 
 1924     v8i16 filter_vec, const_vec;
 
 1925     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1926     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1927     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
 
 1928     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 1929     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 1930     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
 
 1931     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 1932     v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
 
 1933     v4i32 weight_vec, offset_vec, rnd_vec;
 
 1934     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 1936     src -= ((3 * src_stride) + 3);
 
 1937     const_vec = __msa_ldi_h(128);
 
 1940     weight_vec = __msa_fill_w(weight);
 
 1941     offset_vec = __msa_fill_w(offset);
 
 1942     rnd_vec = __msa_fill_w(rnd_val);
 
 1944     filter_vec = 
LD_SH(filter_x);
 
 1945     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1947     filter_vec = 
LD_SH(filter_y);
 
 1948     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 1949     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 1950     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 1956     for (cnt = width >> 3; cnt--;) {
 
 1960         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1961         src_tmp += (7 * src_stride);
 
 1964         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
 1965                    vec0, vec1, vec2, vec3);
 
 1966         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
 1967                    vec4, vec5, vec6, vec7);
 
 1968         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
 1969                    vec8, vec9, vec10, vec11);
 
 1970         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
 1971                    vec12, vec13, vec14, vec15);
 
 1973         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1974                      dst0, dst0, dst0, dst0);
 
 1976         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
 
 1977                      dst1, dst1, dst1, dst1);
 
 1979         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
 
 1980                      dst2, dst2, dst2, dst2);
 
 1982         DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
 
 1983                      dst3, dst3, dst3, dst3);
 
 1985         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
 
 1986                    vec0, vec1, vec2, vec3);
 
 1987         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
 
 1988                    vec4, vec5, vec6, vec7);
 
 1989         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
 
 1990                    vec8, vec9, vec10, vec11);
 
 1992         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1993                      dst4, dst4, dst4, dst4);
 
 1995         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
 
 1996                      dst5, dst5, dst5, dst5);
 
 1998         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
 
 1999                      dst6, dst6, dst6, dst6);
 
 2001         ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
 
 2002                    dst10_r, dst32_r, dst54_r, dst21_r);
 
 2003         ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
 
 2004         ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
 
 2005                    dst10_l, dst32_l, dst54_l, dst21_l);
 
 2006         ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
 
 2008         for (loop_cnt = height >> 1; loop_cnt--;) {
 
 2009             LD_SB2(src_tmp, src_stride, src7, src8);
 
 2010             src_tmp += 2 * src_stride;
 
 2013             VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
 
 2014                        vec0, vec1, vec2, vec3);
 
 2016             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 2017                          dst7, dst7, dst7, dst7);
 
 2021                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2023                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2028             VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
 
 2029                        vec0, vec1, vec2, vec3);
 
 2031             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 2032                          dst8, dst8, dst8, dst8);
 
 2036                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2038                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 2043                                    weight_vec, offset_vec, rnd_vec,
 
 2044                                    dst0_r, dst1_r, dst0_l, dst1_l);
 
 2047             ST8x2_UB(dst0_r, dst_tmp, dst_stride);
 
 2048             dst_tmp += (2 * dst_stride);
 
 2074                                      const int8_t *filter_x,
 
 2075                                      const int8_t *filter_y,
 
 2082                                       filter_x, filter_y, height, weight,
 
 2083                                       offset, rnd_val, 8);
 
 2090                                       const int8_t *filter_x,
 
 2091                                       const int8_t *filter_y,
 
 2098                                       filter_x, filter_y, height, weight,
 
 2099                                       offset, rnd_val, 8);
 
 2101                              filter_x, filter_y, height, weight, offset,
 
 2109                                       const int8_t *filter_x,
 
 2110                                       const int8_t *filter_y,
 
 2117                                       filter_x, filter_y, height, weight,
 
 2118                                       offset, rnd_val, 16);
 
 2125                                       const int8_t *filter_x,
 
 2126                                       const int8_t *filter_y,
 
 2133                                       filter_x, filter_y, height, weight,
 
 2134                                       offset, rnd_val, 24);
 
 2141                                       const int8_t *filter_x,
 
 2142                                       const int8_t *filter_y,
 
 2149                                       filter_x, filter_y, height, weight,
 
 2150                                       offset, rnd_val, 32);
 
 2157                                       const int8_t *filter_x,
 
 2158                                       const int8_t *filter_y,
 
 2165                                       filter_x, filter_y, height, weight,
 
 2166                                       offset, rnd_val, 48);
 
 2173                                       const int8_t *filter_x,
 
 2174                                       const int8_t *filter_y,
 
 2181                                       filter_x, filter_y, height, weight,
 
 2182                                       offset, rnd_val, 64);
 
 2199     v4i32 dst0_r, dst0_l;
 
 2200     v8i16 filter_vec, const_vec;
 
 2201     v4i32 weight_vec, offset_vec, rnd_vec;
 
 2202     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
 
 2206     filter_vec = 
LD_SH(filter);
 
 2211     weight = weight & 0x0000FFFF;
 
 2213     const_vec = __msa_ldi_h(128);
 
 2216     weight_vec = __msa_fill_w(weight);
 
 2217     offset_vec = __msa_fill_w(offset);
 
 2218     rnd_vec = __msa_fill_w(rnd_val);
 
 2220     LD_SB2(src, src_stride, src0, src1);
 
 2223     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
 
 2228     DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
 
 2230     ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
 
 2236     dst += (4 * dst_stride);
 
 2251     v16i8 mask1, vec0, vec1;
 
 2253     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
 
 2254     v8i16 filter_vec, const_vec;
 
 2255     v4i32 weight_vec, offset_vec, rnd_vec;
 
 2256     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
 
 2261     filter_vec = 
LD_SH(filter);
 
 2266     weight = weight & 0x0000FFFF;
 
 2268     const_vec = __msa_ldi_h(128);
 
 2271     weight_vec = __msa_fill_w(weight);
 
 2272     offset_vec = __msa_fill_w(offset);
 
 2273     rnd_vec = __msa_fill_w(rnd_val);
 
 2275     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2278     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
 
 2282     VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
 
 2287                         dst0_r, dst1_r, dst0_l, dst1_l);
 
 2290     ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
 
 2291     dst += (4 * dst_stride);
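/* Editorial note: in mask0 above, shuffle-control bytes 16..31 select from
 * the second source operand of VSHF_B, so the { 0..4, 16..20 } pattern builds
 * the 4-tap filter windows of two 4-pixel rows (src0 and src1) in a single
 * shuffle. */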
 
2306     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 2307     v16i8 mask1, vec0, vec1;
 
 2308     v8i16 dst0, dst1, dst2, dst3;
 
 2309     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 2310     v8i16 filter_vec, const_vec;
 
 2311     v4i32 weight_vec, offset_vec, rnd_vec;
 
 2312     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
 
2316     filter_vec = LD_SH(filter);
 
 2319     weight = weight & 0x0000FFFF;
 
 2320     const_vec = __msa_ldi_h(128);
 
 2323     weight_vec = __msa_fill_w(weight);
 
 2324     offset_vec = __msa_fill_w(offset);
 
 2325     rnd_vec = __msa_fill_w(rnd_val);
 
 2329     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 2330         LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 2331         src += (8 * src_stride);
 
 2335         VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
 
 2339         VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
 
 2343         VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
 
 2347         VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
 
 2352                             weight_vec, offset_vec, rnd_vec,
 
 2353                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 2354                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 2357                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 2358         ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
 
 2359         dst += (8 * dst_stride);
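/* Editorial note: with height >> 3 iterations, each pass of this loop loads
 * eight source rows, filters and weights them as four two-row vectors, and
 * stores the result with one ST4x8_UB. */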
 
 2375                                   filter, height, weight, offset, rnd_val);
 
2376     } else if (4 == height) {
 
 2378                                   filter, height, weight, offset, rnd_val);
 
2379     } else if (8 == height || 16 == height) {
 
 2381                                           filter, height, weight,
 
 2399     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2402     v8i16 dst0, dst1, dst2, dst3;
 
 2403     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 2404     v8i16 filter_vec, const_vec;
 
 2405     v4i32 weight_vec, offset_vec, rnd_vec;
 
2409     filter_vec = LD_SH(filter);
 
 2412     weight = weight & 0x0000FFFF;
 
 2413     const_vec = __msa_ldi_h(128);
 
 2416     weight_vec = __msa_fill_w(weight);
 
 2417     offset_vec = __msa_fill_w(offset);
 
 2418     rnd_vec = __msa_fill_w(rnd_val);
 
 2422     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2423         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2424         src += (4 * src_stride);
 
 2428         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2432         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 2436         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 2440         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2445                             weight_vec, offset_vec, rnd_vec,
 
 2446                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 2447                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 2450                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 2452         ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 2453         dst += (4 * dst_stride);
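/* Editorial note (hedged): ST6x4_UB presumably writes six bytes per row --
 * a word followed by a halfword -- for four rows, so the 6-wide block is
 * stored without writing past the sixth column. */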
 
 2467     v8i16 filt0, filt1, dst0, dst1;
 
 2469     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2472     v8i16 filter_vec, const_vec;
 
 2473     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
 
 2474     v4i32 weight_vec, offset_vec, rnd_vec;
 
2478     filter_vec = LD_SH(filter);
 
 2481     weight = weight & 0x0000FFFF;
 
 2482     const_vec = __msa_ldi_h(128);
 
 2485     weight_vec = __msa_fill_w(weight);
 
 2486     offset_vec = __msa_fill_w(offset);
 
 2487     rnd_vec = __msa_fill_w(rnd_val);
 
 2491     LD_SB2(src, src_stride, src0, src1);
 
 2494     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2497     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 2502                         dst0_r, dst1_r, dst0_l, dst1_l);
 
2519     v16i8 src0, src1, src2, src3, src4, src5;
 
 2520     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2523     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 2524     v8i16 filter_vec, const_vec;
 
 2525     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
 
 2526     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
 
 2527     v4i32 weight_vec, offset_vec, rnd_vec;
 
2531     filter_vec = LD_SH(filter);
 
 2534     weight = weight & 0x0000FFFF;
 
 2535     const_vec = __msa_ldi_h(128);
 
 2538     weight_vec = __msa_fill_w(weight);
 
 2539     offset_vec = __msa_fill_w(offset);
 
 2540     rnd_vec = __msa_fill_w(rnd_val);
 
2544     LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
 
 2548     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2552     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 2556     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 2560     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2564     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 2568     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 2573                         weight_vec, offset_vec, rnd_vec,
 
 2574                         dst0_r, dst1_r, dst2_r, dst3_r,
 
 2575                         dst0_l, dst1_l, dst2_l, dst3_l);
 
 2578                         dst4_r, dst5_r, dst4_l, dst5_l);
 
 2581                      dst2_l, dst2_r, dst3_l, dst3_r,
 
 2582                      dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
 
 2584     ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 2585     dst += (4 * dst_stride);
 
 2602     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2605     v8i16 dst0, dst1, dst2, dst3;
 
 2606     v8i16 filter_vec, const_vec;
 
 2607     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 2608     v4i32 weight_vec, offset_vec, rnd_vec;
 
2612     filter_vec = LD_SH(filter);
 
 2615     weight = weight & 0x0000FFFF;
 
 2616     const_vec = __msa_ldi_h(128);
 
 2619     weight_vec = __msa_fill_w(weight);
 
 2620     offset_vec = __msa_fill_w(offset);
 
 2621     rnd_vec = __msa_fill_w(rnd_val);
 
 2625     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2626         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2627         src += (4 * src_stride);
 
 2631         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2635         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 2639         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 2643         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2648                             weight_vec, offset_vec, rnd_vec,
 
 2649                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 2650                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 2653                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 2655         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 2656         dst += (4 * dst_stride);
 
 2672                                   filter, height, weight, offset, rnd_val);
 
2673     } else if (6 == height) {
 
 2675                                   filter, height, weight, offset, rnd_val);
 
 2678                                           filter, height, weight, offset,
 
 2696     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
2697     v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 };
 
 2701     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 2702     v8i16 filter_vec, const_vec;
 
 2704     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
 
 2705     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
 
 2706     v4i32 weight_vec, offset_vec, rnd_vec;
 
2710     filter_vec = LD_SH(filter);
 
 2713     weight = weight & 0x0000FFFF;
 
 2714     const_vec = __msa_ldi_h(128);
 
 2717     weight_vec = __msa_fill_w(weight);
 
 2718     offset_vec = __msa_fill_w(offset);
 
 2719     rnd_vec = __msa_fill_w(rnd_val);
 
 2724     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2725         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2726         src += (4 * src_stride);
 
 2730         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2734         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 2738         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 2742         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2746         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
 
 2750         VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
 
 2755                             weight_vec, offset_vec, rnd_vec,
 
 2756                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 2757                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 2760                             dst4_r, dst5_r, dst4_l, dst5_l);
 
 2763                          dst2_l, dst2_r, dst3_l, dst3_r,
 
 2764                          dst4_l, dst4_r, dst5_l, dst5_r,
 
 2765                          dst0_r, dst1_r, dst2_r);
 
 2767         ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
 
 2768         dst += (4 * dst_stride);
 
2783     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 2785     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2787     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 2789     v8i16 filter_vec, const_vec;
 
 2790     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 2791     v4i32 weight_vec, offset_vec, rnd_vec;
 
2795     filter_vec = LD_SH(filter);
 
 2798     weight = weight & 0x0000FFFF;
 
 2799     const_vec = __msa_ldi_h(128);
 
 2802     weight_vec = __msa_fill_w(weight);
 
 2803     offset_vec = __msa_fill_w(offset);
 
 2804     rnd_vec = __msa_fill_w(rnd_val);
 
 2808     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2809         LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 2810         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 2811         src += (4 * src_stride);
 
 2815         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2819         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 2823         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 2827         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2831         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 2835         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 2839         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 2843         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
 
 2848                             weight_vec, offset_vec, rnd_vec,
 
 2849                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 2850                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 2853                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 2854         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 2855         dst += (2 * dst_stride);
 
 2858                             weight_vec, offset_vec, rnd_vec,
 
 2859                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 2860                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 2863                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 2864         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 2865         dst += (2 * dst_stride);
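/* Editorial note: for the 16-wide path each row is loaded twice, at offsets
 * 0 and +8, so both 8-column halves carry their complete 4-tap windows in a
 * single vector and every VSHF can shuffle one source against itself. */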
 
 2883     v8i16 dst0, dst1, dst2, dst3;
 
 2884     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2885     v16i8 mask1, mask2, mask3;
 
 2887     v8i16 filter_vec, const_vec;
 
 2888     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 2889     v4i32 weight_vec, offset_vec, rnd_vec;
 
2893     filter_vec = LD_SH(filter);
 
 2896     weight = weight & 0x0000FFFF;
 
 2897     const_vec = __msa_ldi_h(128);
 
 2900     weight_vec = __msa_fill_w(weight);
 
 2901     offset_vec = __msa_fill_w(offset);
 
 2902     rnd_vec = __msa_fill_w(rnd_val);
 
 2908     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 2910         LD_SB2(src, src_stride, src0, src2);
 
 2911         LD_SB2(src + 16, src_stride, src1, src3);
 
 2912         src += (2 * src_stride);
 
 2916         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2920         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
 
 2924         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 2928         VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
 
 2933                             weight_vec, offset_vec, rnd_vec,
 
 2934                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 2935                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 2938                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 2939         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 2940         dst += (2 * dst_stride);
 
 2943         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 2947         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2952                             dst0_r, dst1_r, dst0_l, dst1_l);
 
 2955         ST8x2_UB(dst0_r, dst_tmp, dst_stride);
 
 2956         dst_tmp += (2 * dst_stride);
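/* Editorial note: the 24-wide path splits into a 16-wide part written through
 * dst and an 8-wide remainder taken from the +16 loads and written through
 * dst_tmp (presumably initialized to dst + 16 in a line elided here). */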
 
 2973     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2974     v16i8 mask1, mask2, mask3;
 
 2975     v8i16 dst0, dst1, dst2, dst3;
 
 2977     v8i16 filter_vec, const_vec;
 
 2978     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 2979     v4i32 weight_vec, offset_vec, rnd_vec;
 
2983     filter_vec = LD_SH(filter);
 
 2986     weight = weight & 0x0000FFFF;
 
 2987     const_vec = __msa_ldi_h(128);
 
 2990     weight_vec = __msa_fill_w(weight);
 
 2991     offset_vec = __msa_fill_w(offset);
 
 2992     rnd_vec = __msa_fill_w(rnd_val);
 
 2998     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 2999         LD_SB2(src, 16, src0, src1);
 
3000         src2 = LD_SB(src + 24);
 
 3005         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3009         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
 
 3013         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 3017         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 3022                             weight_vec, offset_vec, rnd_vec,
 
 3023                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3024                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3027                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3028         ST_SW2(dst0_r, dst1_r, dst, 16);
 
 3031         LD_SB2(src, 16, src0, src1);
 
3032         src2 = LD_SB(src + 24);
 
 3037         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3041         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
 
 3045         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
 
 3049         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
 
 3054                             weight_vec, offset_vec, rnd_vec,
 
 3055                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3056                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3059                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3060         ST_SW2(dst0_r, dst1_r, dst, 16);
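/* Editorial note: for the 32-wide rows, LD_SB2(src, 16, ...) covers bytes
 * 0..31 and the extra unaligned load at src + 24 covers bytes 24..39, so the
 * 4-tap windows of the last eight columns never straddle a vector boundary. */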
 
3075     v16i8 src0, src1, src2, src3, src4;
 
 3076     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3077     v16i8 src2110, src4332;
 
 3079     v4i32 dst0_r, dst0_l;
 
 3081     v8i16 filter_vec, const_vec;
 
 3082     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3086     const_vec = __msa_ldi_h(128);
 
 3088     weight = weight & 0x0000FFFF;
 
 3090     weight_vec = __msa_fill_w(weight);
 
 3091     offset_vec = __msa_fill_w(offset);
 
 3092     rnd_vec = __msa_fill_w(rnd_val);
 
3094     filter_vec = LD_SH(filter);
 
 3097     LD_SB3(src, src_stride, src0, src1, src2);
 
 3098     src += (3 * src_stride);
 
 3099     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3100     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 3101     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 3102     LD_SB2(src, src_stride, src3, src4);
 
 3103     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3104     src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
 
 3105     src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
 
 3108     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
 
 3111     DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
 
 3113     ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
 
3131     v16i8 src0, src1, src2, src3, src4, src5, src6;
 
 3132     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
 
 3133     v16i8 src2110, src4332, src6554;
 
 3135     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
 
 3137     v8i16 filter_vec, const_vec;
 
 3138     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3142     const_vec = __msa_ldi_h(128);
 
 3144     weight = weight & 0x0000FFFF;
 
 3146     weight_vec = __msa_fill_w(weight);
 
 3147     offset_vec = __msa_fill_w(offset);
 
 3148     rnd_vec = __msa_fill_w(rnd_val);
 
3150     filter_vec = LD_SH(filter);
 
 3153     LD_SB3(src, src_stride, src0, src1, src2);
 
 3154     src += (3 * src_stride);
 
 3155     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3156     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 3157     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 3159     LD_SB4(src, src_stride, src3, src4, src5, src6);
 
 3160     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
 
 3161                src32_r, src43_r, src54_r, src65_r);
 
 3162     ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
 
 3166     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
 
 3168     DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
 
 3170                         dst0_r, dst1_r, dst0_l, dst1_l);
 
 3173     ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
 
 3174     dst += (4 * dst_stride);
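/* Editorial note: the vertical 4-tap path interleaves adjacent rows byte-wise
 * (ILVR_B) and then packs two row pairs per vector with ilvr_d, so each
 * DPADD_SB2_SH accumulates both filter tap pairs for two 4-pixel output rows
 * at once. */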
 
3188     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
 
 3189     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
 
 3190     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
 
 3191     v16i8 src2110, src4332, src6554, src8776;
 
 3192     v8i16 dst10, dst32, dst54, dst76;
 
 3193     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 3195     v8i16 filter_vec, const_vec;
 
 3196     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3200     const_vec = __msa_ldi_h(128);
 
 3202     weight = weight & 0x0000FFFF;
 
 3204     weight_vec = __msa_fill_w(weight);
 
 3205     offset_vec = __msa_fill_w(offset);
 
 3206     rnd_vec = __msa_fill_w(rnd_val);
 
3208     filter_vec = LD_SH(filter);
 
 3211     LD_SB3(src, src_stride, src0, src1, src2);
 
 3212     src += (3 * src_stride);
 
 3213     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3214     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 3215     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 3217     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 3218         LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
 
 3219         src += (6 * src_stride);
 
 3220         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
 
 3221                    src32_r, src43_r, src54_r, src65_r);
 
 3222         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 3223         ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
 
 3224                    src4332, src6554, src8776);
 
 3228         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
 
 3230         DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
 
 3232         DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
 
 3234         LD_SB2(src, src_stride, src9, src2);
 
 3235         src += (2 * src_stride);
 
 3236         ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
 
 3237         src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
 
 3238         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 3241         DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
 
 3243                             weight_vec, offset_vec, rnd_vec,
 
 3244                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3245                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3248                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3249         ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
 
 3250         dst += (8 * dst_stride);
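/* Editorial note: src2110 is rebuilt from the two newest rows at the bottom
 * of the loop, so the row history needed by the 4-tap filter slides across
 * iterations and only new rows are loaded. */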
 
 3266                                   filter, height, weight, offset, rnd_val);
 
3267     } else if (4 == height) {
 
 3269                                   filter, height, weight, offset, rnd_val);
 
3270     } else if (0 == (height % 8)) {
 
 3272                                           filter, height, weight, offset,
 
3288     v16i8 src0, src1, src2, src3, src4;
 
 3289     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3290     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 3292     v8i16 filter_vec, const_vec;
 
 3293     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3294     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 3298     const_vec = __msa_ldi_h(128);
 
 3300     weight = weight & 0x0000FFFF;
 
 3302     weight_vec = __msa_fill_w(weight);
 
 3303     offset_vec = __msa_fill_w(offset);
 
 3304     rnd_vec = __msa_fill_w(rnd_val);
 
3306     filter_vec = LD_SH(filter);
 
 3309     LD_SB3(src, src_stride, src0, src1, src2);
 
 3310     src += (3 * src_stride);
 
 3312     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3314     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3315         LD_SB2(src, src_stride, src3, src4);
 
 3316         src += (2 * src_stride);
 
 3318         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3321         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3323         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3325         LD_SB2(src, src_stride, src1, src2);
 
 3326         src += (2 * src_stride);
 
 3328         ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
 
 3331         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
 
 3333         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
 
 3335                             weight_vec, offset_vec, rnd_vec,
 
 3336                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3337                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3340                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3342         ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 3343         dst += (4 * dst_stride);
 
3357     v16i8 src0, src1, src2, src3, src4;
 
 3358     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3361     v8i16 filter_vec, const_vec;
 
 3362     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3363     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
 
 3367     const_vec = __msa_ldi_h(128);
 
 3369     weight = weight & 0x0000FFFF;
 
 3371     weight_vec = __msa_fill_w(weight);
 
 3372     offset_vec = __msa_fill_w(offset);
 
 3373     rnd_vec = __msa_fill_w(rnd_val);
 
3375     filter_vec = LD_SH(filter);
 
 3378     LD_SB3(src, src_stride, src0, src1, src2);
 
 3379     src += (3 * src_stride);
 
 3381     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3382     LD_SB2(src, src_stride, src3, src4);
 
 3384     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3387     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3389     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3391                         dst0_r, dst1_r, dst0_l, dst1_l);
 
3407     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 3408     v16i8 src10_r, src32_r, src54_r, src76_r;
 
 3409     v16i8 src21_r, src43_r, src65_r, src87_r;
 
 3410     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 3412     v8i16 filter_vec, const_vec;
 
 3413     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3414     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
 
 3415     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
 
 3419     const_vec = __msa_ldi_h(128);
 
 3421     weight = weight & 0x0000FFFF;
 
 3423     weight_vec = __msa_fill_w(weight);
 
 3424     offset_vec = __msa_fill_w(offset);
 
 3425     rnd_vec = __msa_fill_w(rnd_val);
 
3427     filter_vec = LD_SH(filter);
 
 3430     LD_SB3(src, src_stride, src0, src1, src2);
 
 3431     src += (3 * src_stride);
 
 3433     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3435     LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
 
 3437     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
 
 3438                src32_r, src43_r, src54_r, src65_r);
 
 3439     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 3442     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3444     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3446     DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
 
 3448     DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
 
 3450     DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
 
 3452     DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
 
 3454                         weight_vec, offset_vec, rnd_vec,
 
 3455                         dst0_r, dst1_r, dst2_r, dst3_r,
 
 3456                         dst0_l, dst1_l, dst2_l, dst3_l);
 
 3458                         dst4_r, dst5_r, dst4_l, dst5_l);
 
 3461                      dst2_l, dst2_r, dst3_l, dst3_r,
 
 3462                      dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
 
 3463     ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 3464     dst += (4 * dst_stride);
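/* Editorial note: the 8x6 case consumes a nine-row window (LD_SB3 plus
 * LD_SB6) and forms six dot-product pairs, tmp0..tmp5, one per output row;
 * the first four rows go out via ST8x4_UB and the last two in a store elided
 * from this listing. */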
 
3479     v16i8 src0, src1, src2, src3, src4;
 
 3480     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3481     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 3483     v8i16 filter_vec, const_vec;
 
 3484     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3485     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 3489     const_vec = __msa_ldi_h(128);
 
 3491     weight = weight & 0x0000FFFF;
 
 3493     weight_vec = __msa_fill_w(weight);
 
 3494     offset_vec = __msa_fill_w(offset);
 
 3495     rnd_vec = __msa_fill_w(rnd_val);
 
3497     filter_vec = LD_SH(filter);
 
 3500     LD_SB3(src, src_stride, src0, src1, src2);
 
 3501     src += (3 * src_stride);
 
 3503     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3505     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3506         LD_SB2(src, src_stride, src3, src4);
 
 3507         src += (2 * src_stride);
 
 3509         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3512         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3514         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3516         LD_SB2(src, src_stride, src1, src2);
 
 3517         src += (2 * src_stride);
 
 3519         ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
 
 3522         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
 
 3524         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
 
 3526                             weight_vec, offset_vec, rnd_vec,
 
 3527                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3528                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3531                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3532         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 3533         dst += (4 * dst_stride);
 
 3549                                   filter, height, weight, offset, rnd_val);
 
3550     } else if (6 == height) {
 
 3552                                   filter, height, weight, offset, rnd_val);
 
 3555                                           filter, height, weight, offset,
 
3571     v16i8 src0, src1, src2, src3, src4, src5;
 
 3572     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3573     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 3574     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
 
 3575     v16i8 src2110, src4332;
 
 3577     v8i16 filter_vec, const_vec;
 
 3578     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3579     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
 
 3580     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
 
 3582     src -= (1 * src_stride);
 
 3584     const_vec = __msa_ldi_h(128);
 
 3586     weight = weight & 0x0000FFFF;
 
 3588     weight_vec = __msa_fill_w(weight);
 
 3589     offset_vec = __msa_fill_w(offset);
 
 3590     rnd_vec = __msa_fill_w(rnd_val);
 
3592     filter_vec = LD_SH(filter);
 
 3595     LD_SB3(src, src_stride, src0, src1, src2);
 
 3596     src += (3 * src_stride);
 
 3598     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3599     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
 
 3600     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
 
 3602     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3603         LD_SB2(src, src_stride, src3, src4);
 
 3604         src += (2 * src_stride);
 
 3606         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3607         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
 
 3608         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
 
 3611         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3613         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3615         DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
 
 3617         LD_SB2(src, src_stride, src5, src2);
 
 3618         src += (2 * src_stride);
 
 3620         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
 
 3621         ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
 
 3622         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
 
 3625         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
 
 3627         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
 
 3629         DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
 
 3631                             weight_vec, offset_vec, rnd_vec,
 
 3632                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3633                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3635                             dst4_r, dst5_r, dst4_l, dst5_l);
 
 3638                          dst2_l, dst2_r, dst3_l, dst3_r,
 
 3639                          dst4_l, dst4_r, dst5_l, dst5_r,
 
 3640                          dst0_r, dst1_r, dst2_r);
 
 3641         ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
 
 3642         dst += (4 * dst_stride);
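/* Editorial note: in the 12-wide vertical path, columns 0-7 use the right
 * (ILVR) interleaves while columns 8-11 come from the left (ILVL) interleaves
 * packed pairwise into src2110/src4332, so the narrow strip rides through the
 * same dot-product sequence as the 8-wide part. */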
 
3657     v16i8 src0, src1, src2, src3, src4, src5;
 
 3658     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3659     v16i8 src10_l, src32_l, src21_l, src43_l;
 
 3660     v8i16 tmp0, tmp1, tmp2, tmp3;
 
 3662     v8i16 filter_vec, const_vec;
 
 3663     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3664     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
 
 3668     const_vec = __msa_ldi_h(128);
 
 3670     weight = weight & 0x0000FFFF;
 
 3672     weight_vec = __msa_fill_w(weight);
 
 3673     offset_vec = __msa_fill_w(offset);
 
 3674     rnd_vec = __msa_fill_w(rnd_val);
 
3676     filter_vec = LD_SH(filter);
 
 3679     LD_SB3(src, src_stride, src0, src1, src2);
 
 3680     src += (3 * src_stride);
 
 3682     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3683     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
 
 3685     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3686         LD_SB2(src, src_stride, src3, src4);
 
 3687         src += (2 * src_stride);
 
 3689         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3690         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
 
 3693         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3695         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3697         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
 
 3699         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
 
 3701                             weight_vec, offset_vec, rnd_vec,
 
 3702                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3703                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3706                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3707         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 3708         dst += (2 * dst_stride);
 
 3710         LD_SB2(src, src_stride, src5, src2);
 
 3711         src += (2 * src_stride);
 
 3713         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
 
 3714         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
 
 3717         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
 
 3719         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
 
 3721         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
 
 3723         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
 
 3725                             weight_vec, offset_vec, rnd_vec,
 
 3726                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3727                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3730                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3731         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 3732         dst += (2 * dst_stride);
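/* Editorial note: full 16-byte rows need both halves of each interleave; the
 * _r and _l vectors are filtered as independent 8-column pipelines and the
 * weighted results are repacked before each ST_SW2. */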
 
3747     v16i8 src0, src1, src2, src3, src4, src5;
 
 3748     v16i8 src6, src7, src8, src9, src10, src11;
 
 3749     v16i8 src10_r, src32_r, src76_r, src98_r;
 
 3750     v16i8 src21_r, src43_r, src87_r, src109_r;
 
 3751     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 3752     v16i8 src10_l, src32_l, src21_l, src43_l;
 
 3754     v8i16 filter_vec, const_vec;
 
 3755     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3756     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
 
 3757     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
 
 3761     const_vec = __msa_ldi_h(128);
 
 3763     weight = weight & 0x0000FFFF;
 
 3765     weight_vec = __msa_fill_w(weight);
 
 3766     offset_vec = __msa_fill_w(offset);
 
 3767     rnd_vec = __msa_fill_w(rnd_val);
 
3769     filter_vec = LD_SH(filter);
 
 3772     LD_SB3(src, src_stride, src0, src1, src2);
 
 3774     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3775     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
 
 3777     LD_SB3(src + 16, src_stride, src6, src7, src8);
 
 3778     src += (3 * src_stride);
 
 3780     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 3782     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3783         LD_SB2(src, src_stride, src3, src4);
 
 3785         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3786         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
 
 3787         LD_SB2(src + 16, src_stride, src9, src10);
 
 3788         src += (2 * src_stride);
 
 3790         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
 
 3793         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3795         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
 
 3797         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3799         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
 
 3801         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
 
 3803         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
 
 3806                             weight_vec, offset_vec, rnd_vec,
 
 3807                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3808                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3810                             dst4_r, dst5_r, dst4_l, dst5_l);
 
 3813                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3815         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 3816         ST8x2_UB(dst4_r, dst + 16, dst_stride);
 
 3817         dst += (2 * dst_stride);
 
 3819         LD_SB2(src, src_stride, src5, src2);
 
 3821         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
 
 3822         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
 
 3823         LD_SB2(src + 16, src_stride, src11, src8);
 
 3824         src += (2 * src_stride);
 
 3826         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
 
 3829         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
 
 3831         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
 
 3833         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
 
 3835         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
 
 3837         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
 
 3839         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
 
 3842                             weight_vec, offset_vec, rnd_vec,
 
 3843                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3844                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3846                             dst4_r, dst5_r, dst4_l, dst5_l);
 
 3849                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3851         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 3852         ST8x2_UB(dst4_r, dst + 16, dst_stride);
 
 3853         dst += (2 * dst_stride);
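/* Editorial note: the 24-wide vertical path is a 16-wide pipeline (both _r
 * and _l interleaves) plus an 8-wide strip from the +16 loads that needs only
 * the _r interleaves and is stored with ST8x2_UB at dst + 16. */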
 
3869     v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
 
 3870     v16i8 src10_r, src32_r, src76_r, src98_r;
 
 3871     v16i8 src21_r, src43_r, src87_r, src109_r;
 
 3872     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 3873     v16i8 src10_l, src32_l, src76_l, src98_l;
 
 3874     v16i8 src21_l, src43_l, src87_l, src109_l;
 
 3876     v8i16 filter_vec, const_vec;
 
 3877     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3878     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
 
 3879     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
 
 3883     const_vec = __msa_ldi_h(128);
 
 3885     weight = weight & 0x0000FFFF;
 
 3887     weight_vec = __msa_fill_w(weight);
 
 3888     offset_vec = __msa_fill_w(offset);
 
 3889     rnd_vec = __msa_fill_w(rnd_val);
 
3891     filter_vec = LD_SH(filter);
 
 3894     LD_SB3(src, src_stride, src0, src1, src2);
 
 3896     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 3897     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
 
 3899     LD_SB3(src + 16, src_stride, src6, src7, src8);
 
 3900     src += (3 * src_stride);
 
 3902     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 3903     ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
 
 3905     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 3906         LD_SB2(src, src_stride, src3, src4);
 
 3908         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 3909         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
 
 3912         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
 
 3914         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
 
 3916         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
 
 3918         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
 
 3921                             weight_vec, offset_vec, rnd_vec,
 
 3922                             dst0_r, dst1_r, dst2_r, dst3_r,
 
 3923                             dst0_l, dst1_l, dst2_l, dst3_l);
 
 3925                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 3926         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
 
 3927         dst += (2 * dst_stride);
 
 3935         LD_SB2(src + 16, src_stride, src9, src10);
 
 3936         src += (2 * src_stride);
 
 3938         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
 
 3939         ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
 
 3942         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
 
 3944         DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
 
 3946         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
 
 3948         DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
 
 3951                             weight_vec, offset_vec, rnd_vec,
 
 3952                             dst4_r, dst5_r, dst6_r, dst7_r,
 
 3953                             dst4_l, dst5_l, dst6_l, dst7_l);
 
 3956                         dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
 
 3957         ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
 
 3958         dst_tmp += (2 * dst_stride);
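/* Editorial note: the 32-wide vertical path runs the 16-wide pipeline twice
 * per row pair, the second time on the +16 loads with results written through
 * dst_tmp (presumably dst + 16, initialized in a line elided here). */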
 
 3972                                       const int8_t *filter_x,
 
 3973                                       const int8_t *filter_y,
 
3979     v16i8 src0, src1, src2, src3, src4;
 
 3981     v4i32 filt_h0, filt_h1;
 
 3982     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 3984     v8i16 filter_vec, const_vec;
 
 3985     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 3986     v8i16 dst0, dst1, dst2, dst3, dst4;
 
 3987     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 3988     v4i32 dst0_r, dst1_r;
 
 3989     v4i32 weight_vec, offset_vec, rnd_vec;
 
 3991     src -= (src_stride + 1);
 
3993     filter_vec = LD_SH(filter_x);
 
3996     filter_vec = LD_SH(filter_y);
 
 3997     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 3998     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 4004     const_vec = __msa_ldi_h(128);
 
 4007     weight_vec = __msa_fill_w(weight);
 
 4008     offset_vec = __msa_fill_w(offset);
 
 4009     rnd_vec = __msa_fill_w(rnd_val);
 
 4011     LD_SB3(src, src_stride, src0, src1, src2);
 
 4012     src += (3 * src_stride);
 
 4015     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4016     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4017     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 4025     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
 
 4026     LD_SB2(src, src_stride, src3, src4);
 
 4030     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 4034     dst32_r = __msa_ilvr_h(dst3, dst2);
 
 4038     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 4042     dst43_r = __msa_ilvr_h(dst4, dst3);
 
 4046     MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
 
 4048     ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
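/* Editorial sketch, not part of the original file: a scalar outline of the
 * HV path above -- a horizontal 4-tap pass producing 16-bit intermediates,
 * then a vertical 4-tap pass over those intermediates with the >> 6
 * intermediate shift HEVC uses between stages (the shift itself is elided in
 * this listing).  All names are illustrative. */
#include <stdint.h>
#include <stddef.h>

static int32_t hv_4tap_ref(const uint8_t *src, ptrdiff_t stride,
                           const int8_t *filt_x, const int16_t *filt_y)
{
    int16_t mid[4];

    for (int r = 0; r < 4; r++) {   /* horizontal pass over four rows */
        const uint8_t *p = src + r * stride;
        mid[r] = p[0] * filt_x[0] + p[1] * filt_x[1] +
                 p[2] * filt_x[2] + p[3] * filt_x[3];
    }
    /* vertical pass over the 16-bit intermediates */
    return (mid[0] * filt_y[0] + mid[1] * filt_y[1] +
            mid[2] * filt_y[2] + mid[3] * filt_y[3]) >> 6;
}
/* The weighting step (multiply, rounded shift, offset, clip) is then applied
 * once to this value, exactly as in the pure horizontal/vertical kernels. */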
 
 4060                                       const int8_t *filter_x,
 
 4061                                       const int8_t *filter_y,
 
4067     v16i8 src0, src1, src2, src3, src4, src5, src6;
 
 4069     v4i32 filt_h0, filt_h1;
 
 4070     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 4072     v8i16 filter_vec, const_vec;
 
 4073     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 4074     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 4075     v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
 
 4076     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 4077     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4079     src -= (src_stride + 1);
 
4081     filter_vec = LD_SH(filter_x);
 
4084     filter_vec = LD_SH(filter_y);
 
 4085     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 4086     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 4092     const_vec = __msa_ldi_h(128);
 
 4095     weight_vec = __msa_fill_w(weight);
 
 4096     offset_vec = __msa_fill_w(offset);
 
 4097     rnd_vec = __msa_fill_w(rnd_val);
 
 4099     LD_SB3(src, src_stride, src0, src1, src2);
 
 4100     src += (3 * src_stride);
 
 4103     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4104     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4105     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 4113     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
 
 4115     LD_SB4(src, src_stride, src3, src4, src5, src6);
 
 4119     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 4122     dst32_r = __msa_ilvr_h(dst3, dst2);
 
 4127     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 4130     dst43_r = __msa_ilvr_h(dst4, dst3);
 
 4135     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 4138     dst10_r = __msa_ilvr_h(dst5, dst4);
 
 4143     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 4146     dst21_r = __msa_ilvr_h(dst2, dst5);
 
 4151                            weight_vec, offset_vec, rnd_vec,
 
 4152                            dst0_r, dst1_r, dst2_r, dst3_r);
 
 4154     ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
 
 4161                                               const int8_t *filter_x,
 
 4162                                               const int8_t *filter_y,
 
4169     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 4171     v4i32 filt_h0, filt_h1;
 
 4172     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 4174     v8i16 filter_vec, const_vec;
 
 4175     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 4176     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
 
 4177     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
 
 4178     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 4179     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 4180     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4182     src -= (src_stride + 1);
 
4184     filter_vec = LD_SH(filter_x);
 
4187     filter_vec = LD_SH(filter_y);
 
 4188     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 4189     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 4195     const_vec = __msa_ldi_h(128);
 
 4198     weight_vec = __msa_fill_w(weight);
 
 4199     offset_vec = __msa_fill_w(offset);
 
 4200     rnd_vec = __msa_fill_w(rnd_val);
 
 4202     LD_SB3(src, src_stride, src0, src1, src2);
 
 4203     src += (3 * src_stride);
 
 4206     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4207     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4208     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 4215     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
 
 4217     for (loop_cnt = height >> 3; loop_cnt--;) {
 
 4219                src3, src4, src5, src6, src7, src8, src9, src10);
 
 4220         src += (8 * src_stride);
 
 4223         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 4226         dst32_r = __msa_ilvr_h(dst3, dst2);
 
 4230         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 4233         dst43_r = __msa_ilvr_h(dst4, dst3);
 
 4237         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 4240         dst54_r = __msa_ilvr_h(dst5, dst4);
 
 4244         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 4247         dst65_r = __msa_ilvr_h(dst6, dst5);
 
 4251         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
 
 4254         dst76_r = __msa_ilvr_h(dst7, dst6);
 
 4258         VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
 
 4261         dst87_r = __msa_ilvr_h(dst8, dst7);
 
 4265         VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
 
 4268         dst10_r = __msa_ilvr_h(dst9, dst8);
 
 4272         VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
 
 4275         dst21_r = __msa_ilvr_h(dst2, dst9);
 
 4280                                weight_vec, offset_vec, rnd_vec,
 
 4281                                dst0_r, dst1_r, dst2_r, dst3_r);
 
 4283         ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
 
 4284         dst += (4 * dst_stride);
 
 4287                                weight_vec, offset_vec, rnd_vec,
 
 4288                                dst4_r, dst5_r, dst6_r, dst7_r);
 
 4290         ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
 
 4291         dst += (4 * dst_stride);
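/* Editorial note: dst10_r and dst21_r are overwritten from the two newest
 * intermediates at the bottom of the loop, sliding the vertical-filter
 * history across the height >> 3 iterations. */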
 
 4299                                      const int8_t *filter_x,
 
 4300                                      const int8_t *filter_y,
 
 4308                                   filter_x, filter_y, height, weight,
 
4310     } else if (4 == height) {
 
 4312                                   filter_x, filter_y, height, weight,
 
4314     } else if (0 == (height % 8)) {
 
 4316                                           filter_x, filter_y, height, weight,
 
 4325                                      const int8_t *filter_x,
 
 4326                                      const int8_t *filter_y,
 
4333     v16i8 src0, src1, src2, src3, src4, src5, src6;
 
 4335     v4i32 filt_h0, filt_h1;
 
 4336     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 4338     v8i16 filter_vec, const_vec;
 
 4339     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 4340     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 4341     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 4342     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4343     v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
 
 4344     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 4345     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
 
 4347     src -= (src_stride + 1);
 
4349     filter_vec = LD_SH(filter_x);
 
4352     filter_vec = LD_SH(filter_y);
 
 4353     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 4354     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 4360     const_vec = __msa_ldi_h(128);
 
 4363     weight_vec = __msa_fill_w(weight);
 
 4364     offset_vec = __msa_fill_w(offset);
 
 4365     rnd_vec = __msa_fill_w(rnd_val);
 
 4367     LD_SB3(src, src_stride, src0, src1, src2);
 
 4368     src += (3 * src_stride);
 
 4371     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4372     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4373     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 4384     for (loop_cnt = height >> 2; loop_cnt--;) {
 
 4385         LD_SB4(src, src_stride, src3, src4, src5, src6);
 
 4386         src += (4 * src_stride);
 
 4390         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 4400         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 4410         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 4420         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 4430                                weight_vec, offset_vec, rnd_vec,
 
 4431                                dst0_r, dst1_r, dst0_l, dst1_l);
 
 4433                                weight_vec, offset_vec, rnd_vec,
 
 4434                                dst2_r, dst3_r, dst2_l, dst3_l);
 
 4436                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
 
 4437         ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 4438         dst += (4 * dst_stride);
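/* Editorial note: the 6-wide HV case computes full 8-column results and
 * relies on ST6x4_UB to store only the first six columns of each row. */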
 
 4446                                       const int8_t *filter_x,
 
 4447                                       const int8_t *filter_y,
 
4453     v16i8 src0, src1, src2, src3, src4;
 
 4455     v4i32 filt_h0, filt_h1;
 
 4456     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 4458     v8i16 filter_vec, const_vec;
 
 4459     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 4460     v8i16 dst0, dst1, dst2, dst3, dst4;
 
 4461     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 4462     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 4463     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
 
 4464     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4466     src -= (src_stride + 1);
 
4468     filter_vec = LD_SH(filter_x);
 
4471     filter_vec = LD_SH(filter_y);
 
 4472     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 4473     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 4479     const_vec = __msa_ldi_h(128);
 
 4482     weight_vec = __msa_fill_w(weight);
 
 4483     offset_vec = __msa_fill_w(offset);
 
 4484     rnd_vec = __msa_fill_w(rnd_val);
 
 4486     LD_SB3(src, src_stride, src0, src1, src2);
 
 4487     src += (3 * src_stride);
 
 4490     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4491     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4492     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 4503     LD_SB2(src, src_stride, src3, src4);
 
 4504     src += (2 * src_stride);
 
 4507     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 4516     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 4526                            weight_vec, offset_vec, rnd_vec,
 
 4527                            dst0_r, dst1_r, dst0_l, dst1_l);
 
 4530     dst += (2 * dst_stride);
 
 4537                                       const int8_t *filter_x,
 
 4538                                       const int8_t *filter_y,
 
4544     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 4546     v4i32 filt_h0, filt_h1;
 
 4547     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 4549     v8i16 filter_vec, const_vec;
 
 4550     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 4551     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
 
 4552     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 4553     v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
 
 4554     v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
 
 4555     v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
 
 4556     v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
 
 4557     v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
 
 4558     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4560     src -= (src_stride + 1);
 
4562     filter_vec = LD_SH(filter_x);
 
4565     filter_vec = LD_SH(filter_y);
 
 4566     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 4567     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 4573     const_vec = __msa_ldi_h(128);
 
 4576     weight_vec = __msa_fill_w(weight);
 
 4577     offset_vec = __msa_fill_w(offset);
 
 4578     rnd_vec = __msa_fill_w(rnd_val);
 
 4580     LD_SB3(src, src_stride, src0, src1, src2);
 
 4581     src += (3 * src_stride);
 
 4585     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4586     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4587     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 4598     LD_SB2(src, src_stride, src3, src4);
 
 4599     src += (2 * src_stride);
 
 4603     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 4613     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 4622     LD_SB2(src, src_stride, src5, src6);
 
 4623     src += (2 * src_stride);
 
 4627     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 4637     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 4646     LD_SB2(src, src_stride, src7, src8);
 
 4647     src += (2 * src_stride);
 
 4651     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
 
 4662     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
 
 4672                            weight_vec, offset_vec, rnd_vec,
 
 4673                            dst0_r, dst1_r, dst0_l, dst1_l);
 
 4675                            weight_vec, offset_vec, rnd_vec,
 
 4676                            dst2_r, dst3_r, dst2_l, dst3_l);
 
 4678                            weight_vec, offset_vec, rnd_vec,
 
 4679                            dst4_r, dst5_r, dst4_l, dst5_l);
 
 4681                      dst2_l, dst2_r, dst3_l, dst3_r,
 
 4682                      dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
 
 4683     ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
 
 4684     dst += (4 * dst_stride);
 
 4692                                               const int8_t *filter_x,
 
 4693                                               const int8_t *filter_y,
 
 4700     uint32_t loop_cnt, cnt;
 
4703     v16i8 src0, src1, src2, src3, src4, src5, src6;
 
 4705     v4i32 filt_h0, filt_h1;
 
 4706     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 4708     v8i16 filter_vec, const_vec;
 
 4709     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 4710     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 4711     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 4712     v4i32 weight_vec, offset_vec, rnd_vec;
 
 4713     v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
 
 4714     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 4715     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
 
 4717     src -= (src_stride + 1);
 
 4719     filter_vec = 
LD_SH(filter_x);
 
 4722     filter_vec = 
LD_SH(filter_y);
 
 4723     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 4724     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 4730     const_vec = __msa_ldi_h(128);
 
 4733     weight_vec = __msa_fill_w(weight);
 
 4734     offset_vec = __msa_fill_w(offset);
 
 4735     rnd_vec = __msa_fill_w(rnd_val);
 
    for (cnt = width >> 3; cnt--;) {
        /* ... */
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);
        /* ... */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        /* ... */
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            /* ... */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            /* ... */
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            /* ... */
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            /* ... */
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            /* ... */
                                   weight_vec, offset_vec, rnd_vec,
                                   dst0_r, dst1_r, dst0_l, dst1_l);
            /* ... */
                                   weight_vec, offset_vec, rnd_vec,
                                   dst2_r, dst3_r, dst2_l, dst3_l);
            /* ... */
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
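            /*
             * One 8x4 tile is finished here.  The inner loop runs height/4
             * times per 8-column stripe of the outer loop, and the three
             * rows loaded ahead of it seed the vertical 4-tap history that
             * is carried from one pass to the next.
             */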
 
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
    /* ... */
                                  filter_x, filter_y, height, weight,
                                  /* ... */
    } else if (6 == height) {
        /* ... */
                                  filter_x, filter_y, height, weight,
                                  /* ... */
    } else if (0 == (height % 4)) {
        /* ... */
                                          filter_x, filter_y, height, weight,
                                          offset, rnd_val, 8);
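/*
 * Width-8 blocks reach this point with height 2, 6, or a multiple of 4,
 * hence the three-way dispatch above: dedicated 8x2 and 8x6 kernels for the
 * odd sizes, and the stripe-looping kernel above it (with width = 8) for
 * everything else.
 */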
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    /* ... */
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 8);
    /* ... */
                             filter_x, filter_y, height, weight,
    /* ... */
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    /* ... */
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 16);
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    /* ... */
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 24);
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    /* ... */
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 32);
 
#define UNIWGT_MC_COPY(WIDTH)                                                \
void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      /* ... */              \
                                                      ptrdiff_t src_stride,  \
                                                      /* ... */              \
{                                                                            \
    int shift = denom + 14 - 8;                                              \
    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
                                    height, weight, offset, shift);          \
}

/* ... instantiations for each block width ... */

#undef UNIWGT_MC_COPY
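/*
 * For illustration, UNIWGT_MC_COPY(4) expands (modulo the parameters elided
 * above) to:
 *
 *     void ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa(uint8_t *dst,
 *                                                   ptrdiff_t dst_stride, ...)
 *     {
 *         int shift = denom + 14 - 8;
 *         hevc_uniwgt_copy_4w_msa(src, src_stride, dst, dst_stride,
 *                                 height, weight, offset, shift);
 *     }
 */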
#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \
                                                         /* ... */            \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
    int shift = denom + 14 - 8;                                               \
                                                                              \
    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,        \
                                                 dst_stride, filter, height,  \
                                                 weight, offset, shift);      \
}

/* ... */
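/*
 * Note on the shift: denom + 14 - 8 is the log2Wd of HEVC uni-directional
 * weighted prediction for 8-bit pixels (inter intermediates carry 14-bit
 * precision), i.e. denom + 6; the kernels receive it as their rnd_val
 * rounding shift.
 */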
#define UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                              \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,       \
                                                         /* ... */           \
{                                                                            \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                \
    int shift = denom + 14 - 8;                                              \
                                                                             \
    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,       \
                                                 dst_stride, filter_x,       \
                                                 filter_y, height, weight,   \
    /* ... */
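/*
 * As an illustration of the token pasting: UNI_W_MC_HV(qpel, hv, 4, 8, hv)
 * would emit ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa() wired to
 * hevc_hv_uniwgt_8t_4w_msa(), with filter_x/filter_y picked from
 * ff_hevc_qpel_filters by the mx/my fractional positions.
 */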