21 #include "libavcodec/hevc/dec.h" 
   26     -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
 
   30     32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
 
   33 #define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \ 
   34                               mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \ 
   35                               res0, res1, mul_val_b0, mul_val_b1, round)       \ 
   37     v8i16 res0_m, res1_m, res2_m, res3_m;                                      \ 
   39     MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \ 
   40          mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \ 
   42     res0_m += mul_val_h1 * tmp0;                                               \ 
   43     res1_m += mul_val_h3 * tmp0;                                               \ 
   44     res2_m += mul_val_h1 * tmp0;                                               \ 
   45     res3_m += mul_val_h3 * tmp0;                                               \ 
   47     res0_m += mul_val_b0 * src0_r;                                             \ 
   48     res1_m += mul_val_b0 * src0_l;                                             \ 
   49     res2_m += (mul_val_b0 - 1) * src0_r;                                       \ 
   50     res3_m += (mul_val_b0 - 1) * src0_l;                                       \ 
   52     res0_m += mul_val_b1 * tmp1;                                               \ 
   53     res1_m += mul_val_b1 * tmp1;                                               \ 
   54     res2_m += (mul_val_b1 + 1) * tmp1;                                         \ 
   55     res3_m += (mul_val_b1 + 1) * tmp1;                                         \ 
   57     SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \ 
   58     PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \ 
   62                                          const uint8_t *src_left,
 
   68     v8i16 vec0, vec1, vec2;
 
   71     src_data = 
LW(src_top);
 
   72     SW4(src_data, src_data, src_data, src_data, 
dst, 
stride);
 
   75         src_data = 
LW(src_left);
 
   77         vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
 
   79         vec0 = __msa_fill_h(src_left[-1]);
 
   80         vec1 = __msa_fill_h(src_top[0]);
 
   82         vec2 = (v8i16) __msa_ilvr_b(
zero, (v16i8) vec2);
 
   88         for (col = 0; col < 4; col++) {
 
   95                                          const uint8_t *src_left,
 
   99     uint8_t *tmp_dst = 
dst;
 
  101     uint16_t val0, val1, val2, val3;
 
  103     v8i16 vec0, vec1, vec2;
 
  106     src_data1 = 
LD(src_top);
 
  108     for (row = 8; row--;) {
 
  109         SD(src_data1, tmp_dst);
 
  114         src_data1 = 
LD(src_left);
 
  116         vec2 = (v8i16) __msa_insert_d((v2i64) 
zero, 0, src_data1);
 
  118         vec0 = __msa_fill_h(src_left[-1]);
 
  119         vec1 = __msa_fill_h(src_top[0]);
 
  121         vec2 = (v8i16) __msa_ilvr_b(
zero, (v16i8) vec2);
 
  150                                            const uint8_t *src_left,
 
  155     uint8_t *tmp_dst = 
dst;
 
  158     v8i16 vec0, vec1, vec2, vec3;
 
  162     for (row = 16; row--;) {
 
  170         vec0 = __msa_fill_h(src_left[-1]);
 
  171         vec1 = __msa_fill_h(src_top[0]);
 
  174         SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
 
  179         ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
 
  182         src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
 
  184         for (col = 0; col < 16; col++) {
 
  191                                           const uint8_t *src_left,
 
  195     uint32_t val0, val1, val2, val3;
 
  197     v8i16 src0_r, src_top_val, src_left_val;
 
  200     val0 = src_left[0] * 0x01010101;
 
  201     val1 = src_left[1] * 0x01010101;
 
  202     val2 = src_left[2] * 0x01010101;
 
  203     val3 = src_left[3] * 0x01010101;
 
  208         src0 = (v16i8) __msa_insert_w((v4i32) 
src0, 0, val0);
 
  209         src_top_val = __msa_fill_h(src_top[-1]);
 
  210         src_left_val = __msa_fill_h(src_left[0]);
 
  212         src0_r = (v8i16) __msa_ilvr_b(
zero, 
src0);
 
  214         src0_r -= src_top_val;
 
  216         src0_r += src_left_val;
 
  218         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
 
  219         val0 = __msa_copy_s_w((v4i32) 
src0, 0);
 
  225                                           const uint8_t *src_left,
 
  229     uint64_t val0, val1, val2, val3;
 
  231     v8i16 src0_r, src_top_val, src_left_val;
 
  234     val0 = src_left[0] * 0x0101010101010101;
 
  235     val1 = src_left[1] * 0x0101010101010101;
 
  236     val2 = src_left[2] * 0x0101010101010101;
 
  237     val3 = src_left[3] * 0x0101010101010101;
 
  240     val0 = src_left[4] * 0x0101010101010101;
 
  241     val1 = src_left[5] * 0x0101010101010101;
 
  242     val2 = src_left[6] * 0x0101010101010101;
 
  243     val3 = src_left[7] * 0x0101010101010101;
 
  248         src0 = (v16i8) __msa_insert_d((v2i64) 
src0, 0, val0);
 
  249         src_top_val = __msa_fill_h(src_top[-1]);
 
  250         src_left_val = __msa_fill_h(src_left[0]);
 
  252         src0_r = (v8i16) __msa_ilvr_b(
zero, 
src0);
 
  254         src0_r -= src_top_val;
 
  256         src0_r += src_left_val;
 
  258         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
 
  259         val0 = __msa_copy_s_d((v2i64) 
src0, 0);
 
  265                                             const uint8_t *src_left,
 
  269     uint8_t *tmp_dst = 
dst;
 
  271     uint8_t inp0, inp1, inp2, inp3;
 
  273     v8i16 src0_r, src0_l, src_left_val, src_top_val;
 
  275     src_left_val = __msa_fill_h(src_left[0]);
 
  277     for (row = 4; row--;) {
 
  284         src0 = __msa_fill_b(inp0);
 
  285         src1 = __msa_fill_b(inp1);
 
  286         src2 = __msa_fill_b(inp2);
 
  287         src3 = __msa_fill_b(inp3);
 
  295         src_top_val = __msa_fill_h(src_top[-1]);
 
  298         SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
 
  303         ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
 
  305         src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
 
  311                                             const uint8_t *src_left,
 
  315     uint8_t inp0, inp1, inp2, inp3;
 
  318     for (row = 0; row < 8; row++) {
 
  319         inp0 = src_left[row * 4];
 
  320         inp1 = src_left[row * 4 + 1];
 
  321         inp2 = src_left[row * 4 + 2];
 
  322         inp3 = src_left[row * 4 + 3];
 
  324         src0 = __msa_fill_b(inp0);
 
  325         src1 = __msa_fill_b(inp1);
 
  326         src2 = __msa_fill_b(inp2);
 
  327         src3 = __msa_fill_b(inp3);
 
  341                                        const uint8_t *src_left,
 
  345     uint8_t *tmp_dst = 
dst;
 
  346     uint32_t addition = 0;
 
  347     uint32_t val0, val1, val2;
 
  351     v8u16 sum, vec0, vec1;
 
  356     sum = __msa_hadd_u_h((v16u8) 
src, (v16u8) 
src);
 
  357     sum = (v8u16) __msa_hadd_u_w(sum, sum);
 
  358     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
 
  359     sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
 
  360     addition = __msa_copy_u_w((v4i32) sum, 0);
 
  361     store = (v16u8) __msa_fill_b(addition);
 
  362     val0 = __msa_copy_u_w((v4i32) store, 0);
 
  372         vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
 
  373         store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
 
  374         val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
 
  375         store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
 
  376         val0 = __msa_copy_u_w((v4i32) store, 0);
 
  385         ADD2(val0, addition, val1, addition, val0, val1);
 
  395         tmp_dst[
stride * 1] = val0;
 
  396         tmp_dst[
stride * 2] = val1;
 
  397         tmp_dst[
stride * 3] = val2;
 
  402                                        const uint8_t *src_left,
 
  406     uint8_t *tmp_dst = 
dst;
 
  407     uint32_t row, col, 
val;
 
  408     uint32_t addition = 0;
 
  412     v8u16 sum, vec0, vec1;
 
  418     sum = __msa_hadd_u_h((v16u8) 
src, (v16u8) 
src);
 
  419     sum = (v8u16) __msa_hadd_u_w(sum, sum);
 
  420     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
 
  421     sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
 
  422     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
 
  423     sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
 
  424     addition = __msa_copy_u_w((v4i32) sum, 0);
 
  425     store = (v16u8) __msa_fill_b(addition);
 
  426     val0 = __msa_copy_u_d((v2i64) store, 0);
 
  428     for (row = 8; row--;) {
 
  439         vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
 
  440         store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
 
  441         val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
 
  442         store = (v16u8) __msa_insert_b((v16i8) store, 0, 
val);
 
  443         val0 = __msa_copy_u_d((v2i64) store, 0);
 
  447         src = (v16u8) __msa_insert_d((v2i64) 
src, 0, val0);
 
  448         vec1 = (v8u16) __msa_ilvr_b(
zero, (v16i8) 
src);
 
  449         vec0 = (v8u16) __msa_fill_h(addition);
 
  452         vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
 
  454         for (col = 1; col < 8; col++) {
 
  455             tmp_dst[
stride * col] = vec1[col];
 
  461                                          const uint8_t *src_left,
 
  465     uint8_t *tmp_dst = 
dst;
 
  466     uint32_t row, col, 
val;
 
  467     uint32_t addition = 0;
 
  468     v16u8 src_above1, store, src_left1;
 
  469     v8u16 sum, sum_above, sum_left;
 
  470     v8u16 vec0, vec1, vec2;
 
  473     src_above1 = 
LD_UB(src_top);
 
  474     src_left1 = 
LD_UB(src_left);
 
  476     HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
 
  477     sum = sum_above + sum_left;
 
  478     sum = (v8u16) __msa_hadd_u_w(sum, sum);
 
  479     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
 
  480     sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
 
  481     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
 
  482     sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
 
  483     addition = __msa_copy_u_w((v4i32) sum, 0);
 
  484     store = (v16u8) __msa_fill_b(addition);
 
  486     for (row = 16; row--;) {
 
  492         vec0 = (v8u16) __msa_ilvr_b(
zero, (v16i8) store);
 
  494         ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
 
  496         ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
 
  498         store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
 
  499         val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
 
  500         store = (v16u8) __msa_insert_b((v16i8) store, 0, 
val);
 
  501         ST_UB(store, tmp_dst);
 
  504         vec0 = (v8u16) __msa_fill_h(addition);
 
  506         ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
 
  508         store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
 
  510         for (col = 1; col < 16; col++) {
 
  511             tmp_dst[
stride * col] = store[col];
 
  517                                          const uint8_t *src_left,
 
  521     v16u8 src_above1, src_above2, store, src_left1, src_left2;
 
  522     v8u16 sum_above1, sum_above2;
 
  523     v8u16 sum_left1, sum_left2;
 
  524     v8u16 sum, sum_above, sum_left;
 
  526     LD_UB2(src_top, 16, src_above1, src_above2);
 
  527     LD_UB2(src_left, 16, src_left1, src_left2);
 
  528     HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
 
  529     HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
 
  530     sum_above = sum_above1 + sum_above2;
 
  531     sum_left = sum_left1 + sum_left2;
 
  532     sum = sum_above + sum_left;
 
  533     sum = (v8u16) __msa_hadd_u_w(sum, sum);
 
  534     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
 
  535     sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
 
  536     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
 
  537     sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
 
  538     store = (v16u8) __msa_splati_b((v16i8) sum, 0);
 
  540     for (row = 16; row--;) {
 
  549                                           const uint8_t *src_left,
 
  553     v16i8 src_vec0, src_vec1;
 
  554     v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
 
  555     v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
 
  556     v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
 
  562     mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
 
  564     src_vec0 = (v16i8) __msa_insert_w((v4i32) 
zero, 0, 
src0);
 
  565     src_vec1 = (v16i8) __msa_insert_w((v4i32) 
zero, 0, 
src1);
 
  568     SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
 
  570     tmp0 = __msa_fill_h(src_top[4]);
 
  571     tmp1 = __msa_fill_h(src_left[4]);
 
  573     MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
 
  574          res0, res1, res2, res3);
 
  576     res0 += mul_val1 * tmp0;
 
  577     res1 += mul_val1 * tmp0;
 
  578     res2 += mul_val1 * tmp0;
 
  579     res3 += mul_val1 * tmp0;
 
  581     res0 += 3 * src_vec0_r;
 
  582     res1 += 2 * src_vec0_r;
 
  591     src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
  596                                           const uint8_t *src_left,
 
  600     v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
 
  601     v8i16 src_vec0_r, src_vec1_r;
 
  602     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
  603     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 
  604     v8i16 tmp0, tmp1, tmp2;
 
  605     v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
 
  606     v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
 
  612     src_vec0 = (v16i8) __msa_insert_d((v2i64) 
zero, 0, 
src0);
 
  613     src_vec1 = (v16i8) __msa_insert_d((v2i64) 
zero, 0, 
src1);
 
  616     SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
 
  617     SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
 
  619     tmp0 = __msa_fill_h(src_top[8]);
 
  620     tmp1 = __msa_fill_h(src_left[8]);
 
  622     MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
 
  623          res0, res1, res2, res3);
 
  624     MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
 
  625          res4, res5, res6, res7);
 
  627     tmp2 = mul_val1 * tmp0;
 
  637     res0 += 7 * src_vec0_r;
 
  638     res1 += 6 * src_vec0_r;
 
  639     res2 += 5 * src_vec0_r;
 
  640     res3 += 4 * src_vec0_r;
 
  641     res4 += 3 * src_vec0_r;
 
  642     res5 += 2 * src_vec0_r;
 
  656     PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
 
  657                 src_vec0, src_vec1, src_vec2, src_vec3);
 
  659     ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
 
  664                                             const uint8_t *src_left,
 
  668     v8i16 src0_r, src1_r, src0_l, src1_l;
 
  670     v8i16 res0, res1, tmp0, tmp1;
 
  671     v8i16 mul_val2, mul_val3;
 
  672     v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
 
  673     v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
 
  681     mul_val2 = mul_val0 - 8;
 
  682     mul_val3 = mul_val1 + 8;
 
  684     tmp0 = __msa_fill_h(src_top[16]);
 
  685     tmp1 = __msa_fill_h(src_left[16]);
 
  689                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  690                           res0, res1, 15, 1, 5);
 
  696                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  697                           res0, res1, 13, 3, 5);
 
  703                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  704                           res0, res1, 11, 5, 5);
 
  710                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  711                           res0, res1, 9, 7, 5);
 
  717                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  718                           res0, res1, 7, 9, 5);
 
  724                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  725                           res0, res1, 5, 11, 5);
 
  731                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  732                           res0, res1, 3, 13, 5);
 
  738                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  739                           res0, res1, 1, 15, 5);
 
  744                                           const uint8_t *src_left,
 
  749     v8i16 src0_r, src1_r, src0_l, src1_l;
 
  750     v8i16 vec0, vec1, res0, res1;
 
  752     v8i16 mul_val2, mul_val3;
 
  753     v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
 
  754     v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
 
  756     tmp0 = __msa_fill_h(src_top[32 - 
offset]);
 
  757     tmp1 = __msa_fill_h(src_left[32]);
 
  767     mul_val2 = mul_val0 - 8;
 
  768     mul_val3 = mul_val1 + 8;
 
  772                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  773                           res0, res1, 31, 1, 6);
 
  779                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  780                           res0, res1, 29, 3, 6);
 
  786                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  787                           res0, res1, 27, 5, 6);
 
  793                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  794                           res0, res1, 25, 7, 6);
 
  800                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  801                           res0, res1, 23, 9, 6);
 
  807                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  808                           res0, res1, 21, 11, 6);
 
  814                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  815                           res0, res1, 19, 13, 6);
 
  821                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  822                           res0, res1, 17, 15, 6);
 
  827                                           const uint8_t *src_left,
 
  832     v8i16 src0_r, src1_r, src0_l, src1_l;
 
  833     v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
 
  834     v8i16 mul_val2, mul_val3;
 
  835     v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
 
  836     v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
 
  838     tmp0 = __msa_fill_h(src_top[32 - 
offset]);
 
  839     tmp1 = __msa_fill_h(src_left[16]);
 
  849     mul_val2 = mul_val0 - 8;
 
  850     mul_val3 = mul_val1 + 8;
 
  854                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  855                           res0, res1, 15, 17, 6);
 
  861                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  862                           res0, res1, 13, 19, 6);
 
  868                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  869                           res0, res1, 11, 21, 6);
 
  875                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  876                           res0, res1, 9, 23, 6);
 
  882                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  883                           res0, res1, 7, 25, 6);
 
  889                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  890                           res0, res1, 5, 27, 6);
 
  896                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  897                           res0, res1, 3, 29, 6);
 
  903                           mul_val0, mul_val1, mul_val2, mul_val3,
 
  904                           res0, res1, 1, 31, 6);
 
  909                                             const uint8_t *src_left,
 
  924                                                      const uint8_t *src_left,
 
  929     int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
 
  930     uint8_t ref_array[3 * 32 + 4];
 
  931     uint8_t *ref_tmp = ref_array + 4;
 
  934     int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
 
  935     int32_t idx2, fact_val2, idx3, fact_val3;
 
  939     v16i8 top0, top1, top2, top3;
 
  942     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
  943     v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
 
  946     inv_angle_val = inv_angle[
mode - 18];
 
  951     if (angle < 0 && last < -1) {
 
  952         inv_angle_val = inv_angle[
mode - 18];
 
  957         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
  958             offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
 
  959             ref_tmp[h_cnt] = src_left[
offset];
 
  965     idx0 = angle_loop >> 5;
 
  966     fact_val0 = angle_loop & 31;
 
  969     idx1 = angle_loop >> 5;
 
  970     fact_val1 = angle_loop & 31;
 
  973     idx2 = angle_loop >> 5;
 
  974     fact_val2 = angle_loop & 31;
 
  977     idx3 = angle_loop >> 5;
 
  978     fact_val3 = angle_loop & 31;
 
  985     fact0 = __msa_fill_h(fact_val0);
 
  986     fact1 = __msa_fill_h(32 - fact_val0);
 
  988     fact2 = __msa_fill_h(fact_val1);
 
  989     fact3 = __msa_fill_h(32 - fact_val1);
 
  991     fact4 = __msa_fill_h(fact_val2);
 
  992     fact5 = __msa_fill_h(32 - fact_val2);
 
  994     fact6 = __msa_fill_h(fact_val3);
 
  995     fact7 = __msa_fill_h(32 - fact_val3);
 
  997     ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
 
  998     ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
 
 1000                diff0, diff2, diff4, diff6);
 
 1002                diff1, diff3, diff5, diff7);
 
 1003     ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
 
 1004     ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
 
 1005     MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
 
 1007     diff1 += diff0 * fact1;
 
 1008     diff3 += diff2 * fact3;
 
 1011     dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
 
 1016                                                      const uint8_t *src_left,
 
 1021     int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
 
 1022     uint8_t ref_array[3 * 32 + 4];
 
 1023     uint8_t *ref_tmp = ref_array + 8;
 
 1025     const uint8_t *src_left_tmp = src_left - 1;
 
 1027     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
 
 1028     int32_t idx2, fact_val2, idx3, fact_val3;
 
 1030     int32_t inv_angle_val, inv_angle_val_loop;
 
 1032     v16i8 top0, top1, top2, top3;
 
 1033     v16u8 dst_val0, dst_val1;
 
 1034     v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
 
 1035     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
 1038     inv_angle_val = inv_angle[
mode - 18];
 
 1039     last = (angle) >> 2;
 
 1044         inv_angle_val_loop = inv_angle_val * last;
 
 1050         SW(tmp1, ref_tmp + 4);
 
 1051         SW(tmp2, ref_tmp + 8);
 
 1053         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
 1054             offset = (inv_angle_val_loop + 128) >> 8;
 
 1055             ref_tmp[h_cnt] = src_left_tmp[
offset];
 
 1056             inv_angle_val_loop += inv_angle_val;
 
 1061     for (v_cnt = 0; v_cnt < 2; v_cnt++) {
 
 1062         idx0 = (angle_loop) >> 5;
 
 1063         fact_val0 = (angle_loop) & 31;
 
 1064         angle_loop += angle;
 
 1066         idx1 = (angle_loop) >> 5;
 
 1067         fact_val1 = (angle_loop) & 31;
 
 1068         angle_loop += angle;
 
 1070         idx2 = (angle_loop) >> 5;
 
 1071         fact_val2 = (angle_loop) & 31;
 
 1072         angle_loop += angle;
 
 1074         idx3 = (angle_loop) >> 5;
 
 1075         fact_val3 = (angle_loop) & 31;
 
 1076         angle_loop += angle;
 
 1083         fact0 = __msa_fill_h(fact_val0);
 
 1084         fact1 = __msa_fill_h(32 - fact_val0);
 
 1085         fact2 = __msa_fill_h(fact_val1);
 
 1086         fact3 = __msa_fill_h(32 - fact_val1);
 
 1087         fact4 = __msa_fill_h(fact_val2);
 
 1088         fact5 = __msa_fill_h(32 - fact_val2);
 
 1089         fact6 = __msa_fill_h(fact_val3);
 
 1090         fact7 = __msa_fill_h(32 - fact_val3);
 
 1097         SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
 
 1098                    diff1, diff3, diff5, diff7);
 
 1099         MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
 
 1100              diff1, diff3, diff5, diff7);
 
 1102         diff1 += diff0 * fact1;
 
 1103         diff3 += diff2 * fact3;
 
 1104         diff5 += diff4 * fact5;
 
 1105         diff7 += diff6 * fact7;
 
 1108         PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
 
 1115                                                       const uint8_t *src_left,
 
 1120     int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
 
 1121     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
 
 1122     int32_t idx2, fact_val2, idx3, fact_val3;
 
 1125     int32_t inv_angle_val, inv_angle_val_loop;
 
 1126     uint8_t ref_array[3 * 32 + 4];
 
 1127     uint8_t *ref_tmp = ref_array + 16;
 
 1129     const uint8_t *src_left_tmp = src_left - 1;
 
 1131     v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
 
 1132     v16i8 dst0, dst1, dst2, dst3;
 
 1133     v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
 
 1134     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
 1135     v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
 
 1138     inv_angle_val = inv_angle[
mode - 18];
 
 1144         inv_angle_val_loop = inv_angle_val * last;
 
 1147         tmp0 = 
LW(
ref + 16);
 
 1148         ST_UB(top0, ref_tmp);
 
 1149         SW(tmp0, ref_tmp + 16);
 
 1151         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
 1152             offset = (inv_angle_val_loop + 128) >> 8;
 
 1153             ref_tmp[h_cnt] = src_left_tmp[
offset];
 
 1154             inv_angle_val_loop += inv_angle_val;
 
 1159     for (v_cnt = 4; v_cnt--;) {
 
 1160         idx0 = (angle_loop) >> 5;
 
 1161         fact_val0 = (angle_loop) & 31;
 
 1162         angle_loop += angle;
 
 1164         idx1 = (angle_loop) >> 5;
 
 1165         fact_val1 = (angle_loop) & 31;
 
 1166         angle_loop += angle;
 
 1168         idx2 = (angle_loop) >> 5;
 
 1169         fact_val2 = (angle_loop) & 31;
 
 1170         angle_loop += angle;
 
 1172         idx3 = (angle_loop) >> 5;
 
 1173         fact_val3 = (angle_loop) & 31;
 
 1174         angle_loop += angle;
 
 1181         fact0 = __msa_fill_h(fact_val0);
 
 1182         fact1 = __msa_fill_h(32 - fact_val0);
 
 1183         fact2 = __msa_fill_h(fact_val1);
 
 1184         fact3 = __msa_fill_h(32 - fact_val1);
 
 1185         fact4 = __msa_fill_h(fact_val2);
 
 1186         fact5 = __msa_fill_h(32 - fact_val2);
 
 1187         fact6 = __msa_fill_h(fact_val3);
 
 1188         fact7 = __msa_fill_h(32 - fact_val3);
 
 1190         SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
 
 1191                    top1, top3, top5, top7);
 
 1201         MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
 
 1202              diff2, diff3, diff6, diff7);
 
 1203         MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
 
 1204              diff10, diff11, diff14, diff15);
 
 1206         diff2 += diff0 * fact1;
 
 1207         diff3 += diff1 * fact1;
 
 1208         diff6 += diff4 * fact3;
 
 1209         diff7 += diff5 * fact3;
 
 1210         diff10 += diff8 * fact5;
 
 1211         diff11 += diff9 * fact5;
 
 1212         diff14 += diff12 * fact7;
 
 1213         diff15 += diff13 * fact7;
 
 1217         PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
 
 1218                     dst0, dst1, dst2, dst3);
 
 1225                                                       const uint8_t *src_left,
 
 1230     int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
 
 1231     uint8_t ref_array[3 * 32 + 4];
 
 1234     const uint8_t *src_left_tmp = src_left - 1;
 
 1235     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
 
 1236     int32_t tmp0, tmp1, tmp2, tmp3;
 
 1238     int32_t inv_angle_val, inv_angle_val_loop;
 
 1240     v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
 
 1241     v16i8 dst0, dst1, dst2, dst3;
 
 1242     v8i16 fact0, fact1, fact2, fact3;
 
 1243     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
 1244     v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
 
 1246     ref_tmp = ref_array + 32;
 
 1249     inv_angle_val = inv_angle[
mode - 18];
 
 1255         inv_angle_val_loop = inv_angle_val * last;
 
 1262         ST_UB2(top0, top1, ref_tmp, 16);
 
 1268         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
 1269             offset = (inv_angle_val_loop + 128) >> 8;
 
 1270             ref_tmp[h_cnt] = src_left_tmp[
offset];
 
 1271             inv_angle_val_loop += inv_angle_val;
 
 1277     for (v_cnt = 16; v_cnt--;) {
 
 1278         idx0 = (angle_loop) >> 5;
 
 1279         fact_val0 = (angle_loop) & 31;
 
 1280         angle_loop += angle;
 
 1282         idx1 = (angle_loop) >> 5;
 
 1283         fact_val1 = (angle_loop) & 31;
 
 1284         angle_loop += angle;
 
 1293         fact0 = __msa_fill_h(fact_val0);
 
 1294         fact1 = __msa_fill_h(32 - fact_val0);
 
 1295         fact2 = __msa_fill_h(fact_val1);
 
 1296         fact3 = __msa_fill_h(32 - fact_val1);
 
 1301         SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
 
 1302                    top1, top3, top5, top7);
 
 1312         MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
 
 1313              diff2, diff3, diff6, diff7);
 
 1314         MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
 
 1315              diff10, diff11, diff14, diff15);
 
 1317         diff2 += diff0 * fact1;
 
 1318         diff3 += diff1 * fact1;
 
 1319         diff6 += diff4 * fact1;
 
 1320         diff7 += diff5 * fact1;
 
 1321         diff10 += diff8 * fact3;
 
 1322         diff11 += diff9 * fact3;
 
 1323         diff14 += diff12 * fact3;
 
 1324         diff15 += diff13 * fact3;
 
 1328         PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
 
 1329                     dst0, dst1, dst2, dst3);
 
 1339                                                      const uint8_t *src_left,
 
 1344     int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
 
 1345     uint8_t ref_array[3 * 32 + 4];
 
 1346     uint8_t *ref_tmp = ref_array + 4;
 
 1349     int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
 
 1350     int32_t idx2, fact_val2, idx3, fact_val3;
 
 1351     int32_t angle, angle_loop, inv_angle_val;
 
 1353     v16i8 dst_val0, dst_val1;
 
 1354     v16u8 top0, top1, top2, top3;
 
 1356     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
 1357     v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
 
 1365         inv_angle_val = inv_angle[
mode - 11];
 
 1370         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
 1371             offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
 
 1372             ref_tmp[h_cnt] = src_top[
offset];
 
 1378     idx0 = angle_loop >> 5;
 
 1379     fact_val0 = angle_loop & 31;
 
 1380     angle_loop += angle;
 
 1382     idx1 = angle_loop >> 5;
 
 1383     fact_val1 = angle_loop & 31;
 
 1384     angle_loop += angle;
 
 1386     idx2 = angle_loop >> 5;
 
 1387     fact_val2 = angle_loop & 31;
 
 1388     angle_loop += angle;
 
 1390     idx3 = angle_loop >> 5;
 
 1391     fact_val3 = angle_loop & 31;
 
 1398     fact0 = __msa_fill_h(fact_val0);
 
 1399     fact1 = __msa_fill_h(32 - fact_val0);
 
 1400     fact2 = __msa_fill_h(fact_val1);
 
 1401     fact3 = __msa_fill_h(32 - fact_val1);
 
 1402     fact4 = __msa_fill_h(fact_val2);
 
 1403     fact5 = __msa_fill_h(32 - fact_val2);
 
 1404     fact6 = __msa_fill_h(fact_val3);
 
 1405     fact7 = __msa_fill_h(32 - fact_val3);
 
 1407     ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
 
 1408     ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
 
 1410                diff0, diff2, diff4, diff6);
 
 1412                diff1, diff3, diff5, diff7);
 
 1413     ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
 
 1414     ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
 
 1415     MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
 
 1417     diff1 += diff0 * fact1;
 
 1418     diff3 += diff2 * fact3;
 
 1421     PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
 
 1423     diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
 
 1424     diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
 
 1426     diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
 
 1428     dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
 
 1429     dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
 
 1436                                                      const uint8_t *src_left,
 
 1441     int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
 
 1442     uint8_t ref_array[3 * 32 + 4];
 
 1443     uint8_t *ref_tmp = ref_array + 8;
 
 1445     const uint8_t *src_top_tmp = src_top - 1;
 
 1448     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
 
 1449     int32_t idx2, fact_val2, idx3, fact_val3;
 
 1450     int32_t angle, angle_loop, inv_angle_val;
 
 1451     v16i8 top0, top1, top2, top3;
 
 1452     v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
 
 1453     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
 1454     v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
 
 1457     last = (angle) >> 2;
 
 1462         inv_angle_val = inv_angle[
mode - 11];
 
 1468         SW(tmp1, ref_tmp + 4);
 
 1469         SW(tmp2, ref_tmp + 8);
 
 1471         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
 1472             offset = (h_cnt * inv_angle_val + 128) >> 8;
 
 1473             ref_tmp[h_cnt] = src_top_tmp[
offset];
 
 1479     for (v_cnt = 0; v_cnt < 2; v_cnt++) {
 
 1482         idx0 = angle_loop >> 5;
 
 1483         fact_val0 = angle_loop & 31;
 
 1484         angle_loop += angle;
 
 1486         idx1 = angle_loop >> 5;
 
 1487         fact_val1 = angle_loop & 31;
 
 1488         angle_loop += angle;
 
 1490         idx2 = angle_loop >> 5;
 
 1491         fact_val2 = angle_loop & 31;
 
 1492         angle_loop += angle;
 
 1494         idx3 = angle_loop >> 5;
 
 1495         fact_val3 = angle_loop & 31;
 
 1496         angle_loop += angle;
 
 1503         fact0 = __msa_fill_h(fact_val0);
 
 1504         fact1 = __msa_fill_h(32 - fact_val0);
 
 1505         fact2 = __msa_fill_h(fact_val1);
 
 1506         fact3 = __msa_fill_h(32 - fact_val1);
 
 1507         fact4 = __msa_fill_h(fact_val2);
 
 1508         fact5 = __msa_fill_h(32 - fact_val2);
 
 1509         fact6 = __msa_fill_h(fact_val3);
 
 1510         fact7 = __msa_fill_h(32 - fact_val3);
 
 1516         SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
 
 1517                    diff1, diff3, diff5, diff7);
 
 1518         MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
 
 1519              diff1, diff3, diff5, diff7);
 
 1521         diff1 += diff0 * fact1;
 
 1522         diff3 += diff2 * fact3;
 
 1523         diff5 += diff4 * fact5;
 
 1524         diff7 += diff6 * fact7;
 
 1527         PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
 
 1528                     dst_val0, dst_val1, dst_val2, dst_val3);
 
 1529         ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
 
 1531         ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, 
stride);
 
 1537                                                       const uint8_t *src_left,
 
 1542     int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
 
 1543     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
 
 1544     int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
 
 1545     v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
 
 1546     v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
 
 1547     v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
 
 1548     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
 1549     v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
 
 1551     uint8_t ref_array[3 * 32 + 4];
 
 1552     uint8_t *ref_tmp = ref_array + 16;
 
 1553     const uint8_t *
ref, *src_top_tmp = src_top - 1;
 
 1558     last = (angle) >> 1;
 
 1563         inv_angle_val = inv_angle[
mode - 11];
 
 1566         tmp0 = 
LW(
ref + 16);
 
 1567         ST_SB(top0, ref_tmp);
 
 1568         SW(tmp0, ref_tmp + 16);
 
 1570         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
 1571             offset = (h_cnt * inv_angle_val + 128) >> 8;
 
 1572             ref_tmp[h_cnt] = src_top_tmp[
offset];
 
 1578     for (v_cnt = 0; v_cnt < 4; v_cnt++) {
 
 1581         idx0 = angle_loop >> 5;
 
 1582         fact_val0 = angle_loop & 31;
 
 1583         angle_loop += angle;
 
 1585         idx1 = angle_loop >> 5;
 
 1586         fact_val1 = angle_loop & 31;
 
 1587         angle_loop += angle;
 
 1589         idx2 = angle_loop >> 5;
 
 1590         fact_val2 = angle_loop & 31;
 
 1591         angle_loop += angle;
 
 1593         idx3 = angle_loop >> 5;
 
 1594         fact_val3 = angle_loop & 31;
 
 1595         angle_loop += angle;
 
 1602         fact0 = __msa_fill_h(fact_val0);
 
 1603         fact1 = __msa_fill_h(32 - fact_val0);
 
 1604         fact2 = __msa_fill_h(fact_val1);
 
 1605         fact3 = __msa_fill_h(32 - fact_val1);
 
 1606         fact4 = __msa_fill_h(fact_val2);
 
 1607         fact5 = __msa_fill_h(32 - fact_val2);
 
 1608         fact6 = __msa_fill_h(fact_val3);
 
 1609         fact7 = __msa_fill_h(32 - fact_val3);
 
 1611         SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
 
 1612                    top1, top3, top5, top7);
 
 1623         MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
 
 1624              diff2, diff3, diff6, diff7);
 
 1625         MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
 
 1626              diff10, diff11, diff14, diff15);
 
 1628         diff2 += diff0 * fact1;
 
 1629         diff3 += diff1 * fact1;
 
 1630         diff6 += diff4 * fact3;
 
 1631         diff7 += diff5 * fact3;
 
 1632         diff10 += diff8 * fact5;
 
 1633         diff11 += diff9 * fact5;
 
 1634         diff14 += diff12 * fact7;
 
 1635         diff15 += diff13 * fact7;
 
 1639         PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
 
 1640                     dst_val0, dst_val1, dst_val2, dst_val3);
 
 1641         ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
 
 1642         ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
 
 1645         ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, 
stride);
 
 1647         ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, 
stride);
 
 1653                                                       const uint8_t *src_left,
 
 1658     int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
 
 1659     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
 
 1660     v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
 
 1661     v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
 
 1662     v8i16 fact0, fact1, fact2, fact3;
 
 1663     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
 
 1664     v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
 
 1666     uint8_t ref_array[3 * 32 + 4];
 
 1667     uint8_t *ref_tmp = ref_array + 32;
 
 1668     const uint8_t *
ref, *src_top_tmp = src_top - 1;
 
 1678         inv_angle_val = inv_angle[
mode - 11];
 
 1681         tmp0 = 
LW(
ref + 32);
 
 1682         ST_SB2(top0, top1, ref_tmp, 16);
 
 1683         SW(tmp0, ref_tmp + 32);
 
 1685         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
 
 1686             offset = (h_cnt * inv_angle_val + 128) >> 8;
 
 1687             ref_tmp[h_cnt] = src_top_tmp[
offset];
 
 1693     for (v_cnt = 0; v_cnt < 16; v_cnt++) {
 
 1695         idx0 = angle_loop >> 5;
 
 1696         fact_val0 = angle_loop & 31;
 
 1697         angle_loop += angle;
 
 1699         idx1 = angle_loop >> 5;
 
 1700         fact_val1 = angle_loop & 31;
 
 1701         angle_loop += angle;
 
 1710         fact0 = __msa_fill_h(fact_val0);
 
 1711         fact1 = __msa_fill_h(32 - fact_val0);
 
 1712         fact2 = __msa_fill_h(fact_val1);
 
 1713         fact3 = __msa_fill_h(32 - fact_val1);
 
 1718         SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
 
 1719                    top1, top3, top5, top7);
 
 1730         MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
 
 1731              diff2, diff3, diff6, diff7);
 
 1732         MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
 
 1733              diff10, diff11, diff14, diff15);
 
 1735         diff2 += diff0 * fact1;
 
 1736         diff3 += diff1 * fact1;
 
 1737         diff6 += diff4 * fact1;
 
 1738         diff7 += diff5 * fact1;
 
 1739         diff10 += diff8 * fact3;
 
 1740         diff11 += diff9 * fact3;
 
 1741         diff14 += diff12 * fact3;
 
 1742         diff15 += diff13 * fact3;
 
 1746         PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
 
 1747                     dst_val0, dst_val1, dst_val2, dst_val3);
 
 1751         ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, 
stride)
 
 1753         ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, 
stride)
 
 1755         ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, 
stride)
 
 1757         ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, 
stride)
 
 1773     for (row = 32; row--;) {
 
 1780                                      const uint8_t *src_top,
 
 1781                                      const uint8_t *src_left,
 
 1788                                      const uint8_t *src_top,
 
 1789                                      const uint8_t *src_left,
 
 1796                                      const uint8_t *src_top,
 
 1797                                      const uint8_t *src_left,
 
 1804                                      const uint8_t *src_top,
 
 1805                                      const uint8_t *src_left,
 
 1812                                const uint8_t *src_left,
 
 1835                                       const uint8_t *src_top,
 
 1836                                       const uint8_t *src_left,
 
 1841     } 
else if (
mode == 26) {
 
 1843     } 
else if (
mode >= 18) {
 
 1853                                       const uint8_t *src_top,
 
 1854                                       const uint8_t *src_left,
 
 1859     } 
else if (
mode == 26) {
 
 1861     } 
else if (
mode >= 18) {
 
 1871                                       const uint8_t *src_top,
 
 1872                                       const uint8_t *src_left,
 
 1877     } 
else if (
mode == 26) {
 
 1879     } 
else if (
mode >= 18) {
 
 1889                                       const uint8_t *src_top,
 
 1890                                       const uint8_t *src_left,
 
 1895     } 
else if (
mode == 26) {
 
 1897     } 
else if (
mode >= 18) {
 
 1907                                int x0, 
int y0, 
int c_idx)
 
 1913     int hshift = 
sps->hshift[c_idx];
 
 1914     int vshift = 
sps->vshift[c_idx];
 
 1915     int size_in_luma_h = 16 << hshift;
 
 1916     int size_in_tbs_h = size_in_luma_h >> 
sps->log2_min_tb_size;
 
 1917     int size_in_luma_v = 16 << vshift;
 
 1918     int size_in_tbs_v = size_in_luma_v >> 
sps->log2_min_tb_size;
 
 1919     int x = x0 >> hshift;
 
 1920     int y = y0 >> vshift;
 
 1921     int x_tb = (x0 >> 
sps->log2_min_tb_size) & 
sps->tb_mask;
 
 1922     int y_tb = (y0 >> 
sps->log2_min_tb_size) & 
sps->tb_mask;
 
 1925         pps->min_tb_addr_zs[(y_tb) * (
sps->tb_mask + 2) + (x_tb)];
 
 1927     ptrdiff_t 
stride = 
s->frame->linesize[c_idx] / 
sizeof(uint8_t);
 
 1928     uint8_t *
src = (uint8_t *) 
s->frame->data[c_idx] + x + y * 
stride;
 
 1930     int min_pu_width = 
sps->min_pu_width;
 
 1935     uint8_t left_array[2 * 32 + 1];
 
 1936     uint8_t filtered_left_array[2 * 32 + 1];
 
 1937     uint8_t top_array[2 * 32 + 1];
 
 1938     uint8_t filtered_top_array[2 * 32 + 1];
 
 1940     uint8_t *
left = left_array + 1;
 
 1941     uint8_t *top = top_array + 1;
 
 1942     uint8_t *filtered_left = filtered_left_array + 1;
 
 1943     uint8_t *filtered_top = filtered_top_array + 1;
 
 1946         pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & 
sps->tb_mask) *
 
 1947                                (
sps->tb_mask + 2) + (x_tb - 1)];
 
 1953         pps->min_tb_addr_zs[(y_tb - 1) * (
sps->tb_mask + 2) +
 
 1954                                ((x_tb + size_in_tbs_h) & 
sps->tb_mask)];
 
 1956     int bottom_left_size =
 
 1957         (((y0 + 2 * size_in_luma_v) >
 
 1958           (
sps->height) ? (
sps->height) : (y0 +
 
 1959                                                  2 * size_in_luma_v)) -
 
 1960          (y0 + size_in_luma_v)) >> vshift;
 
 1961     int top_right_size =
 
 1962         (((x0 + 2 * size_in_luma_h) >
 
 1963           (
sps->width) ? (
sps->width) : (x0 + 2 * size_in_luma_h)) -
 
 1964          (x0 + size_in_luma_h)) >> hshift;
 
 1966     if (
pps->constrained_intra_pred_flag == 1) {
 
 1967         int size_in_luma_pu_v = ((size_in_luma_v) >> 
sps->log2_min_pu_size);
 
 1968         int size_in_luma_pu_h = ((size_in_luma_h) >> 
sps->log2_min_pu_size);
 
 1969         int on_pu_edge_x = !(x0 & ((1 << 
sps->log2_min_pu_size) - 1));
 
 1970         int on_pu_edge_y = !(y0 & ((1 << 
sps->log2_min_pu_size) - 1));
 
 1971         if (!size_in_luma_pu_h)
 
 1972             size_in_luma_pu_h++;
 
 1973         if (cand_bottom_left == 1 && on_pu_edge_x) {
 
 1974             int x_left_pu = ((x0 - 1) >> 
sps->log2_min_pu_size);
 
 1976                 ((y0 + size_in_luma_v) >> 
sps->log2_min_pu_size);
 
 1978                 ((size_in_luma_pu_v) >
 
 1979                  (
sps->min_pu_height -
 
 1980                   y_bottom_pu) ? (
sps->min_pu_height -
 
 1981                                   y_bottom_pu) : (size_in_luma_pu_v));
 
 1982             cand_bottom_left = 0;
 
 1983             for (
i = 0; 
i < 
max; 
i += 2)
 
 1985                     ((
s->cur_frame->tab_mvf[(x_left_pu) +
 
 1987                                        i) * min_pu_width]).pred_flag ==
 
 1990         if (cand_left == 1 && on_pu_edge_x) {
 
 1991             int x_left_pu = ((x0 - 1) >> 
sps->log2_min_pu_size);
 
 1992             int y_left_pu = ((y0) >> 
sps->log2_min_pu_size);
 
 1994                 ((size_in_luma_pu_v) >
 
 1995                  (
sps->min_pu_height -
 
 1996                   y_left_pu) ? (
sps->min_pu_height -
 
 1997                                 y_left_pu) : (size_in_luma_pu_v));
 
 1999             for (
i = 0; 
i < 
max; 
i += 2)
 
 2001                     ((
s->cur_frame->tab_mvf[(x_left_pu) +
 
 2003                                        i) * min_pu_width]).pred_flag ==
 
 2006         if (cand_up_left == 1) {
 
 2007             int x_left_pu = ((x0 - 1) >> 
sps->log2_min_pu_size);
 
 2008             int y_top_pu = ((y0 - 1) >> 
sps->log2_min_pu_size);
 
 2010                 (
s->cur_frame->tab_mvf[(x_left_pu) +
 
 2011                                  (y_top_pu) * min_pu_width]).pred_flag ==
 
 2014         if (cand_up == 1 && on_pu_edge_y) {
 
 2015             int x_top_pu = ((x0) >> 
sps->log2_min_pu_size);
 
 2016             int y_top_pu = ((y0 - 1) >> 
sps->log2_min_pu_size);
 
 2018                 ((size_in_luma_pu_h) >
 
 2019                  (
sps->min_pu_width -
 
 2020                   x_top_pu) ? (
sps->min_pu_width -
 
 2021                                x_top_pu) : (size_in_luma_pu_h));
 
 2023             for (
i = 0; 
i < 
max; 
i += 2)
 
 2025                     ((
s->cur_frame->tab_mvf[(x_top_pu + 
i) +
 
 2027                                       min_pu_width]).pred_flag == 
PF_INTRA);
 
 2029         if (cand_up_right == 1 && on_pu_edge_y) {
 
 2030             int y_top_pu = ((y0 - 1) >> 
sps->log2_min_pu_size);
 
 2032                 ((x0 + size_in_luma_h) >> 
sps->log2_min_pu_size);
 
 2034                 ((size_in_luma_pu_h) >
 
 2035                  (
sps->min_pu_width -
 
 2036                   x_right_pu) ? (
sps->min_pu_width -
 
 2037                                  x_right_pu) : (size_in_luma_pu_h));
 
 2039             for (
i = 0; 
i < 
max; 
i += 2)
 
 2041                     ((
s->cur_frame->tab_mvf[(x_right_pu + 
i) +
 
 2043                                       min_pu_width]).pred_flag == 
PF_INTRA);
 
 2046         vec0 = (v16u8) __msa_ldi_b(128);
 
 2050         ST_UB4(vec0, vec0, vec0, vec0, top, 16);
 
 2062     if (cand_up_right) {
 
 2064         ST_UB(vec0, (top + 16));
 
 2068                 ((
src[(16 + top_right_size - 1) + 
stride * (-1)]) *
 
 2070             for (
i = 0; 
i < (16 - top_right_size); 
i += 4)
 
 2076         for (
i = 0; 
i < 16; 
i++)
 
 2078     if (cand_bottom_left) {
 
 2079         for (
i = 16; 
i < 16 + bottom_left_size; 
i++)
 
 2083                 ((
src[(-1) + 
stride * (16 + bottom_left_size - 1)]) *
 
 2085             for (
i = 0; 
i < (16 - bottom_left_size); 
i += 4)
 
 2091     if (
pps->constrained_intra_pred_flag == 1) {
 
 2092         if (cand_bottom_left || cand_left || cand_up_left || cand_up
 
 2095                 x0 + ((2 * 16) << hshift) <
 
 2096                 sps->width ? 2 * 16 : (
sps->width - x0) >> hshift;
 
 2098                 y0 + ((2 * 16) << vshift) <
 
 2099                 sps->height ? 2 * 16 : (
sps->height - y0) >> vshift;
 
 2100             int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
 
 2101             if (!cand_up_right) {
 
 2102                 size_max_x = x0 + ((16) << hshift) < 
sps->width ?
 
 2103                     16 : (
sps->width - x0) >> hshift;
 
 2105             if (!cand_bottom_left) {
 
 2106                 size_max_y = y0 + ((16) << vshift) < 
sps->height ?
 
 2107                     16 : (
sps->height - y0) >> vshift;
 
 2109             if (cand_bottom_left || cand_left || cand_up_left) {
 
 2112                        !((
s->cur_frame->tab_mvf[(((x0 +
 
 2113                                              ((-1) << hshift)) >> 
sps->
 
 2114                                             log2_min_pu_size)) + (((y0 +
 
 2119                                           * min_pu_width]).pred_flag ==
 
 2123                     ((
s->cur_frame->tab_mvf[(((x0 +
 
 2124                                          ((-1) << hshift)) >> 
sps->
 
 2125                                         log2_min_pu_size)) + (((y0 + ((j)
 
 2130                                       * min_pu_width]).pred_flag == 
PF_INTRA)) {
 
 2132                     while (j < size_max_x
 
 2134                            !((
s->cur_frame->tab_mvf[(((x0 +
 
 2135                                                  ((j) << hshift)) >> 
sps->
 
 2136                                                 log2_min_pu_size)) + (((y0 +
 
 2141                                               * min_pu_width]).pred_flag ==
 
 2144                     for (
i = j; 
i > (j) - (j + 1); 
i--)
 
 2146                             ((
s->cur_frame->tab_mvf[(((x0 +
 
 2148                                                    1) << hshift)) >> 
sps->
 
 2149                                                 log2_min_pu_size)) + (((y0 +
 
 2154                                               * min_pu_width]).pred_flag ==
 
 2156                             top[
i - 1] = top[
i];
 
 2161                 while (j < size_max_x
 
 2163                        !((
s->cur_frame->tab_mvf[(((x0 +
 
 2164                                              ((j) << hshift)) >> 
sps->
 
 2165                                             log2_min_pu_size)) + (((y0 + ((-1)
 
 2170                                           * min_pu_width]).pred_flag ==
 
 2175                         for (
i = j; 
i > (j) - (j + 1); 
i--)
 
 2177                                 ((
s->cur_frame->tab_mvf[(((x0 +
 
 2180                                                     sps->log2_min_pu_size))
 
 2184                                                       sps->log2_min_pu_size))
 
 2186                                                   min_pu_width]).pred_flag ==
 
 2188                                 top[
i - 1] = top[
i];
 
 2190                         for (
i = j; 
i > (j) - (j); 
i--)
 
 2192                                 ((
s->cur_frame->tab_mvf[(((x0 +
 
 2195                                                     sps->log2_min_pu_size))
 
 2199                                                       sps->log2_min_pu_size))
 
 2201                                                   min_pu_width]).pred_flag ==
 
 2203                                 top[
i - 1] = top[
i];
 
 2209             if (cand_bottom_left || cand_left) {
 
 2210                 a = ((
left[-1]) * 0x01010101U);
 
 2211                 for (
i = 0; 
i < (0) + (size_max_y); 
i += 4)
 
 2213                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2214                                              ((-1) << hshift)) >> 
sps->
 
 2215                                             log2_min_pu_size)) + (((y0 +
 
 2220                                           * min_pu_width]).pred_flag ==
 
 2224                         a = ((
left[
i + 3]) * 0x01010101U);
 
 2227                 vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2231             if (!cand_bottom_left) {
 
 2233                 vec0 = (v16u8) __msa_fill_b(
left[15]);
 
 2237             if (x0 != 0 && y0 != 0) {
 
 2238                 a = ((
left[size_max_y - 1]) * 0x01010101U);
 
 2239                 for (
i = (size_max_y - 1);
 
 2240                      i > (size_max_y - 1) - (size_max_y); 
i -= 4)
 
 2242                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2243                                              ((-1) << hshift)) >> 
sps->
 
 2244                                             log2_min_pu_size)) + (((y0 +
 
 2250                                           * min_pu_width]).pred_flag ==
 
 2254                         a = ((
left[
i - 3]) * 0x01010101U);
 
 2256                     ((
s->cur_frame->tab_mvf[(((x0 +
 
 2257                                          ((-1) << hshift)) >> 
sps->
 
 2258                                         log2_min_pu_size)) + (((y0 + ((-1)
 
 2263                                       * min_pu_width]).pred_flag == 
PF_INTRA))
 
 2265             } 
else if (x0 == 0) {
 
 2267                     uint32_t 
pix = ((0) * 0x01010101U);
 
 2268                     for (
i = 0; 
i < (size_max_y); 
i += 4)
 
 2272                 a = ((
left[size_max_y - 1]) * 0x01010101U);
 
 2273                 for (
i = (size_max_y - 1);
 
 2274                      i > (size_max_y - 1) - (size_max_y); 
i -= 4)
 
 2276                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2277                                              ((-1) << hshift)) >> 
sps->
 
 2278                                             log2_min_pu_size)) + (((y0 +
 
 2284                                           * min_pu_width]).pred_flag ==
 
 2288                         a = ((
left[
i - 3]) * 0x01010101U);
 
 2292                 a = ((
left[-1]) * 0x01010101U);
 
 2293                 for (
i = 0; 
i < (0) + (size_max_x); 
i += 4)
 
 2295                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2296                                              ((
i) << hshift)) >> 
sps->
 
 2297                                             log2_min_pu_size)) + (((y0 + ((-1)
 
 2302                                           * min_pu_width]).pred_flag ==
 
 2306                         a = ((top[
i + 3]) * 0x01010101U);
 
 2311     if (!cand_bottom_left) {
 
 2313             vec0 = (v16u8) __msa_fill_b(
left[15]);
 
 2317         } 
else if (cand_up_left) {
 
 2318             vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2323         } 
else if (cand_up) {
 
 2326             vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2332         } 
else if (cand_up_right) {
 
 2333             vec0 = (v16u8) __msa_fill_b(top[16]);
 
 2346             vec0 = (v16u8) __msa_ldi_b(128);
 
 2348             ST_UB2(vec0, vec0, top, 16);
 
 2354         vec0 = (v16u8) __msa_fill_b(
left[16]);
 
 2357     if (!cand_up_left) {
 
 2361         vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2364     if (!cand_up_right) {
 
 2365         vec0 = (v16u8) __msa_fill_b(top[15]);
 
 2366         ST_UB(vec0, (top + 16));
 
 2372     if (!
sps->intra_smoothing_disabled
 
 2373         && (c_idx == 0 || 
sps->chroma_format_idc == 3)) {
 
 2375             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
 
 2376             int min_dist_vert_hor =
 
 2377                 (((((int) (
mode - 26
U)) >=
 
 2378                    0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))) >
 
 2379                  ((((int) (
mode - 10
U)) >=
 
 2380                    0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
 
 2381                  ? ((((int) (
mode - 10
U)) >=
 
 2382                      0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
 
 2383                  : ((((int) (
mode - 26
U)) >=
 
 2384                      0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))));
 
 2385             if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
 
 2386                 filtered_left[2 * 16 - 1] = 
left[2 * 16 - 1];
 
 2387                 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
 
 2388                 for (
i = 2 * 16 - 2; 
i >= 0; 
i--)
 
 2390                                         left[
i - 1] + 2) >> 2;
 
 2393                     (
left[0] + 2 * 
left[-1] + top[0] + 2) >> 2;
 
 2394                 for (
i = 2 * 16 - 2; 
i >= 0; 
i--)
 
 2395                     filtered_top[
i] = (top[
i + 1] + 2 * top[
i] +
 
 2396                                        top[
i - 1] + 2) >> 2;
 
 2397                 left = filtered_left;
 
 2405         s->hpc.pred_planar[4 - 2] ((uint8_t *) 
src, (uint8_t *) top,
 
 2409         s->hpc.pred_dc((uint8_t *) 
src, (uint8_t *) top,
 
 2413         s->hpc.pred_angular[4 - 2] ((uint8_t *) 
src, (uint8_t *) top,
 
 2422     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 2423     v8i16 res0, res1, res2, res3;
 
 2424     v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
 
 2425     v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
 
 2429     int hshift = 
sps->hshift[c_idx];
 
 2430     int vshift = 
sps->vshift[c_idx];
 
 2431     int size_in_luma_h = 32 << hshift;
 
 2432     int size_in_tbs_h = size_in_luma_h >> 
sps->log2_min_tb_size;
 
 2433     int size_in_luma_v = 32 << vshift;
 
 2434     int size_in_tbs_v = size_in_luma_v >> 
sps->log2_min_tb_size;
 
 2435     int x = x0 >> hshift;
 
 2436     int y = y0 >> vshift;
 
 2437     int x_tb = (x0 >> 
sps->log2_min_tb_size) & 
sps->tb_mask;
 
 2438     int y_tb = (y0 >> 
sps->log2_min_tb_size) & 
sps->tb_mask;
 
 2441         pps->min_tb_addr_zs[(y_tb) * (
sps->tb_mask + 2) + (x_tb)];
 
 2443     ptrdiff_t 
stride = 
s->frame->linesize[c_idx] / 
sizeof(uint8_t);
 
 2444     uint8_t *
src = (uint8_t *) 
s->frame->data[c_idx] + x + y * 
stride;
 
 2446     int min_pu_width = 
sps->min_pu_width;
 
 2451     uint8_t left_array[2 * 32 + 1];
 
 2452     uint8_t filtered_left_array[2 * 32 + 1];
 
 2453     uint8_t top_array[2 * 32 + 1];
 
 2454     uint8_t filtered_top_array[2 * 32 + 1];
 
 2456     uint8_t *
left = left_array + 1;
 
 2457     uint8_t *top = top_array + 1;
 
 2458     uint8_t *filtered_left = filtered_left_array + 1;
 
 2459     uint8_t *filtered_top = filtered_top_array + 1;
 
 2462         pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & 
sps->tb_mask) *
 
 2463                                (
sps->tb_mask + 2) + (x_tb - 1)];
 
 2469         pps->min_tb_addr_zs[(y_tb - 1) * (
sps->tb_mask + 2) +
 
 2470                                ((x_tb + size_in_tbs_h) & 
sps->tb_mask)];
 
 2472     int bottom_left_size =
 
 2473         (((y0 + 2 * size_in_luma_v) >
 
 2474           (
sps->height) ? (
sps->height) : (y0 +
 
 2475                                                  2 * size_in_luma_v)) -
 
 2476          (y0 + size_in_luma_v)) >> vshift;
 
 2477     int top_right_size =
 
 2478         (((x0 + 2 * size_in_luma_h) >
 
 2479           (
sps->width) ? (
sps->width) : (x0 + 2 * size_in_luma_h)) -
 
 2480          (x0 + size_in_luma_h)) >> hshift;
 
 2482     if (
pps->constrained_intra_pred_flag == 1) {
 
 2483         int size_in_luma_pu_v = ((size_in_luma_v) >> 
sps->log2_min_pu_size);
 
 2484         int size_in_luma_pu_h = ((size_in_luma_h) >> 
sps->log2_min_pu_size);
 
 2485         int on_pu_edge_x = !(x0 & ((1 << 
sps->log2_min_pu_size) - 1));
 
 2486         int on_pu_edge_y = !(y0 & ((1 << 
sps->log2_min_pu_size) - 1));
 
 2487         if (!size_in_luma_pu_h)
 
 2488             size_in_luma_pu_h++;
 
 2489         if (cand_bottom_left == 1 && on_pu_edge_x) {
 
 2490             int x_left_pu = ((x0 - 1) >> 
sps->log2_min_pu_size);
 
 2492                 ((y0 + size_in_luma_v) >> 
sps->log2_min_pu_size);
 
 2494                 ((size_in_luma_pu_v) >
 
 2495                  (
sps->min_pu_height -
 
 2496                   y_bottom_pu) ? (
sps->min_pu_height -
 
 2497                                   y_bottom_pu) : (size_in_luma_pu_v));
 
 2498             cand_bottom_left = 0;
 
 2499             for (
i = 0; 
i < 
max; 
i += 2)
 
 2501                     ((
s->cur_frame->tab_mvf[(x_left_pu) +
 
 2503                                        i) * min_pu_width]).pred_flag ==
 
 2506         if (cand_left == 1 && on_pu_edge_x) {
 
 2507             int x_left_pu = ((x0 - 1) >> 
sps->log2_min_pu_size);
 
 2508             int y_left_pu = ((y0) >> 
sps->log2_min_pu_size);
 
 2510                 ((size_in_luma_pu_v) >
 
 2511                  (
sps->min_pu_height -
 
 2512                   y_left_pu) ? (
sps->min_pu_height -
 
 2513                                 y_left_pu) : (size_in_luma_pu_v));
 
 2515             for (
i = 0; 
i < 
max; 
i += 2)
 
 2517                     ((
s->cur_frame->tab_mvf[(x_left_pu) +
 
 2519                                        i) * min_pu_width]).pred_flag ==
 
 2522         if (cand_up_left == 1) {
 
 2523             int x_left_pu = ((x0 - 1) >> 
sps->log2_min_pu_size);
 
 2524             int y_top_pu = ((y0 - 1) >> 
sps->log2_min_pu_size);
 
 2526                 (
s->cur_frame->tab_mvf[(x_left_pu) +
 
 2527                                  (y_top_pu) * min_pu_width]).pred_flag ==
 
 2530         if (cand_up == 1 && on_pu_edge_y) {
 
 2531             int x_top_pu = ((x0) >> 
sps->log2_min_pu_size);
 
 2532             int y_top_pu = ((y0 - 1) >> 
sps->log2_min_pu_size);
 
 2534                 ((size_in_luma_pu_h) >
 
 2535                  (
sps->min_pu_width -
 
 2536                   x_top_pu) ? (
sps->min_pu_width -
 
 2537                                x_top_pu) : (size_in_luma_pu_h));
 
 2539             for (
i = 0; 
i < 
max; 
i += 2)
 
 2541                     ((
s->cur_frame->tab_mvf[(x_top_pu + 
i) +
 
 2543                                       min_pu_width]).pred_flag == 
PF_INTRA);
 
 2545         if (cand_up_right == 1 && on_pu_edge_y) {
 
 2546             int y_top_pu = ((y0 - 1) >> 
sps->log2_min_pu_size);
 
 2548                 ((x0 + size_in_luma_h) >> 
sps->log2_min_pu_size);
 
 2550                 ((size_in_luma_pu_h) >
 
 2551                  (
sps->min_pu_width -
 
 2552                   x_right_pu) ? (
sps->min_pu_width -
 
 2553                                  x_right_pu) : (size_in_luma_pu_h));
 
 2555             for (
i = 0; 
i < 
max; 
i += 2)
 
 2557                     ((
s->cur_frame->tab_mvf[(x_right_pu + 
i) +
 
 2559                                       min_pu_width]).pred_flag == 
PF_INTRA);
 
 2561         vec0 = (v16u8) __msa_ldi_b(128);
 
 2564         ST_UB4(vec0, vec0, vec0, vec0, top, 16);
 
 2574         ST_UB2(vec0, vec1, top, 16);
 
 2577     if (cand_up_right) {
 
 2579         ST_UB2(vec0, vec1, (top + 32), 16);
 
 2582                 ((
src[(32 + top_right_size - 1) + 
stride * (-1)]) *
 
 2584             for (
i = 0; 
i < (32 - top_right_size); 
i += 4)
 
 2590         for (
i = 0; 
i < 32; 
i++)
 
 2592     if (cand_bottom_left) {
 
 2593         for (
i = 32; 
i < 32 + bottom_left_size; 
i++)
 
 2597                 ((
src[(-1) + 
stride * (32 + bottom_left_size - 1)]) *
 
 2599             for (
i = 0; 
i < (32 - bottom_left_size); 
i += 4)
 
 2605     if (
pps->constrained_intra_pred_flag == 1) {
 
 2606         if (cand_bottom_left || cand_left || cand_up_left || cand_up
 
 2609                 x0 + ((2 * 32) << hshift) <
 
 2610                 sps->width ? 2 * 32 : (
sps->width - x0) >> hshift;
 
 2612                 y0 + ((2 * 32) << vshift) <
 
 2613                 sps->height ? 2 * 32 : (
sps->height - y0) >> vshift;
 
 2614             int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
 
 2615             if (!cand_up_right) {
 
 2616                 size_max_x = x0 + ((32) << hshift) < 
sps->width ?
 
 2617                     32 : (
sps->width - x0) >> hshift;
 
 2619             if (!cand_bottom_left) {
 
 2620                 size_max_y = y0 + ((32) << vshift) < 
sps->height ?
 
 2621                     32 : (
sps->height - y0) >> vshift;
 
 2623             if (cand_bottom_left || cand_left || cand_up_left) {
 
 2626                        !((
s->cur_frame->tab_mvf[(((x0 +
 
 2627                                              ((-1) << hshift)) >> 
sps->
 
 2628                                             log2_min_pu_size)) + (((y0 +
 
 2633                                           * min_pu_width]).pred_flag ==
 
 2637                     ((
s->cur_frame->tab_mvf[(((x0 +
 
 2638                                          ((-1) << hshift)) >> 
sps->
 
 2639                                         log2_min_pu_size)) + (((y0 + ((j)
 
 2644                                       * min_pu_width]).pred_flag == 
PF_INTRA)) {
 
 2646                     while (j < size_max_x
 
 2648                            !((
s->cur_frame->tab_mvf[(((x0 +
 
 2649                                                  ((j) << hshift)) >> 
sps->
 
 2650                                                 log2_min_pu_size)) + (((y0 +
 
 2655                                               * min_pu_width]).pred_flag ==
 
 2658                     for (
i = j; 
i > (j) - (j + 1); 
i--)
 
 2660                             ((
s->cur_frame->tab_mvf[(((x0 +
 
 2662                                                    1) << hshift)) >> 
sps->
 
 2663                                                 log2_min_pu_size)) + (((y0 +
 
 2668                                               * min_pu_width]).pred_flag ==
 
 2670                             top[
i - 1] = top[
i];
 
 2675                 while (j < size_max_x
 
 2677                        !((
s->cur_frame->tab_mvf[(((x0 +
 
 2678                                              ((j) << hshift)) >> 
sps->
 
 2679                                             log2_min_pu_size)) + (((y0 + ((-1)
 
 2684                                           * min_pu_width]).pred_flag ==
 
 2689                         for (
i = j; 
i > (j) - (j + 1); 
i--)
 
 2691                                 ((
s->cur_frame->tab_mvf[(((x0 +
 
 2694                                                     sps->log2_min_pu_size))
 
 2698                                                       sps->log2_min_pu_size))
 
 2700                                                   min_pu_width]).pred_flag ==
 
 2702                                 top[
i - 1] = top[
i];
 
 2704                         for (
i = j; 
i > (j) - (j); 
i--)
 
 2706                                 ((
s->cur_frame->tab_mvf[(((x0 +
 
 2709                                                     sps->log2_min_pu_size))
 
 2713                                                       sps->log2_min_pu_size))
 
 2715                                                   min_pu_width]).pred_flag ==
 
 2717                                 top[
i - 1] = top[
i];
 
 2723             if (cand_bottom_left || cand_left) {
 
 2724                 a = ((
left[-1]) * 0x01010101U);
 
 2725                 for (
i = 0; 
i < (0) + (size_max_y); 
i += 4)
 
 2727                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2728                                              ((-1) << hshift)) >> 
sps->
 
 2729                                             log2_min_pu_size)) + (((y0 +
 
 2734                                           * min_pu_width]).pred_flag ==
 
 2738                         a = ((
left[
i + 3]) * 0x01010101U);
 
 2741                 vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2745             if (!cand_bottom_left) {
 
 2746                 vec0 = (v16u8) __msa_fill_b(
left[31]);
 
 2750             if (x0 != 0 && y0 != 0) {
 
 2751                 a = ((
left[size_max_y - 1]) * 0x01010101U);
 
 2752                 for (
i = (size_max_y - 1);
 
 2753                      i > (size_max_y - 1) - (size_max_y); 
i -= 4)
 
 2755                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2756                                              ((-1) << hshift)) >> 
sps->
 
 2757                                             log2_min_pu_size)) + (((y0 +
 
 2763                                           * min_pu_width]).pred_flag ==
 
 2767                         a = ((
left[
i - 3]) * 0x01010101U);
 
 2769                     ((
s->cur_frame->tab_mvf[(((x0 +
 
 2770                                          ((-1) << hshift)) >> 
sps->
 
 2771                                         log2_min_pu_size)) + (((y0 + ((-1)
 
 2776                                       * min_pu_width]).pred_flag == 
PF_INTRA))
 
 2778             } 
else if (x0 == 0) {
 
 2780                     uint32_t 
pix = ((0) * 0x01010101U);
 
 2781                     for (
i = 0; 
i < (size_max_y); 
i += 4)
 
 2785                 a = ((
left[size_max_y - 1]) * 0x01010101U);
 
 2786                 for (
i = (size_max_y - 1);
 
 2787                      i > (size_max_y - 1) - (size_max_y); 
i -= 4)
 
 2789                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2790                                              ((-1) << hshift)) >> 
sps->
 
 2791                                             log2_min_pu_size)) + (((y0 +
 
 2797                                           * min_pu_width]).pred_flag ==
 
 2801                         a = ((
left[
i - 3]) * 0x01010101U);
 
 2805                 a = ((
left[-1]) * 0x01010101U);
 
 2806                 for (
i = 0; 
i < (0) + (size_max_x); 
i += 4)
 
 2808                         ((
s->cur_frame->tab_mvf[(((x0 +
 
 2809                                              ((
i) << hshift)) >> 
sps->
 
 2810                                             log2_min_pu_size)) + (((y0 + ((-1)
 
 2815                                           * min_pu_width]).pred_flag ==
 
 2819                         a = ((top[
i + 3]) * 0x01010101U);
 
 2824     if (!cand_bottom_left) {
 
 2826             vec0 = (v16u8) __msa_fill_b(
left[31]);
 
 2829         } 
else if (cand_up_left) {
 
 2830             vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2835         } 
else if (cand_up) {
 
 2838             vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2844         } 
else if (cand_up_right) {
 
 2845             vec0 = (v16u8) __msa_fill_b(top[32]);
 
 2847             ST_UB2(vec0, vec0, top, 16);
 
 2859             vec0 = (v16u8) __msa_ldi_b(128);
 
 2861             ST_UB4(vec0, vec0, vec0, vec0, top, 16);
 
 2867         vec0 = (v16u8) __msa_fill_b(
left[32]);
 
 2871     if (!cand_up_left) {
 
 2875         vec0 = (v16u8) __msa_fill_b(
left[-1]);
 
 2877         ST_UB2(vec0, vec0, top, 16);
 
 2879     if (!cand_up_right) {
 
 2880         vec0 = (v16u8) __msa_fill_b(top[31]);
 
 2882         ST_UB2(vec0, vec0, (top + 32), 16);
 
 2888     if (!
sps->intra_smoothing_disabled
 
 2889         && (c_idx == 0 || 
sps->chroma_format_idc == 3)) {
 
 2891             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
 
 2892             int min_dist_vert_hor =
 
 2893                 (((((int) (
mode - 26
U)) >=
 
 2894                    0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))) >
 
 2895                  ((((int) (
mode - 10
U)) >=
 
 2896                    0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
 
 2897                  ? ((((int) (
mode - 10
U)) >=
 
 2898                      0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
 
 2899                  : ((((int) (
mode - 26
U)) >=
 
 2900                      0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))));
 
 2901             if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
 
 2902                 int threshold = 1 << (8 - 5);
 
 2903                 if (
sps->strong_intra_smoothing_enabled
 
 2905                     && ((top[-1] + top[63] - 2 * top[31]) >=
 
 2906                         0 ? (top[-1] + top[63] -
 
 2907                              2 * top[31]) : (-(top[-1] + top[63] -
 
 2908                                                2 * top[31]))) < threshold
 
 2912                                                 2 * 
left[31]))) < threshold) {
 
 2915                     filtered_top[-1] = top[-1];
 
 2916                     filtered_top[63] = top[63];
 
 2919                     for (
i = 0; 
i < 63; 
i++) {
 
 2921                             ((63 - 
i) * top[-1] + (
i + 1) * top[63] + 32) >> 6;
 
 2924                     tmp0 = __msa_fill_h(top[-1]);
 
 2925                     tmp1 = __msa_fill_h(top[63]);
 
 2927                     tmp2 = mul_val0 - 8;
 
 2928                     tmp3 = mul_val0 - 16;
 
 2929                     tmp4 = mul_val0 - 24;
 
 2930                     tmp5 = mul_val1 + 8;
 
 2931                     tmp6 = mul_val1 + 16;
 
 2932                     tmp7 = mul_val1 + 24;
 
 2934                     res0 = mul_val0 * tmp0;
 
 2938                     res0 += mul_val1 * tmp1;
 
 2939                     res1 += tmp5 * tmp1;
 
 2940                     res2 += tmp6 * tmp1;
 
 2941                     res3 += tmp7 * tmp1;
 
 2943                     res0 = __msa_srari_h(res0, 6);
 
 2944                     res1 = __msa_srari_h(res1, 6);
 
 2945                     res2 = __msa_srari_h(res2, 6);
 
 2946                     res3 = __msa_srari_h(res3, 6);
 
 2948                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 2949                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 2951                     ST_UB2(vec0, vec1, filtered_top, 16);
 
 2953                     res0 = mul_val0 - 32;
 
 2954                     tmp2 = mul_val0 - 40;
 
 2955                     tmp3 = mul_val0 - 48;
 
 2956                     tmp4 = mul_val0 - 56;
 
 2957                     res3 = mul_val1 + 32;
 
 2958                     tmp5 = mul_val1 + 40;
 
 2959                     tmp6 = mul_val1 + 48;
 
 2960                     tmp7 = mul_val1 + 56;
 
 2965                     res0 += res3 * tmp1;
 
 2967                     res1 += tmp5 * tmp1;
 
 2968                     res2 += tmp6 * tmp1;
 
 2969                     res3 += tmp7 * tmp1;
 
 2971                     res0 = __msa_srari_h(res0, 6);
 
 2972                     res1 = __msa_srari_h(res1, 6);
 
 2973                     res2 = __msa_srari_h(res2, 6);
 
 2974                     res3 = __msa_srari_h(res3, 6);
 
 2976                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 2977                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 2979                     ST_UB2(vec0, vec1, (filtered_top + 32), 16);
 
 2981                     filtered_top[63] = top[63];
 
 2983                     tmp0 = __msa_fill_h(
left[-1]);
 
 2984                     tmp1 = __msa_fill_h(
left[63]);
 
 2986                     tmp2 = mul_val0 - 8;
 
 2987                     tmp3 = mul_val0 - 16;
 
 2988                     tmp4 = mul_val0 - 24;
 
 2989                     tmp5 = mul_val1 + 8;
 
 2990                     tmp6 = mul_val1 + 16;
 
 2991                     tmp7 = mul_val1 + 24;
 
 2993                     res0 = mul_val0 * tmp0;
 
 2997                     res0 += mul_val1 * tmp1;
 
 2998                     res1 += tmp5 * tmp1;
 
 2999                     res2 += tmp6 * tmp1;
 
 3000                     res3 += tmp7 * tmp1;
 
 3002                     res0 = __msa_srari_h(res0, 6);
 
 3003                     res1 = __msa_srari_h(res1, 6);
 
 3004                     res2 = __msa_srari_h(res2, 6);
 
 3005                     res3 = __msa_srari_h(res3, 6);
 
 3007                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 3008                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 3012                     res0 = mul_val0 - 32;
 
 3013                     tmp2 = mul_val0 - 40;
 
 3014                     tmp3 = mul_val0 - 48;
 
 3015                     tmp4 = mul_val0 - 56;
 
 3016                     res3 = mul_val1 + 32;
 
 3017                     tmp5 = mul_val1 + 40;
 
 3018                     tmp6 = mul_val1 + 48;
 
 3019                     tmp7 = mul_val1 + 56;
 
 3024                     res0 += res3 * tmp1;
 
 3026                     res1 += tmp5 * tmp1;
 
 3027                     res2 += tmp6 * tmp1;
 
 3028                     res3 += tmp7 * tmp1;
 
 3030                     res0 = __msa_srari_h(res0, 6);
 
 3031                     res1 = __msa_srari_h(res1, 6);
 
 3032                     res2 = __msa_srari_h(res2, 6);
 
 3033                     res3 = __msa_srari_h(res3, 6);
 
 3035                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 
 3036                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
 
 3044                     filtered_left[2 * 32 - 1] = 
left[2 * 32 - 1];
 
 3045                     filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
 
 3046                     for (
i = 2 * 32 - 2; 
i >= 0; 
i--)
 
 3048                                             left[
i - 1] + 2) >> 2;
 
 3051                         (
left[0] + 2 * 
left[-1] + top[0] + 2) >> 2;
 
 3052                     for (
i = 2 * 32 - 2; 
i >= 0; 
i--)
 
 3053                         filtered_top[
i] = (top[
i + 1] + 2 * top[
i] +
 
 3054                                            top[
i - 1] + 2) >> 2;
 
 3055                     left = filtered_left;
 
 3064         s->hpc.pred_planar[3] ((uint8_t *) 
src, (uint8_t *) top,
 
 3068         s->hpc.pred_dc((uint8_t *) 
src, (uint8_t *) top,
 
 3072         s->hpc.pred_angular[3] ((uint8_t *) 
src, (uint8_t *) top,