static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
                                    int16_t qadd, int8_t n_coeffs,
                                    uint8_t loop_start)
{
    int16_t cnt;
    int16_t *block_dup = block;
    v8i16 block_vec, qmul_vec, qadd_vec, sub;
    v8i16 add, mask, mul, zero_mask;

    qmul_vec = __msa_fill_h(qmul);
    qadd_vec = __msa_fill_h(qadd);
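    /* Eight coefficients per iteration: each non-zero level becomes
     * level * qmul + qadd (level * qmul - qadd for negative levels);
     * zero coefficients are left untouched. */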
 
    for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
        block_vec = LD_SH(block_dup + loop_start);
        mask = __msa_clti_s_h(block_vec, 0);       /* lanes where level < 0  */
        zero_mask = __msa_ceqi_h(block_vec, 0);    /* lanes where level == 0 */
        mul = block_vec * qmul_vec;
        sub = mul - qadd_vec;
        add = mul + qadd_vec;
        add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
        block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
                                         (v16u8) zero_mask);
        ST_SH(block_vec, block_dup + loop_start);
        block_dup += 8;
    }
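    /* The vector loop covers (n_coeffs >> 3) * 8 coefficients starting at
     * loop_start; the leftover coefficients are handled with scalar code. */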
 
    cnt = ((n_coeffs >> 3) * 8) + loop_start;
 
    for (; cnt <= n_coeffs; cnt++) {
        if (block[cnt]) {
            if (block[cnt] < 0)
                block[cnt] = block[cnt] * qmul - qadd;
            else
                block[cnt] = block[cnt] * qmul + qadd;
        }
    }
}
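/* MPEG-2 inter dequantizer: rewrites the block in place and returns a running
 * sum over the output coefficients, which the caller uses for the mismatch
 * control on block[63]. */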
 
static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block, int32_t qscale,
                                              const int16_t *quant_matrix)
{
    int32_t cnt, sum_res = -1;
    v8i16 block_vec, block_neg, qscale_vec, mask;
    v8i16 block_org0, block_org1, block_org2, block_org3;
    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
    v8i16 sum, mul, zero_mask;
    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
    v4i32 block_l, block_r, sad;

    qscale_vec = __msa_fill_h(qscale);
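    /* Two iterations of four rows of eight coefficients cover the block.
     * Per row: take |level|, compute (2 * |level| + 1) * qscale *
     * quant_matrix[i] in 32-bit lanes, shift right by 4, restore the sign
     * and keep zero coefficients at zero. */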
 
    for (cnt = 0; cnt < 2; cnt++) {
        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
        mask = __msa_clti_s_h(block_org0, 0);
        zero_mask = __msa_ceqi_h(block_org0, 0);
        block_neg = -block_org0;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
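        /* The remaining three rows repeat the same steps with block_org1/2/3
         * and quant_m1/quant_m2/quant_m3. */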
 
        mask = __msa_clti_s_h(block_org1, 0);
        zero_mask = __msa_ceqi_h(block_org1, 0);
        block_neg = -block_org1;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
 
        mask = __msa_clti_s_h(block_org2, 0);
        zero_mask = __msa_ceqi_h(block_org2, 0);
        block_neg = -block_org2;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
 
        mask = __msa_clti_s_h(block_org3, 0);
        zero_mask = __msa_ceqi_h(block_org3, 0);
        block_neg = -block_org3;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
    }

    return sum_res;
}
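/* Entry points.  The H.263 wrappers derive qmul/qadd from qscale (qadd is
 * forced odd via (qscale - 1) | 1) and the coefficient count from the scan
 * tables, then call the MSA helpers above. */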
 
void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block,
                                      int32_t index, int32_t qscale)
{
    int32_t qmul, qadd, nCoeffs;

    qmul = qscale << 1;
    /* ... */
    qadd = (qscale - 1) | 1;
    /* ... */
    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
}
 
void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block,
                                      int32_t index, int32_t qscale)
{
    int32_t qmul, qadd, nCoeffs;

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;
    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
}
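/* MPEG-2 mismatch control: the parity of the sum of all dequantized
 * coefficients decides whether the LSB of the last coefficient, block[63],
 * gets toggled. */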
 
void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block,
                                       int32_t index, int32_t qscale)
{
    const uint16_t *quant_matrix;
    int32_t sum;

    quant_matrix = s->inter_matrix;
    sum = mpeg2_dct_unquantize_inter_msa(block, qscale,
                                         (const int16_t *) quant_matrix);

    block[63] ^= sum & 1;
}