              unsigned A1, unsigned A2,
              const void *_r, const void *_g, const void *_b, int y,
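        /* Packed 32-bit output: _r/_g/_b point at per-channel lookup tables,
         * and the three table entries for a given luma index sum to one
         * complete pixel word. */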
 
        uint32_t *dest = (uint32_t *) _dest;
        const uint32_t *r = (const uint32_t *) _r;
        const uint32_t *g = (const uint32_t *) _g;
        const uint32_t *b = (const uint32_t *) _b;

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
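        /* 24-bit RGB/BGR: the r_b/b_r aliases below swap the first and last
         * channel so a single store sequence serves both byte orders. */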
 
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;

#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)

        dest[i * 6 + 0] = r_b[Y1];
        dest[i * 6 + 1] =   g[Y1];
        dest[i * 6 + 2] = b_r[Y1];
        dest[i * 6 + 3] = r_b[Y2];
        dest[i * 6 + 4] =   g[Y2];
        dest[i * 6 + 5] = b_r[Y2];
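        /* 16-bit packed formats: dr/dg/db are per-pixel ordered-dither
         * offsets added to the luma index before the table lookup. */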
 
        uint16_t *dest = (uint16_t *) _dest;
        const uint16_t *r = (const uint16_t *) _r;
        const uint16_t *g = (const uint16_t *) _g;
        const uint16_t *b = (const uint16_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
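        /* 8/4-bit formats: the d32/d64/d128 rows are ordered-dither tables
         * indexed by output position ((i * 2 + k) & 7), chosen per target. */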
 
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

            dr1 = dg1 = d32[(i * 2 + 0) & 7];
            db1 =       d64[(i * 2 + 0) & 7];
            dr2 = dg2 = d32[(i * 2 + 1) & 7];
            db2 =       d64[(i * 2 + 1) & 7];

            dr1 = db1 = d128[(i * 2 + 0) & 7];
            dg1 =        d64[(i * 2 + 0) & 7];
            dr2 = db2 = d128[(i * 2 + 1) & 7];
            dg2 =        d64[(i * 2 + 1) & 7];

            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);

            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
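/* Extract one Y pair plus U/V from scalar lanes of the given vectors, fetch
 * the conversion-table pointers, and emit two pixels via yuv2rgb_write(). */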
 
#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
    Y1 = __lsx_vpickve2gr_w(vec_y1, t1);                                \
    Y2 = __lsx_vpickve2gr_w(vec_y2, t2);                                \
    U  = __lsx_vpickve2gr_w(vec_u, t3);                                 \
    V  = __lsx_vpickve2gr_w(vec_v, t4);                                 \
    r  =  c->table_rV[V];                                               \
    g  = (c->table_gU[U] + c->table_gV[V]);                             \
    b  =  c->table_bU[U];                                               \
    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                            \
                  r, g, b, y, target, 0);                               \
                       const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrUSrc,
                       const int16_t **chrVSrc, int chrFilterSize,
                       const int16_t **alpSrc, uint8_t *dest, int dstW,

    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    __m128i headroom  = __lsx_vreplgr2vr_w(head);
 
    for (i = 0; i < len; i++) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
        __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;

        yl_ev  = __lsx_vldrepl_w(&t, 0);
 
        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
                      48, l_src3, l_src4);
            yl_ev1  = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1  = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yh_ev1  = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
            yh_od1  = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
            yl_ev2  = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2  = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
            yh_ev2  = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
            yh_od2  = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
 
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
                      u_src2, v_src2);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
            u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
            u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
            v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
            v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
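        /* Scale back to pixel range: the filter coefficients are 12-bit and
         * the intermediates carry 7 fractional bits, hence >> 19; adding the
         * headroom bias keeps overshoot inside the extended lookup tables. */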
 
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        yh_od1 = __lsx_vsrai_w(yh_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        yh_od2 = __lsx_vsrai_w(yh_od2, 19);
        u_ev2  = __lsx_vsrai_w(u_ev2, 19);
        v_ev2  = __lsx_vsrai_w(v_ev2, 19);
        u_od2  = __lsx_vsrai_w(u_od2, 19);
        v_od2  = __lsx_vsrai_w(v_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
        u_ev2  = __lsx_vadd_w(u_ev2, headroom);
        v_ev2  = __lsx_vadd_w(v_ev2, headroom);
        u_od2  = __lsx_vadd_w(u_od2, headroom);
        v_od2  = __lsx_vadd_w(v_od2, headroom);
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, u_src1, v_src1;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
        __m128i u_ev1, u_od1, v_ev1, v_od1, temp;

        yl_ev  = __lsx_vldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            yl_ev1  = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1  = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yl_ev2  = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2  = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);

        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);

        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);

        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);

        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
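    /* Scalar tail: handle the pixel pairs the vector loops did not cover. */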
 
    for (; count < len_count; count++) {

        for (j = 0; j < lumFilterSize; j++) {
            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];

        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][count] * chrFilter[j];
            V += chrVSrc[j][count] * chrFilter[j];

                      r, g, b, y, target, 0);
 
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf[2], uint8_t *dest, int dstW,
                       int yalpha, int uvalpha, int y,

    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
    int yalpha1   = 4096 - yalpha;
    int uvalpha1  = 4096 - uvalpha;

    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i headroom   = __lsx_vreplgr2vr_w(head);
    __m128i zero       = __lsx_vldi(0);
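    /* Two-row blend: yalpha/uvalpha are 12-bit weights (yalpha1 == 4096 -
     * yalpha), so the weighted sums below are scaled down by >> 19: 12 weight
     * bits plus the 7 fractional bits of the intermediate samples. */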
 
    for (i = 0; i < len; i += 8) {

        int c_dex = count << 1;
        __m128i y0_h, y0_l, y0, u0, v0;
        __m128i y1_h, y1_l, y1, u1, v1;
        __m128i y_l, y_h, u, v;

        DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                  buf1, i_dex, y0, u0, v0, y1);
        DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
        DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
        DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);

        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        y_h  = __lsx_vsrai_w(y_h, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
 
        __m128i y0_l, y0, u0, v0;
        __m128i y1_l, y1, u1, v1;

        y0   = __lsx_vldx(buf0, i_dex);
        u0   = __lsx_vldrepl_d((ubuf0 + count), 0);
        v0   = __lsx_vldrepl_d((vbuf0 + count), 0);
        y1   = __lsx_vldx(buf1, i_dex);
        u1   = __lsx_vldrepl_d((ubuf1 + count), 0);
        v1   = __lsx_vldrepl_d((vbuf1 + count), 0);

        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
 
    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2]     * yalpha1  +
                  buf1[count * 2]     * yalpha)  >> 19;
        int Y2 = (buf0[count * 2 + 1] * yalpha1  +
                  buf1[count * 2 + 1] * yalpha)  >> 19;
        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;

                      r, g, b, y, target, 0);
 
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];

    int len       = (dstW - 7);
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    if (uvalpha < 2048) {

        __m128i headroom  = __lsx_vreplgr2vr_h(head);
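        /* Single-row case: (x + 64) >> 7 rounds away the 7 fractional bits
         * of the horizontally scaled intermediates. */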
 
        for (i = 0; i < len; i += 8) {

            int c_dex = count << 1;
            __m128i src_y, src_u, src_v;
            __m128i u, v, uv, y_l, y_h;

            src_y = __lsx_vldx(buf0, i_dex);
            DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
            src_y = __lsx_vsrari_h(src_y, 7);
            src_u = __lsx_vsrari_h(src_u, 7);
            src_v = __lsx_vsrari_h(src_v, 7);
            y_l   = __lsx_vsllwil_w_h(src_y, 0);
            y_h   = __lsx_vexth_w_h(src_y);
            uv    = __lsx_vilvl_h(src_v, src_u);
            u     = __lsx_vaddwev_w_h(uv, headroom);
            v     = __lsx_vaddwod_w_h(uv, headroom);
 
            __m128i src_y, src_u, src_v;
            __m128i y_l, u, v, uv;

            src_y  = __lsx_vldx(buf0, i_dex);
            src_u  = __lsx_vldrepl_d((ubuf0 + count), 0);
            src_v  = __lsx_vldrepl_d((vbuf0 + count), 0);
            y_l    = __lsx_vsrari_h(src_y, 7);
            y_l    = __lsx_vsllwil_w_h(y_l, 0);
            uv     = __lsx_vilvl_h(src_v, src_u);
            uv     = __lsx_vsrari_h(uv, 7);
            u      = __lsx_vaddwev_w_h(uv, headroom);
            v      = __lsx_vaddwod_w_h(uv, headroom);
 
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ] + 64) >> 7;
            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
            int U  = (ubuf0[count]        + 64) >> 7;
            int V  = (vbuf0[count]        + 64) >> 7;

                          r, g, b, y, target, 0);
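        /* uvalpha >= 2048: average the two chroma rows; the sum doubles the
         * scale, hence the rounding shift by 8 instead of 7. */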
 
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];

        __m128i headroom    = __lsx_vreplgr2vr_w(HEADROOM);

        for (i = 0; i < len; i += 8) {

            int c_dex = count << 1;
            __m128i src_y, src_u0, src_v0, src_u1, src_v1;
            __m128i y_l, y_h, u1, u2, v1, v2;

            DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
            src_v1 = __lsx_vldx(vbuf1, c_dex);
            src_y  = __lsx_vsrari_h(src_y, 7);
            u1     = __lsx_vaddwev_w_h(src_u0, src_u1);
            v1     = __lsx_vaddwod_w_h(src_u0, src_u1);
            u2     = __lsx_vaddwev_w_h(src_v0, src_v1);
            v2     = __lsx_vaddwod_w_h(src_v0, src_v1);
            y_l    = __lsx_vsllwil_w_h(src_y, 0);
            y_h    = __lsx_vexth_w_h(src_y);
            u1     = __lsx_vsrari_w(u1, 8);
            v1     = __lsx_vsrari_w(v1, 8);
            u2     = __lsx_vsrari_w(u2, 8);
            v2     = __lsx_vsrari_w(v2, 8);
 
            __m128i src_y, src_u0, src_v0, src_u1, src_v1;

            src_y  = __lsx_vldx(buf0, i_dex);
            src_u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
            src_v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
            src_u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
            src_v1 = __lsx_vldrepl_d((vbuf1 + count), 0);

            src_u0 = __lsx_vilvl_h(src_u1, src_u0);
            src_v0 = __lsx_vilvl_h(src_v1, src_v0);
            src_y  = __lsx_vsrari_h(src_y, 7);
            src_y  = __lsx_vsllwil_w_h(src_y, 0);
            uv     = __lsx_vilvl_h(src_v0, src_u0);
            uv     = __lsx_vhaddw_w_h(uv, uv);
            uv     = __lsx_vsrari_w(uv, 8);
 
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ]         +  64) >> 7;
            int Y2 = (buf0[count * 2 + 1]         +  64) >> 7;
            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;

                          r, g, b, y, target, 0);
 
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                               \
static void name ## ext ## _X_lsx(SwsContext *c, const int16_t *lumFilter,            \
                                  const int16_t **lumSrc, int lumFilterSize,          \
                                  const int16_t *chrFilter, const int16_t **chrUSrc,  \
                                  const int16_t **chrVSrc, int chrFilterSize,         \
                                  const int16_t **alpSrc, uint8_t *dest, int dstW,    \
                                  int y)                                              \
{                                                                                     \
    name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize,              \
                                    chrFilter, chrUSrc, chrVSrc, chrFilterSize,       \
                                    alpSrc, dest, dstW, y, fmt, hasAlpha);            \
}

#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                              \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                       \
static void name ## ext ## _2_lsx(SwsContext *c, const int16_t *buf[2],               \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],     \
                                  const int16_t *abuf[2], uint8_t *dest, int dstW,    \
                                  int yalpha, int uvalpha, int y)                     \
{                                                                                     \
    name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest,                   \
                                    dstW, yalpha, uvalpha, y, fmt, hasAlpha);         \
}

#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                \
YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                      \
static void name ## ext ## _1_lsx(SwsContext *c, const int16_t *buf0,                 \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],     \
                                  const int16_t *abuf0, uint8_t *dest, int dstW,      \
                                  int uvalpha, int y)                                 \
{                                                                                     \
    name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest,                 \
                                    dstW, uvalpha, y, fmt, hasAlpha);                 \
}
#if CONFIG_SWSCALE_ALPHA

    uint8_t *dest, int i, int R, int A, int G, int B,

    if ((R | G | B) & 0xC0000000) {
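        /* A channel that over- or underflows its range sets one of the two
         * top bits, so this single test routes all three channels to the
         * clipping path. */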
 
        dest[0] = hasAlpha ? A : 255;

        dest[3] = hasAlpha ? A : 255;

        dest[0] = hasAlpha ? A : 255;

        dest[3] = hasAlpha ? A : 255;
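            /* Floyd-Steinberg-style error diffusion: 7/16 from the previous
             * pixel plus 1/16, 5/16 and 3/16 from the previous row. */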
 
            R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
            G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
            B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
            c->dither_error[0][i] = err[0];
            c->dither_error[1][i] = err[1];
            c->dither_error[2][i] = err[2];
            r = R >> (isrgb8 ? 5 : 7);
            g = G >> (isrgb8 ? 5 : 6);
            b = B >> (isrgb8 ? 6 : 7);

            err[0] = R - r*(isrgb8 ? 36 : 255);
            err[1] = G - g*(isrgb8 ? 36 : 85);
            err[2] = B - b*(isrgb8 ? 85 : 255);
 
#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))
#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
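/* Both are cheap coordinate hashes used as ordered-dither thresholds; for
 * example, A_DITHER(1,2) = ((1 + 2*236) * 119) & 0xff = 56287 & 0xff = 223. */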
            dest[0] = r + 2*g + 8*b;

            dest[0] = b + 2*g + 8*r;

            dest[0] = r + 8*g + 64*b;

            dest[0] = b + 4*g + 32*r;
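/* Full-range path: YUV -> RGB is computed per lane with the coefficients from
 * c->yuv2rgb_*, then written out with clipping and dithering per pixel. */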
 
#define YUVTORGB_SETUP_LSX                                   \
    int y_offset   = c->yuv2rgb_y_offset;                    \
    int y_coeff    = c->yuv2rgb_y_coeff;                     \
    int v2r_coe    = c->yuv2rgb_v2r_coeff;                   \
    int v2g_coe    = c->yuv2rgb_v2g_coeff;                   \
    int u2g_coe    = c->yuv2rgb_u2g_coeff;                   \
    int u2b_coe    = c->yuv2rgb_u2b_coeff;                   \
    __m128i offset = __lsx_vreplgr2vr_w(y_offset);           \
    __m128i coeff  = __lsx_vreplgr2vr_w(y_coeff);            \
    __m128i v2r    = __lsx_vreplgr2vr_w(v2r_coe);            \
    __m128i v2g    = __lsx_vreplgr2vr_w(v2g_coe);            \
    __m128i u2g    = __lsx_vreplgr2vr_w(u2g_coe);            \
    __m128i u2b    = __lsx_vreplgr2vr_w(u2b_coe);            \

#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff,        \
                     y_temp, v2r, v2g, u2g, u2b)             \
     y = __lsx_vsub_w(y, offset);                            \
     y = __lsx_vmul_w(y, coeff);                             \
     y = __lsx_vadd_w(y, y_temp);                            \
     R = __lsx_vmadd_w(y, v, v2r);                           \
     v = __lsx_vmadd_w(y, v, v2g);                           \
     G = __lsx_vmadd_w(v, u, u2g);                           \
     B = __lsx_vmadd_w(y, u, u2b);                           \

#define WRITE_FULL_A_LSX(r, g, b, a, t1, s)                                  \
    R = __lsx_vpickve2gr_w(r, t1);                                           \
    G = __lsx_vpickve2gr_w(g, t1);                                           \
    B = __lsx_vpickve2gr_w(b, t1);                                           \
    A = __lsx_vpickve2gr_w(a, t1);                                           \
    if (A & 0x100)                                                           \
        A = av_clip_uint8(A);                                                \
    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\

#define WRITE_FULL_LSX(r, g, b, t1, s)                                        \
    R = __lsx_vpickve2gr_w(r, t1);                                            \
    G = __lsx_vpickve2gr_w(g, t1);                                            \
    B = __lsx_vpickve2gr_w(b, t1);                                            \
    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
                            const int16_t **lumSrc, int lumFilterSize,
                            const int16_t *chrFilter, const int16_t **chrUSrc,
                            const int16_t **chrVSrc, int chrFilterSize,
                            const int16_t **alpSrc, uint8_t *dest,

    int i, j, B, G, R, A;

    int a_temp     = 1 << 18;

    int tempc      = templ - (128 << 19);

    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
 
    for (i = 0; i < len; i += 8) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
        __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;

        y_ev = y_od = __lsx_vreplgr2vr_w(templ);
        u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
            y_od  = __lsx_vmaddwod_w_h(y_od, l_src, temp);

        for (j = 0; j < chrFilterSize; j++) {
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
                      u_src, v_src);
            DUP2_ARG3(__lsx_vmaddwev_w_h, u_ev, u_src, temp, v_ev,
                      v_src, temp, u_ev, v_ev);
            DUP2_ARG3(__lsx_vmaddwod_w_h, u_od, u_src, temp, v_od,
                      v_src, temp, u_od, v_od);

        y_ev = __lsx_vsrai_w(y_ev, 10);
        y_od = __lsx_vsrai_w(y_od, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        u_od = __lsx_vsrai_w(u_od, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
        v_od = __lsx_vsrai_w(v_od, 10);

        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
        YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);

            __m128i a_src, a_ev, a_od;

            a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
                a_od  = __lsx_vmaddwod_w_h(a_od, a_src, temp);

            a_ev = __lsx_vsrai_w(a_ev, 19);
            a_od = __lsx_vsrai_w(a_od, 19);
 
    if (dstW - i >= 4) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, u_ev, v_ev, uv, temp;
        __m128i R_ev, G_ev, B_ev;

        y_ev = __lsx_vreplgr2vr_w(templ);
        u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            l_src = __lsx_vilvl_h(l_src, l_src);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);

        for (j = 0; j < chrFilterSize; j++) {
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
            uv    = __lsx_vilvl_h(v_src, u_src);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, uv, temp);
            v_ev  = __lsx_vmaddwod_w_h(v_ev, uv, temp);

        y_ev = __lsx_vsrai_w(y_ev, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);

        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);

            __m128i a_src, a_ev;

            a_ev = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);

            a_ev = __lsx_vsrai_w(a_ev, 19);
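    /* Scalar tail: the remaining pixels take the plain C path, and the
     * running dither errors are spilled back to c->dither_error below. */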
 
    for (; i < dstW; i++) {

        int V, U = V = tempc;

        for (j = 0; j < lumFilterSize; j++) {
            Y += lumSrc[j][i] * lumFilter[j];

        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][i] * chrFilter[j];
            V += chrVSrc[j][i] * chrFilter[j];

            for (j = 0; j < lumFilterSize; j++) {
                A += alpSrc[j][i] * lumFilter[j];

        R  = (unsigned)Y + V * v2r_coe;
        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B  = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
 
                            const int16_t *ubuf[2], const int16_t *vbuf[2],
                            const int16_t *abuf[2], uint8_t *dest, int dstW,
                            int yalpha, int uvalpha, int y,

    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
                  *abuf0 = hasAlpha ? abuf[0] : NULL,
                  *abuf1 = hasAlpha ? abuf[1] : NULL;
    int yalpha1  = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;
    int uvtemp   = 128 << 19;
    int atemp    = 1 << 18;

    int ytemp    = 1 << 21;

    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i uv         = __lsx_vreplgr2vr_w(uvtemp);
    __m128i a_bias     = __lsx_vreplgr2vr_w(atemp);
    __m128i y_temp     = __lsx_vreplgr2vr_w(ytemp);
 
    for (i = 0; i < len; i += 8) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
        __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
        __m128i y_l, y_h, v_l, v_h, u_l, u_h;
        __m128i R_l, R_h, G_l, G_h, B_l, B_h;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
                  n, ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);

        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);

        DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
                  u0_h, u1_h, v0_h, v1_h);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        u_h  = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        v_h  = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
        u_l  = __lsx_vsub_w(u_l, uv);
        u_h  = __lsx_vsub_w(u_h, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        v_h  = __lsx_vsub_w(v_h, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        y_h  = __lsx_vsrai_w(y_h, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        u_h  = __lsx_vsrai_w(u_h, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);
        v_h  = __lsx_vsrai_w(v_h, 10);

        YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
        YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);

            __m128i a0, a1, a0_l, a0_h;
            __m128i a_l, a_h, a1_l, a1_h;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
            a_l = __lsx_vsrai_w(a_l, 19);
            a_h = __lsx_vsrai_w(a_h, 19);
 
    if (dstW - i >= 4) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y1_l, u0_l;
        __m128i v0_l, u1_l, v1_l;
        __m128i y_l, u_l, v_l;
        __m128i R_l, G_l, B_l;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
                  ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);

        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        u_l  = __lsx_vsub_w(u_l, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);

        YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);

            __m128i a0, a1, a0_l;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_l = __lsx_vsrai_w(a_l, 19);
 
    for (; i < dstW; i++) {
        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha          ) >> 10;
        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;

            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;

        R  = (unsigned)Y + V * v2r_coe;
        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B  = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
 
                            const int16_t *ubuf[2], const int16_t *vbuf[2],
                            const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];

    int ytemp      = 1 << 21;

    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);

    if (uvalpha < 2048) {
        int uvtemp   = 128 << 7;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i bias = __lsx_vreplgr2vr_w(bias_int);
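        /* uvalpha < 2048: the first chroma row dominates, so only
         * ubuf0/vbuf0 are read; otherwise both rows are summed below. */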
 
        for (i = 0; i < len; i += 8) {
            __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
            __m128i y_l, y_h, u_l, u_h, v_l, v_h;
            __m128i R_l, R_h, G_l, G_h, B_l, B_h;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 2);
            y_h = __lsx_vexth_w_h(b);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);

            y_h = __lsx_vslli_w(y_h, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            u_h = __lsx_vsub_w(ub_h, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            v_h = __lsx_vsub_w(vb_h, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            u_h = __lsx_vslli_w(u_h, 2);
            v_l = __lsx_vslli_w(v_l, 2);
            v_h = __lsx_vslli_w(v_h, 2);

            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);
            YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_l   = __lsx_vsllwil_w_h(a_src, 0);
                a_h   = __lsx_vexth_w_h(a_src);
                a_l   = __lsx_vadd_w(a_l, bias);
                a_h   = __lsx_vadd_w(a_h, bias);
                a_l   = __lsx_vsrai_w(a_l, 7);
                a_h   = __lsx_vsrai_w(a_h, 7);
 
        if (dstW - i >= 4) {
            __m128i b, ub, vb, ub_l, vb_l;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            y_l = __lsx_vslli_w(y_l, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            v_l = __lsx_vslli_w(v_l, 2);

            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vldx(abuf0, n);
                a_src = __lsx_vsllwil_w_h(a_src, 0);
                a_l   = __lsx_vadd_w(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
 
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] - uvtemp) << 2;
            int V = (vbuf0[i] - uvtemp) << 2;

                A = (abuf0[i] + 64) >> 7;

            R  = (unsigned)Y + V * v2r_coe;
            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B  = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
 
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        int uvtemp   = 128 << 8;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i zero = __lsx_vldi(0);
        __m128i bias = __lsx_vreplgr2vr_h(bias_int);

        for (i = 0; i < len; i += 8) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
            __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1  = __lsx_vldx(vbuf1, n);
            y_ev = __lsx_vaddwev_w_h(b, zero);
            y_od = __lsx_vaddwod_w_h(b, zero);
            DUP2_ARG2(__lsx_vaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
            DUP2_ARG2(__lsx_vaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
            DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
            DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
                      u_ev, u_od, v_ev, v_od);
            DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
                      u_ev, u_od, v_ev, v_od);

            YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);
            YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_ev  = __lsx_vaddwev_w_h(bias, a_src);
                a_od  = __lsx_vaddwod_w_h(bias, a_src);
                a_ev  = __lsx_vsrai_w(a_ev, 7);
                a_od  = __lsx_vsrai_w(a_od, 7);
 
        if (dstW - i >= 4) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1 = __lsx_vldx(vbuf1, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            y_l = __lsx_vslli_w(y_l, 2);
            DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
                      ub0, vb0, ub1, vb1);
            DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
            u_l = __lsx_vsub_w(u_l, uv);
            v_l = __lsx_vsub_w(v_l, uv);
            u_l = __lsx_vslli_w(u_l, 1);
            v_l = __lsx_vslli_w(v_l, 1);

            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_l   = __lsx_vaddwev_w_h(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
 
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;

                A = (abuf0[i] + 64) >> 7;

            R  = (unsigned)Y + V * v2r_coe;
            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B  = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
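    /* Select the LSX writers for the negotiated destination format: each
     * entry installs the _1 (single row), _2 (two-row blend) and _X
     * (N-tap filter) variants. */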
 
               CONFIG_SWSCALE_ALPHA && c->needAlpha)

               CONFIG_SWSCALE_ALPHA && c->needAlpha)

               CONFIG_SWSCALE_ALPHA && c->needAlpha)

               CONFIG_SWSCALE_ALPHA && c->needAlpha)

#if CONFIG_SWSCALE_ALPHA
        switch (c->dstFormat) {

            c->yuv2packedX = yuv2rgba32_full_X_lsx;
            c->yuv2packed2 = yuv2rgba32_full_2_lsx;
            c->yuv2packed1 = yuv2rgba32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2rgba32_full_X_lsx;
                c->yuv2packed2 = yuv2rgba32_full_2_lsx;
                c->yuv2packed1 = yuv2rgba32_full_1_lsx;

                c->yuv2packedX = yuv2rgbx32_full_X_lsx;
                c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
                c->yuv2packed1 = yuv2rgbx32_full_1_lsx;

            c->yuv2packedX = yuv2argb32_full_X_lsx;
            c->yuv2packed2 = yuv2argb32_full_2_lsx;
            c->yuv2packed1 = yuv2argb32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2argb32_full_X_lsx;
                c->yuv2packed2 = yuv2argb32_full_2_lsx;
                c->yuv2packed1 = yuv2argb32_full_1_lsx;

                c->yuv2packedX = yuv2xrgb32_full_X_lsx;
                c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
                c->yuv2packed1 = yuv2xrgb32_full_1_lsx;

            c->yuv2packedX = yuv2bgra32_full_X_lsx;
            c->yuv2packed2 = yuv2bgra32_full_2_lsx;
            c->yuv2packed1 = yuv2bgra32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2bgra32_full_X_lsx;
                c->yuv2packed2 = yuv2bgra32_full_2_lsx;
                c->yuv2packed1 = yuv2bgra32_full_1_lsx;

                c->yuv2packedX = yuv2bgrx32_full_X_lsx;
                c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
                c->yuv2packed1 = yuv2bgrx32_full_1_lsx;

            c->yuv2packedX = yuv2abgr32_full_X_lsx;
            c->yuv2packed2 = yuv2abgr32_full_2_lsx;
            c->yuv2packed1 = yuv2abgr32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2abgr32_full_X_lsx;
                c->yuv2packed2 = yuv2abgr32_full_2_lsx;
                c->yuv2packed1 = yuv2abgr32_full_1_lsx;

                c->yuv2packedX = yuv2xbgr32_full_X_lsx;
                c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
                c->yuv2packed1 = yuv2xbgr32_full_1_lsx;

            c->yuv2packedX = yuv2rgb24_full_X_lsx;
            c->yuv2packed2 = yuv2rgb24_full_2_lsx;
            c->yuv2packed1 = yuv2rgb24_full_1_lsx;

            c->yuv2packedX = yuv2bgr24_full_X_lsx;
            c->yuv2packed2 = yuv2bgr24_full_2_lsx;
            c->yuv2packed1 = yuv2bgr24_full_1_lsx;

            c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;

            c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;

            c->yuv2packedX = yuv2bgr8_full_X_lsx;
            c->yuv2packed2 = yuv2bgr8_full_2_lsx;
            c->yuv2packed1 = yuv2bgr8_full_1_lsx;

            c->yuv2packedX = yuv2rgb8_full_X_lsx;
            c->yuv2packed2 = yuv2rgb8_full_2_lsx;
            c->yuv2packed1 = yuv2rgb8_full_1_lsx;
 
        switch (c->dstFormat) {

#if CONFIG_SWSCALE_ALPHA

                c->yuv2packed1 = yuv2rgbx32_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_2_lsx;
                c->yuv2packedX = yuv2rgbx32_X_lsx;

#if CONFIG_SWSCALE_ALPHA

                c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
                c->yuv2packedX = yuv2rgbx32_1_X_lsx;

            c->yuv2packed1 = yuv2rgb24_1_lsx;
            c->yuv2packed2 = yuv2rgb24_2_lsx;
            c->yuv2packedX = yuv2rgb24_X_lsx;

            c->yuv2packed1 = yuv2bgr24_1_lsx;
            c->yuv2packed2 = yuv2bgr24_2_lsx;
            c->yuv2packedX = yuv2bgr24_X_lsx;

            c->yuv2packed1 = yuv2rgb16_1_lsx;
            c->yuv2packed2 = yuv2rgb16_2_lsx;
            c->yuv2packedX = yuv2rgb16_X_lsx;

            c->yuv2packed1 = yuv2rgb15_1_lsx;
            c->yuv2packed2 = yuv2rgb15_2_lsx;
            c->yuv2packedX = yuv2rgb15_X_lsx;

            c->yuv2packed1 = yuv2rgb12_1_lsx;
            c->yuv2packed2 = yuv2rgb12_2_lsx;
            c->yuv2packedX = yuv2rgb12_X_lsx;

            c->yuv2packed1 = yuv2rgb8_1_lsx;
            c->yuv2packed2 = yuv2rgb8_2_lsx;
            c->yuv2packedX = yuv2rgb8_X_lsx;

            c->yuv2packed1 = yuv2rgb4_1_lsx;
            c->yuv2packed2 = yuv2rgb4_2_lsx;
            c->yuv2packedX = yuv2rgb4_X_lsx;

            c->yuv2packed1 = yuv2rgb4b_1_lsx;
            c->yuv2packed2 = yuv2rgb4b_2_lsx;
            c->yuv2packedX = yuv2rgb4b_X_lsx;