#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>

#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

/* Load one vector of byte/halfword/word elements from memory. */
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

/* Store one vector of byte/halfword/word elements to memory. */
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
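/* Usage sketch (not from the original header): copy two 16-byte rows.
 * The function name, 'src', 'dst' and 'stride' are hypothetical.
 *
 *     static void copy_16x2_msa(const uint8_t *src, uint8_t *dst,
 *                               int32_t stride)
 *     {
 *         v16u8 row0 = LD_UB(src);
 *         v16u8 row1 = LD_UB(src + stride);
 *
 *         ST_UB(row0, dst);
 *         ST_UB(row1, dst + stride);
 *     }
 */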
#if (__mips_isa_rev >= 6)
    #define LW(psrc)                           \
    ( {                                        \
        uint8_t *psrc_m = (uint8_t *) (psrc);  \
        uint32_t val_m;                        \
                                               \
        __asm__ volatile (                     \
            "lw %[val_m], %[psrc_m] \n\t"      \
                                               \
            : [val_m] "=r" (val_m)             \
            : [psrc_m] "m" (*psrc_m)           \
        );                                     \
                                               \
        val_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                           \
        ( {                                        \
            uint8_t *psrc_m = (uint8_t *) (psrc);  \
            uint64_t val_m = 0;                    \
                                                   \
            __asm__ volatile (                     \
                "ld %[val_m], %[psrc_m] \n\t"      \
                                                   \
                : [val_m] "=r" (val_m)             \
                : [psrc_m] "m" (*psrc_m)           \
            );                                     \
                                                   \
            val_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_ld_m);                                   \
            val1_m = LW(psrc_ld_m + 4);                               \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)
    #define SH(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint16_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "sh %[val_m], %[pdst_m] \n\t"      \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SW(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint32_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "sw %[val_m], %[pdst_m] \n\t"      \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SD(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint64_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "sd %[val_m], %[pdst_m] \n\t"      \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }
#else  // !(__mips_isa_rev >= 6)
    #define LW(psrc)                           \
    ( {                                        \
        uint8_t *psrc_m = (uint8_t *) (psrc);  \
        uint32_t val_m;                        \
                                               \
        __asm__ volatile (                     \
            "ulw %[val_m], %[psrc_m] \n\t"     \
                                               \
            : [val_m] "=r" (val_m)             \
            : [psrc_m] "m" (*psrc_m)           \
        );                                     \
                                               \
        val_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                           \
        ( {                                        \
            uint8_t *psrc_m = (uint8_t *) (psrc);  \
            uint64_t val_m = 0;                    \
                                                   \
            __asm__ volatile (                     \
                "uld %[val_m], %[psrc_m] \n\t"     \
                                                   \
                : [val_m] "=r" (val_m)             \
                : [psrc_m] "m" (*psrc_m)           \
            );                                     \
                                                   \
            val_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_ld_m);                                   \
            val1_m = LW(psrc_ld_m + 4);                               \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)
    #define SH(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint16_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "ush %[val_m], %[pdst_m] \n\t"     \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SW(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint32_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "usw %[val_m], %[pdst_m] \n\t"     \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SD(val, pdst)                                          \
    {                                                              \
        uint8_t *pdst_m1 = (uint8_t *) (pdst);                     \
        uint32_t val0_m, val1_m;                                   \
                                                                   \
        val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                   \
        SW(val0_m, pdst_m1);                                       \
        SW(val1_m, pdst_m1 + 4);                                   \
    }
#endif  // (__mips_isa_rev >= 6)
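/* Usage sketch (not from the original header): LW/LD load and SH/SW/SD
 * store scalars through possibly unaligned pointers, so callers need not
 * prove alignment. The function name is hypothetical.
 *
 *     static void copy_block8(const uint8_t *src, uint8_t *dst)
 *     {
 *         uint64_t val = LD(src);   // unaligned-safe 64-bit load
 *         SD(val, dst);             // unaligned-safe 64-bit store
 *     }
 */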
/* Load 4 words with stride between them. */
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* Load 2 or 4 doublewords with stride between them. */
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}

/* Store 4 words with stride between them. */
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Store 4 doublewords with stride between them. */
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
/* Load 2..8 vectors of 16 byte elements with stride between rows. */
#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_B2(RTYPE, (psrc), stride, out0, out1);               \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
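/* Usage sketch (not from the original header): fetch an entire 16x8 pixel
 * block into registers in one statement. 'src' and 'stride' are
 * hypothetical.
 *
 *     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
 *     LD_UB8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
 */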
/* Load 2..16 vectors of 8 halfword elements with stride between rows. */
#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_H(RTYPE, (psrc));                 \
    out1 = LD_H(RTYPE, (psrc) + (stride));      \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_H8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_H8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)

/* Load a 4x4 halfword block: each 8-element load covers two rows; the odd
   rows are extracted from the upper halves of those loads. */
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
{                                                             \
    out0 = LD_SH(psrc);                                       \
    out2 = LD_SH(psrc + 8);                                   \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
}

/* Load 2 vectors of 4 signed words with stride between them. */
#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + stride);        \
}
/* Store 2..8 vectors of 16 byte elements with stride between rows. */
#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride)                                         \
{                                                                   \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

/* Store 2..8 vectors of 8 halfword elements with stride between rows. */
#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_H(RTYPE, in0, (pdst));                 \
    ST_H(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_H2(RTYPE, in0, in1, (pdst), stride);               \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)

/* Store 2 or 8 vectors of 4 signed words with stride between rows. */
#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + stride);        \
}
#define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7,  \
               pdst, stride)                            \
{                                                       \
    ST_SW2(in0, in1, (pdst), stride);                   \
    ST_SW2(in2, in3, (pdst) + 2 * stride, stride);      \
    ST_SW2(in4, in5, (pdst) + 4 * stride, stride);      \
    ST_SW2(in6, in7, (pdst) + 6 * stride, stride);      \
}
/* Store 2x4 bytes: copy four halfwords out of 'in' starting at element
   index 'stidx' and store one per row. */
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}

/* Store 4x2 bytes: word elements 0 and 1 of 'in' go to two rows. */
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}

/* Store 4x4 / 4x8 bytes: the selected word elements of the inputs go to
   consecutive rows. */
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}
#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
/* Store 6 bytes per row for 4 rows: a word (bytes 0..3) plus a halfword
   (bytes 4..5) taken from alternating elements of 'in0' and 'in1'. */
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}
/* Store 8x1, 8x2, 8x4, 8x8 and 12x4 byte blocks built from doubleword
   (and word) copies of the input vectors. */
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}

#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}

#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}
#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    /* left 8x4 */                                            \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    /* right 4x4 */                                           \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
/* Store a 12x8 block: an 8-byte doubleword plus a 4-byte word per row. */
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m = __msa_copy_u_w((v4i32) in0, 2);                             \
    out9_m = __msa_copy_u_w((v4i32) in1, 2);                             \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
/* Rounding average of unsigned byte vectors on consecutive input pairs:
   out = (in_a + in_b + 1) >> 1, element-wise. */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
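/* Usage sketch (not from the original header): rounding-average two
 * four-row blocks, as in bi-directional prediction. 'pred0', 'pred1' and
 * 'stride' are hypothetical.
 *
 *     v16u8 a0, a1, a2, a3, b0, b1, b2, b3;
 *     v16u8 avg0, avg1, avg2, avg3;
 *
 *     LD_UB4(pred0, stride, a0, a1, a2, a3);
 *     LD_UB4(pred1, stride, b0, b1, b2, b3);
 *     AVER_UB4_UB(a0, b0, a1, b1, a2, b2, a3, b3,
 *                 avg0, avg1, avg2, avg3);
 */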
/* Slide the bytes of each input down by 'slide_val' positions; the freed
   top bytes are filled from the first operand (a zero vector in the _0
   variants). */
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
{                                                       \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)

/* Same, but sliding in bytes from a second set of source vectors instead
   of zeros. */
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
                out0, out1, out2, slide_val)                               \
{                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);     \
    out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
/* Byte/halfword/word shuffle: each mask element selects one element from
   the concatenation of the two source vectors. */
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Dot product: products of adjacent even/odd element pairs are summed into
   double-width result elements. */
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
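/* Usage sketch (not from the original header): one dot-product step of a
 * halfword filter, producing word-wide sums that are later round-shifted.
 * Names are hypothetical.
 *
 *     v8i16 vec0, vec1, coeff0, coeff1;
 *     v4i32 sum0, sum1;
 *
 *     DOTP_SH2_SW(vec0, vec1, coeff0, coeff1, sum0, sum1);
 */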
/* Dot product and accumulate: like DOTP, but the products are added to the
   existing contents of the output vectors. */
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
/* Element-wise unsigned minimum with a vector of minimum values, in place. */
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
/* Clip signed halfword elements to [min, max]; returns the result. */
#define CLIP_SH(in, min, max)                           \
( {                                                     \
    v8i16 out_m;                                        \
                                                        \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
    out_m;                                              \
} )

/* Clip signed halfword elements to the pixel range [0, 255]. */
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}

/* Clip signed word elements to the pixel range [0, 255]. */
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )
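/* Usage sketch (not from the original header): clamp filtered results to
 * the 8-bit pixel range before packing back to bytes. Names hypothetical.
 *
 *     v8i16 res0, res1;
 *     CLIP_SH2_0_255(res0, res1);
 */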
/* Horizontal reduction: add all word elements of 'in' into one int32. */
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m = res0_m + res1_m;                         \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )

/* Horizontal reduction: add all halfword elements of 'in' into one uint32. */
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m = res0_m + res1_m;                            \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
/* Horizontal add of adjacent odd/even element pairs into double-width
   elements. */
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
/* Horizontal subtract: odd elements minus even elements, widened into
   double-width results. */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
/* Sum of absolute differences of two pairs of byte vectors, accumulated as
   unsigned halfwords; returns the accumulator. */
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
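/* Usage sketch (not from the original header): accumulate a SAD over two
 * row pairs, then reduce it to a scalar with HADD_UH_U32 (defined above).
 * Names hypothetical.
 *
 *     v16u8 s0, s1, r0, r1;
 *     v8u16 sad;
 *     uint32_t cost;
 *
 *     sad = SAD_UB2_UH(s0, s1, r0, r1);
 *     cost = HADD_UH_U32(sad);
 */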
/* Insert general-purpose register values into word/doubleword elements of
   a vector. */
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
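/* Usage sketch (not from the original header): gather four unaligned words
 * into one vector, the usual way of loading a 4x4 byte block. 'src' and
 * 'stride' are hypothetical.
 *
 *     uint32_t w0, w1, w2, w3;
 *     v16u8 blk = { 0 };
 *
 *     LW4(src, stride, w0, w1, w2, w3);
 *     INSERT_W4_UB(w0, w1, w2, w3, blk);
 */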
/* Interleave the even-indexed byte/halfword/word/doubleword elements of
   each input pair. */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
/* Interleave the left (upper) halves of each input pair. */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
/* Interleave the right (lower) halves of each input pair. */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
/* Interleave the right (lower) doublewords of each input pair. */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
/* Interleave both the right and the left halves of one input pair. */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
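/* Usage sketch (not from the original header): interleaving with a zero
 * vector widens unsigned bytes to halfwords, which is exactly how
 * UNPCK_UB_SH later in this header is built. Names hypothetical.
 *
 *     v16i8 zero = { 0 };
 *     v16u8 pix;
 *     v8i16 lo, hi;
 *
 *     ILVRL_B2_SH(zero, pix, lo, hi);
 */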
/* Element-wise signed maximum with an immediate, in place. */
#define MAXI_SH2(RTYPE, in0, in1, max_val)                 \
{                                                          \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val));  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
/* Saturate each element to a (sat_val + 1)-bit range, in place. */
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
/* Replicate one element of the input vector across all output elements. */
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Pack even-indexed elements: even elements of the second operand fill the
   lower half of the result, even elements of the first fill the upper half. */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)

#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)

#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)

/* Pack the odd-indexed doublewords of each input pair. */
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
/* XOR each byte with 128, i.e. flip the sign bit, in place. */
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
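/* Usage sketch (not from the original header): flipping bit 7 moves
 * unsigned pixels into the signed-byte domain expected by the signed
 * dot-product macros; a second XORI_B*_128 undoes it. Names hypothetical.
 *
 *     v16i8 s0, s1;
 *
 *     s0 = LD_SB(src);
 *     s1 = LD_SB(src + stride);
 *     XORI_B2_128_SB(s0, s1);   // pixels now centred on zero
 */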
/* Saturating signed halfword addition on consecutive input pairs. */
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)

/* Shift four vectors left / arithmetic right by the same amount, in place. */
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
/* Logical shift right; per-element shift amounts come from 'shift'. */
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

/* Arithmetic shift right with rounding; shift amounts from a vector. */
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift);                         \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)

#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

/* Arithmetic shift right with rounding by an immediate. */
#define SRARI_H2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
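/* Usage sketch (not from the original header): a rounding right shift by 6
 * computes (x + 32) >> 6 per element, the usual normalization after a
 * 64-weight filter. Names hypothetical.
 *
 *     v8i16 sum0, sum1;
 *     SRARI_H2_SH(sum0, sum1, 6);
 */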
/* Element-wise multiply, add and subtract on consecutive input pairs. */
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}

#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}

#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = in0 - in1;                                                         \
    out1 = in2 - in3;                                                         \
    out2 = in4 - in5;                                                         \
    out3 = in6 - in7;                                                         \
}
/* Sign-extend the right-half halfwords of 'in' to words. */
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}

/* Sign-extend signed bytes to two halfword vectors. */
#define UNPCK_SB_SH(in, out0, out1)         \
{                                           \
    v16i8 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);  \
    ILVRL_B2_SH(tmp_m, in, out0, out1);     \
}

/* Zero-extend unsigned bytes to two halfword vectors. */
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}

/* Sign-extend signed halfwords to two word vectors. */
#define UNPCK_SH_SW(in, out0, out1)         \
{                                           \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);  \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
}
/* Swap two vectors using XOR. */
#define SWAP(in0, in1)  \
{                       \
    in0 = in0 ^ in1;    \
    in1 = in0 ^ in1;    \
    in0 = in0 ^ in1;    \
}

/* Butterfly stages: sums of mirrored input pairs fill the first half of
   the outputs, differences the second half. */
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}

#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = in0 + in7;                                                \
    out1 = in1 + in6;                                                \
    out2 = in2 + in5;                                                \
    out3 = in3 + in4;                                                \
                                                                     \
    out4 = in3 - in4;                                                \
    out5 = in2 - in5;                                                \
    out6 = in1 - in6;                                                \
    out7 = in0 - in7;                                                \
}

#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9, in10, in11, in12, in13, in14, in15,          \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = in0 + in15;                                                      \
    out1 = in1 + in14;                                                      \
    out2 = in2 + in13;                                                      \
    out3 = in3 + in12;                                                      \
    out4 = in4 + in11;                                                      \
    out5 = in5 + in10;                                                      \
    out6 = in6 + in9;                                                       \
    out7 = in7 + in8;                                                       \
                                                                            \
    out8 = in7 - in8;                                                       \
    out9 = in6 - in9;                                                       \
    out10 = in5 - in10;                                                     \
    out11 = in4 - in11;                                                     \
    out12 = in3 - in12;                                                     \
    out13 = in2 - in13;                                                     \
    out14 = in1 - in14;                                                     \
    out15 = in0 - in15;                                                     \
}
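/* Usage sketch (not from the original header): the first stage of a small
 * DCT/IDCT expressed with BUTTERFLY_4. Names hypothetical.
 *
 *     v8i16 in0, in1, in2, in3, t0, t1, t2, t3;
 *
 *     BUTTERFLY_4(in0, in1, in2, in3, t0, t1, t2, t3);
 *     // t0 = in0 + in3, t1 = in1 + in2, t2 = in1 - in2, t3 = in0 - in3
 */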
/* Transpose a 4x4 block of byte elements held in four vectors. */
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}

/* Transpose an 8x4 block of byte elements into four output vectors. */
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
/* Transpose an 8x8 block of byte elements. */
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)

/* Transpose a 16x4 block of byte elements into four 16-element vectors. */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
/* Transpose a 16x8 block of byte elements into eight 16-element vectors. */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
/* Transpose a 4x4 block of signed halfword elements. */
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}

/* Transpose an 8x8 block of halfword elements. */
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)

/* Transpose a 4x4 block of word elements. */
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
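/* Usage sketch (not from the original header): load eight rows and
 * transpose them in registers, the standard prologue of a horizontal 8x8
 * filter. Reusing the inputs as outputs is safe here because all inputs
 * are consumed before any output is written. Names hypothetical.
 *
 *     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
 *
 *     LD_UB8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
 *     TRANSPOSE8x8_UB_UB(r0, r1, r2, r3, r4, r5, r6, r7,
 *                        r0, r1, r2, r3, r4, r5, r6, r7);
 */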
/* Average (truncating) consecutive input pairs and store the lower 8 bytes
   of each result with stride. */
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}

/* Average (truncating) consecutive input pairs and store four full vectors
   with stride. */
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}

/* As above, but with rounding averages. */
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}

#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{                                                                            \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                            \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                t0_m, t1_m, t2_m, t3_m);                                     \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                            \
}
/* Rounding-average the inputs, then average the result again with the
   bytes already at the destination before storing. */
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}

#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Add halfword residuals to the 4x4 bytes at the destination, clip to
   [0, 255] and store the result. */
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
/* Three-vector dot product with saturating accumulation; returns the
   result vector. */
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    v8i16 tmp1_m;                                                   \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2);           \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                        \
                                                                    \
    out0_m;                                                         \
} )

/* Pack the even bytes of 'in0' and 'in1' and flip the sign bits back to
   the unsigned pixel domain; returns the packed vector. */
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Pack four halfword vectors to bytes, average with four destination rows
   and store 8x4 bytes. */
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,               \
                                dst0, dst1, dst2, dst3,           \
                                pdst, stride)                     \
{                                                                 \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                         \
    uint8_t *pdst_m = (uint8_t *) (pdst);                         \
                                                                  \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                          \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                          \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);           \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);  \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                     \
}

/* Pack the even word elements of four vectors as bytes and store 4x4. */
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}

/* Pack the even bytes of two vectors and store the result. */
#define PCKEV_ST_SB(in0, in1, pdst)                   \
{                                                     \
    v16i8 tmp_m;                                      \
                                                      \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    ST_SB(tmp_m, (pdst));                             \
}
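/* Usage sketch (not from the original header): pack two halfword result
 * vectors back to unsigned bytes and store one 16-byte row. Names
 * hypothetical.
 *
 *     v8i16 res0, res1;
 *     v16u8 out;
 *
 *     out = PCKEV_XORI128_UB(res0, res1);
 *     ST_UB(out, dst);
 */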
/* Horizontal 2-tap filter: shuffle the inputs with 'mask', take the
   unsigned dot product with 'coeff', round-shift and saturate; returns
   the filtered halfword vector. */
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
( {                                                                 \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);         \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);          \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                          \
                                                                    \
    tmp1_m;                                                         \
} )
#endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */