/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
#include <config.h>

#if HAVE_MSA2
#include <msa2.h>
#endif

#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)

#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
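
/* Usage sketch (illustrative addition, not part of the upstream header):
 * the typed LD_V/ST_V wrappers above are plain vector moves, so a 16-byte
 * row copy reduces to one load and one store. Names 'src' and 'dst' are
 * hypothetical.
 */
static inline void example_copy_row_msa(const uint8_t *src, uint8_t *dst)
{
    v16u8 row = LD_UB(src);

    ST_UB(row, dst);
}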

#if (__mips_isa_rev >= 6)
    #define LH(psrc) \
    ( { \
        uint16_t val_lh_m = *(uint16_t *)(psrc); \
        val_lh_m; \
    } )

    #define LW(psrc) \
    ( { \
        uint32_t val_lw_m = *(uint32_t *)(psrc); \
        val_lw_m; \
    } )

    #if (__mips == 64)
        #define LD(psrc) \
        ( { \
            uint64_t val_ld_m = *(uint64_t *)(psrc); \
            val_ld_m; \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint32_t val0_ld_m, val1_ld_m; \
            uint64_t val_ld_m = 0; \
 \
            val0_ld_m = LW(psrc_ld_m); \
            val1_ld_m = LW(psrc_ld_m + 4); \
 \
            val_ld_m = (uint64_t) (val1_ld_m); \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
 \
            val_ld_m; \
        } )
    #endif // (__mips == 64)

    #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else  // !(__mips_isa_rev >= 6)
    #define LH(psrc) \
    ( { \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc); \
        uint16_t val_lh_m; \
 \
        __asm__ volatile ( \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t" \
 \
            : [val_lh_m] "=r" (val_lh_m) \
            : [psrc_lh_m] "m" (*psrc_lh_m) \
        ); \
 \
        val_lh_m; \
    } )

    #define LW(psrc) \
    ( { \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
        uint32_t val_lw_m; \
 \
        __asm__ volatile ( \
            "ulw  %[val_lw_m],  %[psrc_lw_m]  \n\t" \
 \
            : [val_lw_m] "=r" (val_lw_m) \
            : [psrc_lw_m] "m" (*psrc_lw_m) \
        ); \
 \
        val_lw_m; \
    } )

    #if (__mips == 64)
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint64_t val_ld_m = 0; \
 \
            __asm__ volatile ( \
                "uld  %[val_ld_m],  %[psrc_ld_m]  \n\t" \
 \
                : [val_ld_m] "=r" (val_ld_m) \
                : [psrc_ld_m] "m" (*psrc_ld_m) \
            ); \
 \
            val_ld_m; \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint32_t val0_ld_m, val1_ld_m; \
            uint64_t val_ld_m = 0; \
 \
            val0_ld_m = LW(psrc_ld_m); \
            val1_ld_m = LW(psrc_ld_m + 4); \
 \
            val_ld_m = (uint64_t) (val1_ld_m); \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
 \
            val_ld_m; \
        } )
    #endif // (__mips == 64)

    #define SH(val, pdst) \
    { \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
        uint16_t val_sh_m = (val); \
 \
        __asm__ volatile ( \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t" \
 \
            : [pdst_sh_m] "=m" (*pdst_sh_m) \
            : [val_sh_m] "r" (val_sh_m) \
        ); \
    }

    #define SW(val, pdst) \
    { \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
        uint32_t val_sw_m = (val); \
 \
        __asm__ volatile ( \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t" \
 \
            : [pdst_sw_m] "=m" (*pdst_sw_m) \
            : [val_sw_m] "r" (val_sw_m) \
        ); \
    }

    #define SD(val, pdst) \
    { \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
        uint32_t val0_sd_m, val1_sd_m; \
 \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
 \
        SW(val0_sd_m, pdst_sd_m); \
        SW(val1_sd_m, pdst_sd_m + 4); \
    }
#endif // (__mips_isa_rev >= 6)
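
/* Usage sketch (illustrative addition, not part of the upstream header):
 * LH/LW/LD and SH/SW/SD move scalars through possibly unaligned pointers,
 * using plain loads on MIPSr6 and ulw/uld-style sequences before r6. A
 * hypothetical 8-byte move between unaligned buffers:
 */
static inline void example_copy_u64_msa(const uint8_t *src, uint8_t *dst)
{
    uint64_t tmp = LD(src);

    SD(tmp, dst);
}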

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + stride); \
    out2 = LW((psrc) + 2 * stride); \
    out3 = LW((psrc) + 3 * stride); \
}

#define LW2(psrc, stride, out0, out1) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + stride); \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1) \
{ \
    out0 = LD((psrc)); \
    out1 = LD((psrc) + stride); \
}
#define LD4(psrc, stride, out0, out1, out2, out3) \
{ \
    LD2((psrc), stride, out0, out1); \
    LD2((psrc) + 2 * stride, stride, out2, out3); \
}

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
{ \
    SW(in0, (pdst)) \
    SW(in1, (pdst) + stride); \
    SW(in2, (pdst) + 2 * stride); \
    SW(in3, (pdst) + 3 * stride); \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
{ \
    SD(in0, (pdst)) \
    SD(in1, (pdst) + stride); \
    SD(in2, (pdst) + 2 * stride); \
    SD(in3, (pdst) + 3 * stride); \
}
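
/* Usage sketch (illustrative addition, not part of the upstream header):
 * copying an 8x4 byte block with the scalar double-word helpers; 'src',
 * 'dst' and the strides are hypothetical and given in bytes.
 */
static inline void example_copy_8x4_msa(const uint8_t *src, int32_t src_stride,
                                        uint8_t *dst, int32_t dst_stride)
{
    uint64_t row0, row1, row2, row3;

    LD4(src, src_stride, row0, row1, row2, row3);
    SD4(row0, row1, row2, row3, dst, dst_stride);
}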

/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1) \
{ \
    out0 = LD_V(RTYPE, (psrc)); \
    out1 = LD_V(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
{ \
    LD_V2(RTYPE, (psrc), stride, out0, out1); \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride); \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
    LD_V2(RTYPE, (psrc), stride, out0, out1); \
    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride); \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6) \
{ \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride, \
               out0, out1, out2, out3, out4, out5, out6, out7, \
               out8, out9, out10, out11, out12, out13, out14, out15) \
{ \
    LD_V8(RTYPE, (psrc), stride, \
          out0, out1, out2, out3, out4, out5, out6, out7); \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride, \
          out8, out9, out10, out11, out12, out13, out14, out15); \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
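
/* Usage sketch (illustrative addition, not part of the upstream header):
 * gathering eight rows of 16-bit coefficients into vector registers with a
 * single call; 'coeffs' and 'stride' (in int16_t units here) are
 * hypothetical, as is the element-wise summation that follows.
 */
static inline v8i16 example_load_8_rows_msa(const int16_t *coeffs,
                                            int32_t stride)
{
    v8i16 r0, r1, r2, r3, r4, r5, r6, r7;

    LD_SH8(coeffs, stride, r0, r1, r2, r3, r4, r5, r6, r7);
    /* e.g. sum all eight rows element-wise */
    return r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7;
}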

/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride) \
{ \
    ST_V(RTYPE, in0, (pdst)); \
    ST_V(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
    ST_V2(RTYPE, in0, in1, (pdst), stride); \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
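
/* Usage sketch (illustrative addition, not part of the upstream header):
 * a full 16x8 byte block copy built from the strided load/store macros;
 * 'src', 'dst' and the byte strides are hypothetical.
 */
static inline void example_copy_16x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;

    LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
    ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
}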

/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs - in    (source vector)
 *                      - pdst  (destination pointer to store to)
 *                      - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_H1(in, idx, pdst) \
{ \
    uint16_t out0_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx); \
    SH(out0_m, (pdst)); \
}
#define ST_H2(in, idx0, idx1, pdst, stride) \
{ \
    uint16_t out0_m, out1_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx0); \
    out1_m = __msa_copy_u_h((v8i16) in, idx1); \
    SH(out0_m, (pdst)); \
    SH(out1_m, (pdst) + stride); \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint16_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx0); \
    out1_m = __msa_copy_u_h((v8i16) in, idx1); \
    out2_m = __msa_copy_u_h((v8i16) in, idx2); \
    out3_m = __msa_copy_u_h((v8i16) in, idx3); \
    SH(out0_m, (pdst)); \
    SH(out1_m, (pdst) + stride); \
    SH(out2_m, (pdst) + 2 * stride); \
    SH(out3_m, (pdst) + 3 * stride); \
}
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, \
              idx6, idx7, pdst, stride) \
{ \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}

/* Description : Store word elements of vector with stride
 * Arguments   : Inputs - in    (source vector)
 *                      - pdst  (destination pointer to store to)
 *                      - stride
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_W1(in, idx, pdst) \
{ \
    uint32_t out0_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx); \
    SW(out0_m, (pdst)); \
}
#define ST_W2(in, idx0, idx1, pdst, stride) \
{ \
    uint32_t out0_m, out1_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx0); \
    out1_m = __msa_copy_u_w((v4i32) in, idx1); \
    SW(out0_m, (pdst)); \
    SW(out1_m, (pdst) + stride); \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx0); \
    out1_m = __msa_copy_u_w((v4i32) in, idx1); \
    out2_m = __msa_copy_u_w((v4i32) in, idx2); \
    out3_m = __msa_copy_u_w((v4i32) in, idx3); \
    SW(out0_m, (pdst)); \
    SW(out1_m, (pdst) + stride); \
    SW(out2_m, (pdst) + 2 * stride); \
    SW(out3_m, (pdst) + 3 * stride); \
}
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, \
              idx4, idx5, idx6, idx7, pdst, stride) \
{ \
    ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_W4(in1, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \
}
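
/* Usage sketch (illustrative addition, not part of the upstream header):
 * storing a 4x4 byte block that lives in one vector register, one 32-bit
 * word per row; 'dst' and 'dst_stride' are hypothetical.
 */
static inline void example_store_4x4_msa(v16u8 blk, uint8_t *dst,
                                         int32_t dst_stride)
{
    ST_W4(blk, 0, 1, 2, 3, dst, dst_stride);
}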

/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs - in    (source vector)
 *                      - pdst  (destination pointer to store to)
 *                      - stride
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_D1(in, idx, pdst) \
{ \
    uint64_t out0_m; \
    out0_m = __msa_copy_u_d((v2i64) in, idx); \
    SD(out0_m, (pdst)); \
}
#define ST_D2(in, idx0, idx1, pdst, stride) \
{ \
    uint64_t out0_m, out1_m; \
    out0_m = __msa_copy_u_d((v2i64) in, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in, idx1); \
    SD(out0_m, (pdst)); \
    SD(out1_m, (pdst) + stride); \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1); \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2); \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3); \
    SD(out0_m, (pdst)); \
    SD(out1_m, (pdst) + stride); \
    SD(out2_m, (pdst) + 2 * stride); \
    SD(out3_m, (pdst) + 3 * stride); \
}
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, \
              idx4, idx5, idx6, idx7, pdst, stride) \
{ \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \
}

/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The index 0 double word element from input vector 'in0' is
                 copied and stored to destination memory at (pblk_12x8_m),
                 followed by the index 2 word element from the same input
                 vector 'in0' at (pblk_12x8_m + 8)
                 Similarly for the remaining rows
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    uint64_t out4_m, out5_m, out6_m, out7_m; \
    uint32_t out8_m, out9_m, out10_m, out11_m; \
    uint32_t out12_m, out13_m, out14_m, out15_m; \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \
 \
    out0_m = __msa_copy_u_d((v2i64) in0, 0); \
    out1_m = __msa_copy_u_d((v2i64) in1, 0); \
    out2_m = __msa_copy_u_d((v2i64) in2, 0); \
    out3_m = __msa_copy_u_d((v2i64) in3, 0); \
    out4_m = __msa_copy_u_d((v2i64) in4, 0); \
    out5_m = __msa_copy_u_d((v2i64) in5, 0); \
    out6_m = __msa_copy_u_d((v2i64) in6, 0); \
    out7_m = __msa_copy_u_d((v2i64) in7, 0); \
 \
    out8_m = __msa_copy_u_w((v4i32) in0, 2); \
    out9_m = __msa_copy_u_w((v4i32) in1, 2); \
    out10_m = __msa_copy_u_w((v4i32) in2, 2); \
    out11_m = __msa_copy_u_w((v4i32) in3, 2); \
    out12_m = __msa_copy_u_w((v4i32) in4, 2); \
    out13_m = __msa_copy_u_w((v4i32) in5, 2); \
    out14_m = __msa_copy_u_w((v4i32) in6, 2); \
    out15_m = __msa_copy_u_w((v4i32) in7, 2); \
 \
    SD(out0_m, pblk_12x8_m); \
    SW(out8_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out1_m, pblk_12x8_m); \
    SW(out9_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out2_m, pblk_12x8_m); \
    SW(out10_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out3_m, pblk_12x8_m); \
    SW(out11_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out4_m, pblk_12x8_m); \
    SW(out12_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out5_m, pblk_12x8_m); \
    SW(out13_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out6_m, pblk_12x8_m); \
    SW(out14_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out7_m, pblk_12x8_m); \
    SW(out15_m, pblk_12x8_m + 8); \
}

/* Description : Average with rounding: (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' is added to the corresponding
                 byte element from 'in1'. The sum plus 1 (for rounding) is
                 computed unsigned with full precision, i.e. the result has
                 one extra bit. An unsigned division by 2 (a logical shift
                 right by one bit) is performed before writing the result to
                 vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1); \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
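
/* Usage sketch (illustrative addition, not part of the upstream header):
 * the rounding average is the usual bi-prediction / half-pel merge step;
 * here two rows from two reference blocks are averaged. All names are
 * hypothetical.
 */
static inline void example_avg_rows_msa(const uint8_t *ref0,
                                        const uint8_t *ref1,
                                        int32_t stride, uint8_t *dst)
{
    v16u8 a0, a1, b0, b1, avg0, avg1;

    LD_UB2(ref0, stride, a0, a1);
    LD_UB2(ref1, stride, b0, b1);
    AVER_UB2_UB(a0, b0, a1, b1, avg0, avg1);
    ST_UB2(avg0, avg1, dst, stride);
}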

/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - s, d, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'd' vector are slid into 's' by the
                 number of elements specified by 'slide_val'
*/
#define SLDI_B(RTYPE, d, s, slide_val, out) \
{ \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val); \
}

#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
{ \
    SLDI_B(RTYPE, d0, s0, slide_val, out0) \
    SLDI_B(RTYPE, d1, s1, slide_val, out1) \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val, \
                out0, out1, out2) \
{ \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
    SLDI_B(RTYPE, d2, s2, slide_val, out2) \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3, \
                slide_val, out0, out1, out2, out3) \
{ \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
    SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3) \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
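
/* Usage sketch (illustrative addition, not part of the upstream header):
 * sliding a row against a zero vector discards its first 'slide_val' bytes
 * and shifts the rest down, which is how neighbouring-pixel windows are
 * commonly formed for filters. 'slide_val' must be a compile-time constant.
 */
static inline v16i8 example_shift_left_2_msa(v16i8 row)
{
    v16i8 zero = { 0 };
    v16i8 shifted;

    /* shifted[i] = row[i + 2]; the top two bytes are filled from 'zero' */
    SLDI_B(v16i8, zero, row, 2, shifted);
    return shifted;
}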

/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4); \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
                out0, out1, out2, out3) \
{ \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
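
/* Usage sketch (illustrative addition, not part of the upstream header):
 * with both sources set to the same vector the mask simply permutes bytes;
 * the mask below reverses a 16-byte row. The mask constant is hypothetical.
 */
static inline v16u8 example_reverse_bytes_msa(v16u8 row)
{
    v16i8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16u8 rev, unused;

    VSHF_B2_UB(row, row, row, row, mask, mask, rev, unused);
    (void) unused;
    return rev;
}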

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0); \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2); \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4); \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing results twice
                 the size of the inputs, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored in the out vector
                 (two vectors of unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
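
/* Usage sketch (illustrative addition, not part of the upstream header):
 * a 2-tap horizontal filter step: pixel pairs in 'pairs0'/'pairs1' are
 * multiplied by per-pair weights and summed, one halfword per pair. The
 * (3, 1) weights and all names are hypothetical.
 */
static inline void example_2tap_filter_msa(v16u8 pairs0, v16u8 pairs1,
                                           v8u16 *out0, v8u16 *out1)
{
    v16u8 coeff = { 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1 };

    /* out[i] = 3 * pairs[2i] + 1 * pairs[2i+1] */
    DOTP_UB2_UH(pairs0, pairs1, coeff, coeff, *out0, *out1);
}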

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing results twice
                 the size of the inputs, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored in the out vector
                 (two vectors of signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \
                 out0, out1, out2) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2); \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added together and stored in the out vector
                 (two vectors of signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing results twice
                 the size of the inputs, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (two vectors of signed halfword results)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0, \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1, \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
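
/* Usage sketch (illustrative addition, not part of the upstream header):
 * a 4-tap filter accumulation for two rows: DOTP seeds the accumulators
 * with taps 0 and 1, then DPADD adds the contribution of taps 2 and 3.
 * 'px01_*'/'px23_*' hold interleaved pixel pairs and 'filt01'/'filt23' the
 * splatted tap pairs; all names are hypothetical.
 */
static inline void example_4tap_acc_msa(v16i8 px01_r0, v16i8 px23_r0,
                                        v16i8 px01_r1, v16i8 px23_r1,
                                        v16i8 filt01, v16i8 filt23,
                                        v8i16 *out0, v8i16 *out1)
{
    v8i16 acc0, acc1;

    /* seed accumulators with the contribution of taps 0 and 1 */
    DOTP_SB2_SH(px01_r0, px01_r1, filt01, filt01, acc0, acc1);
    /* accumulate the contribution of taps 2 and 3 */
    DPADD_SB2_SH(px23_r0, px23_r1, filt23, filt23, acc0, acc1);
    *out0 = acc0;
    *out1 = acc1;
}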

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing results twice
                 the size of the inputs, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (two vectors of unsigned halfword results)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0, \
                                   (v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1, \
                                   (v16u8) mult1, (v16u8) cnst1); \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (two vectors of signed word results)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \
                                   (v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \
                                   (v8i16) mult1, (v8i16) cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : The minimum of the unsigned halfword element values from
                 'in0' and 'min_vec' is written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec) \
{ \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec); \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec); \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
{ \
    MIN_UH2(RTYPE, in0, in1, min_vec); \
    MIN_UH2(RTYPE, in2, in3, min_vec); \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in  (input vector)
                         - min (min threshold)
                         - max (max threshold)
                 Outputs - in  (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max) \
{ \
    in = __msa_max_s_h((v8i16) min, (v8i16) in); \
    in = __msa_min_s_h((v8i16) max, (v8i16) in); \
}

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed halfwords
*/
#define CLIP_SH_0_255(in) \
{ \
    in = __msa_maxi_s_h((v8i16) in, 0); \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7); \
}

#define CLIP_SH2_0_255(in0, in1) \
{ \
    CLIP_SH_0_255(in0); \
    CLIP_SH_0_255(in1); \
}

#define CLIP_SH4_0_255(in0, in1, in2, in3) \
{ \
    CLIP_SH2_0_255(in0, in1); \
    CLIP_SH2_0_255(in2, in3); \
}

#define CLIP_SH8_0_255(in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    CLIP_SH4_0_255(in0, in1, in2, in3); \
    CLIP_SH4_0_255(in4, in5, in6, in7); \
}
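
/* Usage sketch (illustrative addition, not part of the upstream header):
 * the classic reconstruction step: add a signed residual to a prediction in
 * 16-bit precision, then clip to the 0..255 pixel range; packing back to
 * bytes is left to the caller. Names are hypothetical.
 */
static inline v8i16 example_reconstruct_msa(v8i16 pred, v8i16 residual)
{
    v8i16 rec = pred + residual; /* element-wise add in halfword precision */

    CLIP_SH_0_255(rec);
    return rec;
}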

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in) \
{ \
    in = __msa_maxi_s_w((v4i32) in, 0); \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7); \
}

#define CLIP_SW2_0_255(in0, in1) \
{ \
    CLIP_SW_0_255(in0); \
    CLIP_SW_0_255(in1); \
}

#define CLIP_SW4_0_255(in0, in1, in2, in3) \
{ \
    CLIP_SW2_0_255(in0, in1); \
    CLIP_SW2_0_255(in2, in3); \
}

#define CLIP_SW8_0_255(in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    CLIP_SW4_0_255(in0, in1, in2, in3); \
    CLIP_SW4_0_255(in4, in5, in6, in7); \
}

/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in    (signed word vector)
                 Outputs - sum_m (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in) \
( { \
    v2i64 res0_m, res1_m; \
    int32_t sum_m; \
 \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \
    res1_m = __msa_splati_d(res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \
    sum_m; \
} )

/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in    (unsigned halfword vector)
                 Outputs - sum_m (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in) \
( { \
    v4u32 res_m; \
    v2u64 res0_m, res1_m; \
    uint32_t sum_m; \
 \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \
    res0_m = __msa_hadd_u_d(res_m, res_m); \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \
    sum_m; \
} )

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent signed even byte element from 'in0' (pairwise) and
                 the halfword result is written to 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0); \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1); \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_SB2(RTYPE, in0, in1, out0, out1); \
    HADD_SB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent unsigned even byte element from 'in0' (pairwise)
                 and the halfword result is written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0); \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1); \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2); \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    HADD_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' has the adjacent
                 unsigned even byte element from 'in0' subtracted from it
                 (pairwise) and the halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HSUB_UB2(RTYPE, in0, in1, out0, out1); \
    HSUB_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Differences)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m                (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute differences of all byte elements from 'in0'
                 with 'ref0' are computed and kept in 'diff0'. From the 16
                 unsigned absolute diff values, even-odd pairs are added
                 together to generate 8 halfword results.
*/
#if HAVE_MSA2
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
( { \
    v8u16 sad_m = { 0 }; \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0); \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1); \
    sad_m; \
} )
#else
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
( { \
    v16u8 diff0_m, diff1_m; \
    v8u16 sad_m = { 0 }; \
 \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0); \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1); \
 \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \
 \
    sad_m; \
} )
#endif // #if HAVE_MSA2
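
/* Usage sketch (illustrative addition, not part of the upstream header):
 * accumulating the SAD of two 16-byte rows and reducing the per-pair
 * halfword sums to a single scalar with HADD_UH_U32. Names are
 * hypothetical.
 */
static inline uint32_t example_sad_16x2_msa(const uint8_t *src,
                                            const uint8_t *ref,
                                            int32_t stride)
{
    v16u8 s0, s1, r0, r1;
    v8u16 sad;

    LD_UB2(src, stride, s0, s1);
    LD_UB2(ref, stride, r0, r1);
    sad = SAD_UB2_UH(s0, s1, r0, r1);
    return HADD_UH_U32(sad);
}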

/* Description : Insert specified word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3); \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Insert specified double word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1 (2 input vectors)
                 Outputs - out      (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0); \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1); \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
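
/* Usage sketch (illustrative addition, not part of the upstream header):
 * the common gather pattern for narrow blocks: four unaligned 32-bit rows
 * are fetched with LW4 and packed into one vector via INSERT_W4. Names are
 * hypothetical.
 */
static inline v16u8 example_gather_4x4_msa(const uint8_t *src, int32_t stride)
{
    uint32_t w0, w1, w2, w3;
    v16u8 out = { 0 };

    LW4(src, stride, w0, w1, w2, w3);
    INSERT_W4_UB(w0, w1, w2, w3, out);
    return out;
}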

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of
                 halfword elements of in3 are interleaved and copied to out1.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                in8, in9, in10, in11, in12, in13, in14, in15, \
                out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
            out0, out1, out2, out3); \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
            out4, out5, out6, out7); \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to
                 out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to
                 out1.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of in0 and left half of
                 double word elements of in1 are interleaved and copied to
                 out0.
                 Left half of double word elements of in2 and left half of
                 double word elements of in3 are interleaved and copied to
                 out1.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1492 
1493 /* Description : Interleave both left and right half of input vectors
1494  Arguments : Inputs - in0, in1
1495  Outputs - out0, out1
1496  Return Type - as per RTYPE
1497  Details : Right half of byte elements from 'in0' and 'in1' are
1498  interleaved and stored to 'out0'
1499  Left half of byte elements from 'in0' and 'in1' are
1500  interleaved and stored to 'out1'
1501 */
1502 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
1503 { \
1504  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1505  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1506 }
1507 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1508 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1509 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1510 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1511 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1512 
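/* Example : Illustrative sketch (hypothetical helper and variable names).
             Interleaving a U row with a V row yields 32 alternating U/V
             bytes, split across the right-half and left-half results.
*/
static inline void example_interleave_uv(const uint8_t *u_src,
                                         const uint8_t *v_src,
                                         uint8_t *uv_dst)
{
    v16u8 u0 = LD_UB(u_src);
    v16u8 v0 = LD_UB(v_src);
    v16u8 uv0, uv1;

    /* uv0 = U0,V0,U1,V1,... from the right halves; uv1 from the left */
    ILVRL_B2_UB(v0, u0, uv0, uv1);
    ST_UB(uv0, uv_dst);
    ST_UB(uv1, uv_dst + 16);
}
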
1513 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
1514 { \
1515  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1516  out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1517 }
1518 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
1519 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1520 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1521 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1522 
1523 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
1524 { \
1525  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1526  out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1527 }
1528 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1529 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1530 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1531 
1532 /* Description : Maximum values between signed halfword elements of a vector
1533  and a 5-bit signed immediate value are copied to the output vector
1534  Arguments : Inputs - in0, in1, in2, in3, max_val
1535  Outputs - in0, in1, in2, in3 (in place)
1536  Return Type - as per RTYPE
1537  Details : The maximum of each signed halfword element from 'in0' and
1538  'max_val' is written back to 'in0'
1539 */
1540 #define MAXI_SH2(RTYPE, in0, in1, max_val) \
1541 { \
1542  in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
1543  in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
1544 }
1545 #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
1546 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1547 
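/* Example : Illustrative sketch (hypothetical names). An immediate of 0
             clamps negative halfword results to zero, a common step
             after subtracting an offset.
*/
static inline void example_clamp_negatives(v8i16 *res0, v8i16 *res1)
{
    v8i16 r0 = *res0;
    v8i16 r1 = *res1;

    MAXI_SH2_SH(r0, r1, 0);    /* each element becomes max(element, 0) */
    *res0 = r0;
    *res1 = r1;
}
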
1548 #define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
1549 { \
1550  MAXI_SH2(RTYPE, in0, in1, max_val); \
1551  MAXI_SH2(RTYPE, in2, in3, max_val); \
1552 }
1553 #define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1554 #define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1555 
1556 #define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
1557 { \
1558  MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
1559  MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
1560 }
1561 #define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
1562 #define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1563 
1564 /* Description : Saturate the halfword element values to the max
1565  unsigned value of (sat_val+1) bits
1566  The element data width remains unchanged
1567  Arguments : Inputs - in0, in1, in2, in3, sat_val
1568  Outputs - in0, in1, in2, in3 (in place)
1569  Return Type - as per RTYPE
1570  Details : Each unsigned halfword element from 'in0' is saturated to the
1571  value generated with (sat_val+1) bit range
1572  Results are written in place to the original vectors
1573 */
1574 #define SAT_UH2(RTYPE, in0, in1, sat_val) \
1575 { \
1576  in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
1577  in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
1578 }
1579 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1580 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1581 
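/* Example : Illustrative sketch (hypothetical names). With sat_val = 7
             every unsigned halfword is clamped to the 8 bit range
             [0, 255], typically right before packing back to bytes.
*/
static inline void example_saturate_to_8bit(v8u16 *in0, v8u16 *in1)
{
    v8u16 t0 = *in0;
    v8u16 t1 = *in1;

    SAT_UH2_UH(t0, t1, 7);             /* clamp each element to 255 */
    *in0 = t0;
    *in1 = t1;
}
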
1582 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
1583 { \
1584  SAT_UH2(RTYPE, in0, in1, sat_val); \
1585  SAT_UH2(RTYPE, in2, in3, sat_val); \
1586 }
1587 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1588 #define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
1589 
1590 #define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
1591 { \
1592  SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
1593  SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
1594 }
1595 #define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
1596 #define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1597 
1598 /* Description : Saturate the halfword element values to the max
1599  signed value of (sat_val+1) bits
1600  The element data width remains unchanged
1601  Arguments : Inputs - in0, in1, in2, in3, sat_val
1602  Outputs - in0, in1, in2, in3 (in place)
1603  Return Type - as per RTYPE
1604  Details : Each signed halfword element from 'in0' is saturated to the
1605  value generated with (sat_val+1) bit range
1606  Results are written in place to the original vectors
1607 */
1608 #define SAT_SH2(RTYPE, in0, in1, sat_val) \
1609 { \
1610  in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
1611  in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
1612 }
1613 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1614 
1615 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
1616 { \
1617  SAT_SH2(RTYPE, in0, in1, sat_val); \
1618  in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
1619 }
1620 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1621 
1622 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1623 { \
1624  SAT_SH2(RTYPE, in0, in1, sat_val); \
1625  SAT_SH2(RTYPE, in2, in3, sat_val); \
1626 }
1627 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1628 
1629 /* Description : Saturate the word element values to the max
1630  signed value of (sat_val+1) bits
1631  The element data width remains unchanged
1632  Arguments : Inputs - in0, in1, in2, in3, sat_val
1633  Outputs - in0, in1, in2, in3 (in place)
1634  Return Type - as per RTYPE
1635  Details : Each signed word element from 'in0' is saturated to the
1636  value generated with (sat_val+1) bit range
1637  Results are written in place to the original vectors
1638 */
1639 #define SAT_SW2(RTYPE, in0, in1, sat_val) \
1640 { \
1641  in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
1642  in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
1643 }
1644 #define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1645 
1646 #define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
1647 { \
1648  SAT_SW2(RTYPE, in0, in1, sat_val); \
1649  SAT_SW2(RTYPE, in2, in3, sat_val); \
1650 }
1651 #define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1652 
1653 /* Description : Indexed halfword element values are replicated to all
1654  elements in output vector
1655  Arguments : Inputs - in, idx0, idx1
1656  Outputs - out0, out1
1657  Return Type - as per RTYPE
1658  Details : 'idx0' element value from 'in' vector is replicated to all
1659  elements in 'out0' vector
1660  Valid index range for halfword operation is 0-7
1661 */
1662 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1663 { \
1664  out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
1665  out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
1666 }
1667 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1668 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1669 
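/* Example : Illustrative sketch (hypothetical names). Filter taps are
             loaded once and each tap is broadcast to its own vector for
             later use as a coefficient operand.
*/
static inline void example_broadcast_taps(const int16_t *filter,
                                          v8i16 *tap0, v8i16 *tap1)
{
    v8i16 coeffs = LD_SH(filter);          /* taps in elements 0..7 */

    SPLATI_H2_SH(coeffs, 0, 1, *tap0, *tap1);
}
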
1670 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
1671  out0, out1, out2) \
1672 { \
1673  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1674  out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \
1675 }
1676 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1677 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1678 
1679 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
1680  out0, out1, out2, out3) \
1681 { \
1682  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1683  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
1684 }
1685 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1686 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1687 
1688 /* Description : Indexed word element values are replicated to all
1689  elements in output vector
1690  Arguments : Inputs - in, stidx
1691  Outputs - out0, out1
1692  Return Type - as per RTYPE
1693  Details : 'stidx' element value from 'in' vector is replicated to all
1694  elements in 'out0' vector
1695  'stidx + 1' element value from 'in' vector is replicated to all
1696  elements in 'out1' vector
1697  Valid index range for word operation is 0-3
1698 */
1699 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
1700 { \
1701  out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
1702  out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
1703 }
1704 #define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
1705 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1706 
1707 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
1708 { \
1709  SPLATI_W2(RTYPE, in, 0, out0, out1); \
1710  SPLATI_W2(RTYPE, in, 2, out2, out3); \
1711 }
1712 #define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
1713 #define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1714 
1715 /* Description : Pack even byte elements of vector pairs
1716  Arguments : Inputs - in0, in1, in2, in3
1717  Outputs - out0, out1
1718  Return Type - as per RTYPE
1719  Details : Even byte elements of in0 are copied to the left half of
1720  out0 & even byte elements of in1 are copied to the right
1721  half of out0.
1722  Even byte elements of in2 are copied to the left half of
1723  out1 & even byte elements of in3 are copied to the right
1724  half of out1.
1725 */
1726 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1727 { \
1728  out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
1729  out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
1730 }
1731 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1732 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1733 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1734 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1735 
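/* Example : Illustrative sketch (hypothetical names; assumes the
             halfword inputs were already saturated to 8 bits). The even
             bytes of each halfword are packed, narrowing four halfword
             vectors into two byte vectors.
*/
static inline void example_narrow_to_bytes(v8i16 res0, v8i16 res1,
                                           v8i16 res2, v8i16 res3,
                                           uint8_t *dst)
{
    v16u8 out0, out1;

    /* out0 = low bytes of { res0 | res1 }, out1 from { res2 | res3 } */
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_UB(out0, dst);
    ST_UB(out1, dst + 16);
}
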
1736 #define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1737 { \
1738  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1739  out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
1740 }
1741 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1742 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1743 
1744 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1745  out0, out1, out2, out3) \
1746 { \
1747  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1748  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1749 }
1750 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1751 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1752 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1753 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1754 
1755 /* Description : Pack even halfword elements of vector pairs
1756  Arguments : Inputs - in0, in1, in2, in3
1757  Outputs - out0, out1
1758  Return Type - as per RTYPE
1759  Details : Even halfword elements of in0 are copied to the left half of
1760  out0 & even halfword elements of in1 are copied to the right
1761  half of out0.
1762  Even halfword elements of in2 are copied to the left half of
1763  out1 & even halfword elements of in3 are copied to the right
1764  half of out1.
1765 */
1766 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1767 { \
1768  out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
1769  out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
1770 }
1771 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1772 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1773 
1774 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1775  out0, out1, out2, out3) \
1776 { \
1777  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1778  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1779 }
1780 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1781 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1782 
1783 /* Description : Pack even double word elements of vector pairs
1784  Arguments : Inputs - in0, in1, in2, in3
1785  Outputs - out0, out1
1786  Return Type - as per RTYPE
1787  Details : Even double word elements of in0 are copied to the left half
1788  of out0 & even double word elements of in1 are copied to the
1789  right half of out0.
1790  Even double word elements of in2 are copied to the left half
1791  of out1 & even double word elements of in3 are copied to the
1792  right half of out1.
1793 */
1794 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1795 { \
1796  out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
1797  out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
1798 }
1799 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1800 #define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
1801 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1802 
1803 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1804  out0, out1, out2, out3) \
1805 { \
1806  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1807  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1808 }
1809 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1810 
1811 /* Description : Pack odd double word elements of vector pairs
1812  Arguments : Inputs - in0, in1, in2, in3
1813  Outputs - out0, out1
1814  Return Type - as per RTYPE
1815  Details : Odd double word elements of 'in0' and 'in1' are packed
1816  together and the result is written to 'out0'
1817  Odd double word elements of 'in2' and 'in3' are packed
1818  together and the result is written to 'out1'
1819 */
1820 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1821 { \
1822  out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
1823  out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \
1824 }
1825 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1826 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1827 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1828 
1829 /* Description : Each byte element is logically xor'ed with immediate 128
1830  Arguments : Inputs - in0, in1
1831  Outputs - in0, in1 (in-place)
1832  Return Type - as per RTYPE
1833  Details : Each unsigned byte element from input vector 'in0' is
1834  logically xor'ed with 128 and result is in-place stored in
1835  'in0' vector
1836  Each unsigned byte element from input vector 'in1' is
1837  logically xor'ed with 128 and result is in-place stored in
1838  'in1' vector
1839  Similar for other pairs
1840 */
1841 #define XORI_B2_128(RTYPE, in0, in1) \
1842 { \
1843  in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \
1844  in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \
1845 }
1846 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
1847 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1848 #define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1849 
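/* Example : Illustrative sketch (hypothetical names). Flipping the MSB
             of every byte maps unsigned pixels [0, 255] onto signed
             [-128, 127], so signed dot products can be used on them.
*/
static inline void example_bias_to_signed(const uint8_t *src,
                                          int32_t stride,
                                          v16i8 *out0, v16i8 *out1)
{
    v16i8 s0 = LD_SB(src);
    v16i8 s1 = LD_SB(src + stride);

    XORI_B2_128_SB(s0, s1);            /* flip MSB of every byte */
    *out0 = s0;
    *out1 = s1;
}
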
1850 #define XORI_B3_128(RTYPE, in0, in1, in2) \
1851 { \
1852  XORI_B2_128(RTYPE, in0, in1); \
1853  in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \
1854 }
1855 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1856 
1857 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
1858 { \
1859  XORI_B2_128(RTYPE, in0, in1); \
1860  XORI_B2_128(RTYPE, in2, in3); \
1861 }
1862 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
1863 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1864 #define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
1865 
1866 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
1867 { \
1868  XORI_B3_128(RTYPE, in0, in1, in2); \
1869  XORI_B2_128(RTYPE, in3, in4); \
1870 }
1871 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
1872 
1873 #define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
1874 { \
1875  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1876  XORI_B2_128(RTYPE, in4, in5); \
1877 }
1878 #define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
1879 
1880 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
1881 { \
1882  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1883  XORI_B3_128(RTYPE, in4, in5, in6); \
1884 }
1885 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1886 
1887 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
1888 { \
1889  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1890  XORI_B4_128(RTYPE, in4, in5, in6, in7); \
1891 }
1892 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1893 #define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1894 
1895 /* Description : Addition of signed halfword elements and signed saturation
1896  Arguments : Inputs - in0, in1, in2, in3
1897  Outputs - out0, out1
1898  Return Type - as per RTYPE
1899  Details : Signed halfword elements from 'in0' are added to signed
1900  halfword elements of 'in1'. The result is then signed saturated
1901  between -32768 and +32767 (as per halfword data type)
1902  Similar for other pairs
1903 */
1904 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
1905 { \
1906  out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \
1907  out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \
1908 }
1909 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1910 
1911 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1912  out0, out1, out2, out3) \
1913 { \
1914  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
1915  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
1916 }
1917 #define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
1918 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1919 
1920 /* Description : Shift left all elements of vector (generic for all data types)
1921  Arguments : Inputs - in0, in1, in2, in3, shift
1922  Outputs - in0, in1, in2, in3 (in place)
1923  Return Type - as per input vector RTYPE
1924  Details : Each element of vector 'in0' is left shifted by 'shift' and
1925  result is in place written to 'in0'
1926  Similar for other pairs
1927 */
1928 #define SLLI_2V(in0, in1, shift) \
1929 { \
1930  in0 = in0 << shift; \
1931  in1 = in1 << shift; \
1932 }
1933 #define SLLI_4V(in0, in1, in2, in3, shift) \
1934 { \
1935  in0 = in0 << shift; \
1936  in1 = in1 << shift; \
1937  in2 = in2 << shift; \
1938  in3 = in3 << shift; \
1939 }
1940 
1941 /* Description : Arithmetic shift right all elements of vector
1942  (generic for all data types)
1943  Arguments : Inputs - in0, in1, in2, in3, shift
1944  Outputs - in0, in1, in2, in3 (in place)
1945  Return Type - as per input vector RTYPE
1946  Details : Each element of vector 'in0' is right shifted by 'shift' and
1947  result is in place written to 'in0'
1948  Here, 'shift' is a GP variable passed in
1949  Similar for other pairs
1950 */
1951 #define SRA_4V(in0, in1, in2, in3, shift) \
1952 { \
1953  in0 = in0 >> shift; \
1954  in1 = in1 >> shift; \
1955  in2 = in2 >> shift; \
1956  in3 = in3 >> shift; \
1957 }
1958 
1959 /* Description : Shift right logical all halfword elements of vector
1960  Arguments : Inputs - in0, in1, in2, in3, shift
1961  Outputs - in0, in1, in2, in3 (in place)
1962  Return Type - as per RTYPE
1963  Details : Each element of vector 'in0' is shifted right logical by
1964  number of bits respective element holds in vector 'shift' and
1965  result is in place written to 'in0'
1966  Here, 'shift' is a vector passed in
1967  Similar for other pairs
1968 */
1969 #define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \
1970 { \
1971  in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \
1972  in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \
1973  in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \
1974  in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \
1975 }
1976 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1977 
1978 #define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
1979 { \
1980  in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift); \
1981  in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift); \
1982  in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift); \
1983  in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift); \
1984 }
1985 #define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
1986 #define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)
1987 
1988 #define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
1989 { \
1990  SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
1991  SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
1992 }
1993 #define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
1994 #define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1995 
1996 /* Description : Shift right arithmetic rounded halfwords
1997  Arguments : Inputs - in0, in1, shift
1998  Outputs - in0, in1, (in place)
1999  Return Type - as per RTYPE
2000  Details : Each element of vector 'in0' is shifted right arithmetic by
2001  number of bits respective element holds in vector 'shift'.
2002  The last discarded bit is added to shifted value for rounding
2003  and the result is in place written to 'in0'
2004  Here, 'shift' is a vector passed in
2005  Similar for other pairs
2006 */
2007 #define SRAR_H2(RTYPE, in0, in1, shift) \
2008 { \
2009  in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift); \
2010  in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift); \
2011 }
2012 #define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
2013 #define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2014 
2015 #define SRAR_H3(RTYPE, in0, in1, in2, shift) \
2016 { \
2017  SRAR_H2(RTYPE, in0, in1, shift) \
2018  in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift); \
2019 }
2020 #define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2021 
2022 #define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
2023 { \
2024  SRAR_H2(RTYPE, in0, in1, shift) \
2025  SRAR_H2(RTYPE, in2, in3, shift) \
2026 }
2027 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
2028 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2029 
2030 /* Description : Shift right arithmetic rounded words
2031  Arguments : Inputs - in0, in1, shift
2032  Outputs - in0, in1, (in place)
2033  Return Type - as per RTYPE
2034  Details : Each element of vector 'in0' is shifted right arithmetic by
2035  number of bits respective element holds in vector 'shift'.
2036  The last discarded bit is added to shifted value for rounding
2037  and the result is in place written to 'in0'
2038  Here, 'shift' is a vector passed in
2039  Similar for other pairs
2040 */
2041 #define SRAR_W2(RTYPE, in0, in1, shift) \
2042 { \
2043  in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift); \
2044  in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift); \
2045 }
2046 #define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2047 
2048 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
2049 { \
2050  SRAR_W2(RTYPE, in0, in1, shift) \
2051  SRAR_W2(RTYPE, in2, in3, shift) \
2052 }
2053 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2054 
2055 /* Description : Shift right arithmetic rounded (immediate)
2056  Arguments : Inputs - in0, in1, in2, in3, shift
2057  Outputs - in0, in1, in2, in3 (in place)
2058  Return Type - as per RTYPE
2059  Details : Each element of vector 'in0' is shifted right arithmetic by
2060  value in 'shift'.
2061  The last discarded bit is added to shifted value for rounding
2062  and the result is in place written to 'in0'
2063  Similar for other pairs
2064 */
2065 #define SRARI_H2(RTYPE, in0, in1, shift) \
2066 { \
2067  in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
2068  in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
2069 }
2070 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
2071 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2072 
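/* Example : Illustrative sketch (hypothetical names). After filtering
             with 6 bit coefficients, SRARI_H2 divides by 64 with
             rounding: (a + 32) >> 6 per element.
*/
static inline void example_round_shift6(v8i16 *acc0, v8i16 *acc1)
{
    v8i16 a0 = *acc0;
    v8i16 a1 = *acc1;

    SRARI_H2_SH(a0, a1, 6);
    *acc0 = a0;
    *acc1 = a1;
}
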
2073 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
2074 { \
2075  SRARI_H2(RTYPE, in0, in1, shift); \
2076  SRARI_H2(RTYPE, in2, in3, shift); \
2077 }
2078 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
2079 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2080 
2081 /* Description : Shift right arithmetic rounded (immediate)
2082  Arguments : Inputs - in0, in1, shift
2083  Outputs - in0, in1 (in place)
2084  Return Type - as per RTYPE
2085  Details : Each element of vector 'in0' is shifted right arithmetic by
2086  value in 'shift'.
2087  The last discarded bit is added to shifted value for rounding
2088  and the result is in place written to 'in0'
2089  Similar for other pairs
2090 */
2091 #define SRARI_W2(RTYPE, in0, in1, shift) \
2092 { \
2093  in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
2094  in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
2095 }
2096 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2097 
2098 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
2099 { \
2100  SRARI_W2(RTYPE, in0, in1, shift); \
2101  SRARI_W2(RTYPE, in2, in3, shift); \
2102 }
2103 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
2104 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2105 
2106 /* Description : Multiplication of pairs of vectors
2107  Arguments : Inputs - in0, in1, in2, in3
2108  Outputs - out0, out1
2109  Details : Each element from 'in0' is multiplied with the corresponding
2110  element from 'in1' and the result is written to 'out0'
2111  Similar for other pairs
2112 */
2113 #define MUL2(in0, in1, in2, in3, out0, out1) \
2114 { \
2115  out0 = in0 * in1; \
2116  out1 = in2 * in3; \
2117 }
2118 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2119 { \
2120  MUL2(in0, in1, in2, in3, out0, out1); \
2121  MUL2(in4, in5, in6, in7, out2, out3); \
2122 }
2123 
2124 /* Description : Addition of 2 pairs of vectors
2125  Arguments : Inputs - in0, in1, in2, in3
2126  Outputs - out0, out1
2127  Details : Elements from each of the 2 vector pairs are added and 2
2128  result vectors are produced
2129 */
2130 #define ADD2(in0, in1, in2, in3, out0, out1) \
2131 { \
2132  out0 = in0 + in1; \
2133  out1 = in2 + in3; \
2134 }
2135 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2136 { \
2137  ADD2(in0, in1, in2, in3, out0, out1); \
2138  ADD2(in4, in5, in6, in7, out2, out3); \
2139 }
2140 
2141 /* Description : Subtraction of 2 pairs of vectors
2142  Arguments : Inputs - in0, in1, in2, in3
2143  Outputs - out0, out1
2144  Details : Elements from each of the 2 vector pairs are subtracted and
2145  2 result vectors are produced
2146 */
2147 #define SUB2(in0, in1, in2, in3, out0, out1) \
2148 { \
2149  out0 = in0 - in1; \
2150  out1 = in2 - in3; \
2151 }
2152 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2153 { \
2154  out0 = in0 - in1; \
2155  out1 = in2 - in3; \
2156  out2 = in4 - in5; \
2157  out3 = in6 - in7; \
2158 }
2159 
2160 /* Description : Sign extend byte elements from right half of the vector
2161  Arguments : Input - in (byte vector)
2162  Output - out (sign extended halfword vector)
2163  Return Type - signed halfword
2164  Details : Sign bit of byte elements from input vector 'in' is
2165  extracted and interleaved with same vector 'in' to generate
2166  8 halfword elements keeping sign intact
2167 */
2168 #define UNPCK_R_SB_SH(in, out) \
2169 { \
2170  v16i8 sign_m; \
2171  \
2172  sign_m = __msa_clti_s_b((v16i8) in, 0); \
2173  out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in); \
2174 }
2175 
2176 /* Description : Sign extend halfword elements from right half of the vector
2177  Arguments : Inputs - in (input halfword vector)
2178  Outputs - out (sign extended word vectors)
2179  Return Type - signed word
2180  Details : Sign bit of halfword elements from input vector 'in' is
2181  extracted and interleaved with same vector 'in' to generate
2182  4 word elements keeping sign intact
2183 */
2184 #if HAVE_MSA2
2185 #define UNPCK_R_SH_SW(in, out) \
2186 { \
2187  out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
2188 }
2189 #else
2190 #define UNPCK_R_SH_SW(in, out) \
2191 { \
2192  v8i16 sign_m; \
2193  \
2194  sign_m = __msa_clti_s_h((v8i16) in, 0); \
2195  out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
2196 }
2197 #endif // #if HAVE_MSA2
2198 
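/* Example : Illustrative sketch (hypothetical names). The four
             right-most halfword elements are sign extended to words
             before being accumulated in 32 bits.
*/
static inline v4i32 example_accumulate_low4(v8i16 in, v4i32 acc)
{
    v4i32 widened;

    UNPCK_R_SH_SW(in, widened);    /* widened = sign extended in[0..3] */
    return acc + widened;
}
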
2199 /* Description : Sign extend byte elements from input vector and return
2200  halfword results in pair of vectors
2201  Arguments : Inputs - in (1 input byte vector)
2202  Outputs - out0, out1 (sign extended 2 halfword vectors)
2203  Return Type - signed halfword
2204  Details : Sign bit of byte elements from input vector 'in' is
2205  extracted and interleaved right with same vector 'in' to
2206  generate 8 signed halfword elements in 'out0'
2207  Then interleaved left with same vector 'in' to
2208  generate 8 signed halfword elements in 'out1'
2209 */
2210 #if HAVE_MSA2
2211 #define UNPCK_SB_SH(in, out0, out1) \
2212 { \
2213  out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
2214  out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
2215 }
2216 #else
2217 #define UNPCK_SB_SH(in, out0, out1) \
2218 { \
2219  v16i8 tmp_m; \
2220  \
2221  tmp_m = __msa_clti_s_b((v16i8) in, 0); \
2222  ILVRL_B2_SH(tmp_m, in, out0, out1); \
2223 }
2224 #endif // #if HAVE_MSA2
2225 
2226 /* Description : Zero extend unsigned byte elements to halfword elements
2227  Arguments : Inputs - in (1 input unsigned byte vector)
2228  Outputs - out0, out1 (2 zero extended halfword vectors)
2229  Return Type - signed halfword
2230  Details : Zero extended right half of vector is returned in 'out0'
2231  Zero extended left half of vector is returned in 'out1'
2232 */
2233 #define UNPCK_UB_SH(in, out0, out1) \
2234 { \
2235  v16i8 zero_m = { 0 }; \
2236  \
2237  ILVRL_B2_SH(zero_m, in, out0, out1); \
2238 }
2239 
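/* Example : Illustrative sketch (hypothetical names). 16 pixels are
             zero extended to two halfword vectors so per-pixel sums
             cannot overflow the byte range.
*/
static inline void example_widen_pixels(const uint8_t *src,
                                        v8i16 *lo, v8i16 *hi)
{
    v16u8 px = LD_UB(src);

    UNPCK_UB_SH(px, *lo, *hi);     /* lo = px[0..7], hi = px[8..15] */
}
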
2240 /* Description : Sign extend halfword elements from input vector and return
2241  result in pair of vectors
2242  Arguments : Inputs - in (1 input halfword vector)
2243  Outputs - out0, out1 (sign extended 2 word vectors)
2244  Return Type - signed word
2245  Details : Sign bit of halfword elements from input vector 'in' is
2246  extracted and interleaved right with same vector 'in' to
2247  generate 4 signed word elements in 'out0'
2248  Then interleaved left with same vector 'in' to
2249  generate 4 signed word elements in 'out1'
2250 */
2251 #if HAVE_MSA2
2252 #define UNPCK_SH_SW(in, out0, out1) \
2253 { \
2254  out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
2255  out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \
2256 }
2257 #else
2258 #define UNPCK_SH_SW(in, out0, out1) \
2259 { \
2260  v8i16 tmp_m; \
2261  \
2262  tmp_m = __msa_clti_s_h((v8i16) in, 0); \
2263  ILVRL_H2_SW(tmp_m, in, out0, out1); \
2264 }
2265 #endif // #if HAVE_MSA2
2266 
2267 /* Description : Swap two variables
2268  Arguments : Inputs - in0, in1
2269  Outputs - in0, in1 (in-place)
2270  Details : Swapping of two input variables using xor
2271 */
2272 #define SWAP(in0, in1) \
2273 { \
2274  in0 = in0 ^ in1; \
2275  in1 = in0 ^ in1; \
2276  in0 = in0 ^ in1; \
2277 }
2278 
2279 /* Description : Butterfly of 4 input vectors
2280  Arguments : Inputs - in0, in1, in2, in3
2281  Outputs - out0, out1, out2, out3
2282  Details : Butterfly operation
2283 */
2284 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
2285 { \
2286  out0 = in0 + in3; \
2287  out1 = in1 + in2; \
2288  \
2289  out2 = in1 - in2; \
2290  out3 = in0 - in3; \
2291 }
2292 
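/* Example : Illustrative sketch (hypothetical names). BUTTERFLY_4 is
             the add/sub stage of 4 point transforms: sums come out in
             the first two outputs, matching differences in the last two.
*/
static inline void example_butterfly_stage(v8i16 a, v8i16 b,
                                           v8i16 c, v8i16 d,
                                           v8i16 out[4])
{
    /* out[0] = a+d, out[1] = b+c, out[2] = b-c, out[3] = a-d */
    BUTTERFLY_4(a, b, c, d, out[0], out[1], out[2], out[3]);
}
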
2293 /* Description : Butterfly of 8 input vectors
2294  Arguments : Inputs - in0 ... in7
2295  Outputs - out0 .. out7
2296  Details : Butterfly operation
2297 */
2298 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
2299  out0, out1, out2, out3, out4, out5, out6, out7) \
2300 { \
2301  out0 = in0 + in7; \
2302  out1 = in1 + in6; \
2303  out2 = in2 + in5; \
2304  out3 = in3 + in4; \
2305  \
2306  out4 = in3 - in4; \
2307  out5 = in2 - in5; \
2308  out6 = in1 - in6; \
2309  out7 = in0 - in7; \
2310 }
2311 
2312 /* Description : Butterfly of 16 input vectors
2313  Arguments : Inputs - in0 ... in15
2314  Outputs - out0 .. out15
2315  Details : Butterfly operation
2316 */
2317 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
2318  in8, in9, in10, in11, in12, in13, in14, in15, \
2319  out0, out1, out2, out3, out4, out5, out6, out7, \
2320  out8, out9, out10, out11, out12, out13, out14, out15) \
2321 { \
2322  out0 = in0 + in15; \
2323  out1 = in1 + in14; \
2324  out2 = in2 + in13; \
2325  out3 = in3 + in12; \
2326  out4 = in4 + in11; \
2327  out5 = in5 + in10; \
2328  out6 = in6 + in9; \
2329  out7 = in7 + in8; \
2330  \
2331  out8 = in7 - in8; \
2332  out9 = in6 - in9; \
2333  out10 = in5 - in10; \
2334  out11 = in4 - in11; \
2335  out12 = in3 - in12; \
2336  out13 = in2 - in13; \
2337  out14 = in1 - in14; \
2338  out15 = in0 - in15; \
2339 }
2340 
2341 /* Description : Transposes input 4x4 byte block
2342  Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block)
2343  Outputs - out0, out1, out2, out3 (output 4x4 byte block)
2344  Return Type - unsigned byte
2345  Details :
2346 */
2347 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
2348 { \
2349  v16i8 zero_m = { 0 }; \
2350  v16i8 s0_m, s1_m, s2_m, s3_m; \
2351  \
2352  ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
2353  ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
2354  \
2355  out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
2356  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
2357  out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
2358  out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
2359 }
2360 
2361 /* Description : Transposes input 8x4 byte block into 4x8
2362  Arguments : Inputs - in0 ... in7 (input 8x4 byte block)
2363  Outputs - out0, out1, out2, out3 (output 4x8 byte block)
2364  Return Type - as per RTYPE
2365  Details :
2366 */
2367 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2368  out0, out1, out2, out3) \
2369 { \
2370  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2371  \
2372  ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
2373  tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2374  ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
2375  \
2376  tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2377  ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
2378  \
2379  ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
2380  out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
2381  out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2382 }
2383 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
2384 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2385 
2386 /* Description : Transposes input 8x8 byte block
2387  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2388  (input 8x8 byte block)
2389  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2390  (output 8x8 byte block)
2391  Return Type - as per RTYPE
2392  Details :
2393 */
2394 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2395  out0, out1, out2, out3, out4, out5, out6, out7) \
2396 { \
2397  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2398  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2399  v16i8 zeros = { 0 }; \
2400  \
2401  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
2402  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2403  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
2404  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
2405  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
2406  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
2407  SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6, \
2408  8, out1, out3, out5, out7); \
2409 }
2410 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
2411 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2412 
2413 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
2414  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2415  in8, in9, in10, in11, in12, in13, in14, in15
2416  Outputs - out0, out1, out2, out3
2417  Return Type - unsigned byte
2418  Details :
2419 */
2420 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2421  in8, in9, in10, in11, in12, in13, in14, in15, \
2422  out0, out1, out2, out3) \
2423 { \
2424  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2425  \
2426  ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
2427  out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2428  \
2429  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
2430  out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2431  \
2432  ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
2433  \
2434  tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2435  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
2436  \
2437  tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2438  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
2439  out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2440  out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2441  \
2442  tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
2443  tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
2444  out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2445  out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2446 }
2447 
2448 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
2449  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2450  in8, in9, in10, in11, in12, in13, in14, in15
2451  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2452  Return Type - unsigned byte
2453  Details :
2454 */
2455 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2456  in8, in9, in10, in11, in12, in13, in14, in15, \
2457  out0, out1, out2, out3, out4, out5, out6, out7) \
2458 { \
2459  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2460  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2461  \
2462  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
2463  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
2464  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
2465  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
2466  \
2467  tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
2468  tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
2469  tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
2470  tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
2471  out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
2472  tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
2473  out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
2474  tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
2475  \
2476  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
2477  out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2478  out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2479  \
2480  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2481  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
2482  out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2483  out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2484  \
2485  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
2486  out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2487  out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2488  \
2489  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
2490  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
2491  out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2492  out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2493 }
2494 
2495 /* Description : Transposes 4x4 block with half word elements in vectors
2496  Arguments : Inputs - in0, in1, in2, in3
2497  Outputs - out0, out1, out2, out3
2498  Return Type - signed halfword
2499  Details :
2500 */
2501 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
2502 { \
2503  v8i16 s0_m, s1_m; \
2504  \
2505  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
2506  ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
2507  out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
2508  out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2509 }
2510 
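/* Example : Illustrative sketch (hypothetical names). A 4x4 halfword
             block is transposed in registers. Each LD_SH reads 8
             halfwords, so 'blk' is assumed to sit in a buffer with at
             least 4 halfwords of padding past the block.
*/
static inline void example_transpose_4x4(int16_t *blk)
{
    v8i16 r0, r1, r2, r3, c0, c1, c2, c3;

    r0 = LD_SH(blk);
    r1 = LD_SH(blk + 4);
    r2 = LD_SH(blk + 8);
    r3 = LD_SH(blk + 12);
    /* rows become columns; only the low 4 elements of each are valid */
    TRANSPOSE4x4_SH_SH(r0, r1, r2, r3, c0, c1, c2, c3);
    SD(__msa_copy_u_d((v2i64) c0, 0), blk);
    SD(__msa_copy_u_d((v2i64) c1, 0), blk + 4);
    SD(__msa_copy_u_d((v2i64) c2, 0), blk + 8);
    SD(__msa_copy_u_d((v2i64) c3, 0), blk + 12);
}
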
2511 /* Description : Transposes 8x8 block with half word elements in vectors
2512  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2513  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2514  Return Type - as per RTYPE
2515  Details :
2516 */
2517 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2518  out0, out1, out2, out3, out4, out5, out6, out7) \
2519 { \
2520  v8i16 s0_m, s1_m; \
2521  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2522  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2523  \
2524  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2525  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
2526  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2527  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
2528  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2529  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
2530  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2531  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
2532  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
2533  tmp3_m, tmp7_m, out0, out2, out4, out6); \
2534  out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
2535  out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
2536  out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
2537  out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
2538 }
2539 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
2540 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2541 
2542 /* Description : Transposes 4x4 block with word elements in vectors
2543  Arguments : Inputs - in0, in1, in2, in3
2544  Outputs - out0, out1, out2, out3
2545  Return Type - signed word
2546  Details :
2547 */
2548 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
2549 { \
2550  v4i32 s0_m, s1_m, s2_m, s3_m; \
2551  \
2552  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
2553  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
2554  \
2555  out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
2556  out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
2557  out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
2558  out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
2559 }
2560 
2561 /* Description : Average byte elements from pair of vectors and store 8x4 byte
2562  block in destination memory
2563  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2564  Details : Byte elements from input vector pair 'in0' and 'in1' are
2565  averaged (a + b)/2 and stored in 'tmp0_m'
2566  Byte elements from input vector pair 'in2' and 'in3' are
2567  averaged (a + b)/2 and stored in 'tmp1_m'
2568  Byte elements from input vector pair 'in4' and 'in5' are
2569  averaged (a + b)/2 and stored in 'tmp2_m'
2570  Byte elements from input vector pair 'in6' and 'in7' are
2571  averaged (a + b)/2 and stored in 'tmp3_m'
2572  The half vector results from all 4 pairs are stored in
2573  destination memory as an 8x4 byte block
2574 */
2575 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2576 { \
2577  uint64_t out0_m, out1_m, out2_m, out3_m; \
2578  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2579  \
2580  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2581  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2582  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2583  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2584  \
2585  out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
2586  out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
2587  out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
2588  out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
2589  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2590 }
2591 
2592 /* Description : Average byte elements from pair of vectors and store 16x4 byte
2593  block in destination memory
2594  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2595  Details : Byte elements from input vector pair 'in0' and 'in1' are
2596  averaged (a + b)/2 and stored in 'tmp0_m'
2597  Byte elements from input vector pair 'in2' and 'in3' are
2598  averaged (a + b)/2 and stored in 'tmp1_m'
2599  Byte elements from input vector pair 'in4' and 'in5' are
2600  averaged (a + b)/2 and stored in 'tmp2_m'
2601  Byte elements from input vector pair 'in6' and 'in7' are
2602  averaged (a + b)/2 and stored in 'tmp3_m'
2603  The results from all 4 pairs are stored in destination
2604  memory as a 16x4 byte block
2605 */
2606 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2607 { \
2608  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2609  \
2610  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2611  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2612  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2613  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2614  \
2615  ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
2616 }
2617 
2618 /* Description : Average rounded byte elements from pair of vectors and store
2619  8x4 byte block in destination memory
2620  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2621  Details : Byte elements from input vector pair 'in0' and 'in1' are
2622  average rounded (a + b + 1)/2 and stored in 'tp0_m'
2623  Byte elements from input vector pair 'in2' and 'in3' are
2624  average rounded (a + b + 1)/2 and stored in 'tp1_m'
2625  Byte elements from input vector pair 'in4' and 'in5' are
2626  average rounded (a + b + 1)/2 and stored in 'tp2_m'
2627  Byte elements from input vector pair 'in6' and 'in7' are
2628  average rounded (a + b + 1)/2 and stored in 'tp3_m'
2629  The half vector results from all 4 pairs are stored in
2630  destination memory as an 8x4 byte block
2631 */
2632 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2633 { \
2634  uint64_t out0_m, out1_m, out2_m, out3_m; \
2635  v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \
2636  \
2637  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2638  tp0_m, tp1_m, tp2_m, tp3_m); \
2639  \
2640  out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \
2641  out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \
2642  out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \
2643  out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \
2644  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2645 }
2646 
2647 /* Description : Average rounded byte elements from pair of vectors and store
2648  16x4 byte block in destination memory
2649  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2650  Details : Byte elements from input vector pair 'in0' and 'in1' are
2651  average rounded (a + b + 1)/2 and stored in 't0_m'
2652  Byte elements from input vector pair 'in2' and 'in3' are
2653  average rounded (a + b + 1)/2 and stored in 't1_m'
2654  Byte elements from input vector pair 'in4' and 'in5' are
2655  average rounded (a + b + 1)/2 and stored in 't2_m'
2656  Byte elements from input vector pair 'in6' and 'in7' are
2657  average rounded (a + b + 1)/2 and stored in 't3_m'
2658  The vector results from all 4 pairs are stored in
2659  destination memory as a 16x4 byte block
2660 */
2661 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2662 { \
2663  v16u8 t0_m, t1_m, t2_m, t3_m; \
2664  \
2665  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2666  t0_m, t1_m, t2_m, t3_m); \
2667  ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \
2668 }
2669 
2670 /* Description : Average rounded byte elements from pair of vectors,
2671  average rounded with destination and store 8x4 byte block
2672  in destination memory
2673  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2674  Details : Byte elements from input vector pair 'in0' and 'in1' are
2675  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2676  Byte elements from input vector pair 'in2' and 'in3' are
2677  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2678  Byte elements from input vector pair 'in4' and 'in5' are
2679  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2680  Byte elements from input vector pair 'in6' and 'in7' are
2681  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2682  These results are average rounded again with the destination
2683  block and the half vector results are stored as an 8x4 byte block
2684 */
2685 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2686  pdst, stride) \
2687 { \
2688  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2689  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2690  \
2691  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2692  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2693  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2694  AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2695  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2696 }
2697 
2698 /* Description : Average rounded byte elements from pair of vectors,
2699  average rounded with destination and store 16x4 byte block
2700  in destination memory
2701  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2702  Details : Byte elements from input vector pair 'in0' and 'in1' are
2703  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2704  Byte elements from input vector pair 'in2' and 'in3' are
2705  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2706  Byte elements from input vector pair 'in4' and 'in5' are
2707  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2708  Byte elements from input vector pair 'in6' and 'in7' are
2709  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2710  These results are average rounded again with the destination
2711  block and stored in memory as a 16x4 byte block
2712 */
2713 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2714  pdst, stride) \
2715 { \
2716  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2717  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2718  \
2719  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2720  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2721  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2722  AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2723  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2724 }
2725 
2726 /* Description : Add block 4x4
2727  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2728  Details : The 4 least significant halfword elements of each input vector
2729  are added to the destination bytes, clipped to 0-255 and then stored.
2730 */
2731 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2732 { \
2733  uint32_t src0_m, src1_m, src2_m, src3_m; \
2734  uint32_t out0_m, out1_m, out2_m, out3_m; \
2735  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
2736  v16i8 dst0_m = { 0 }; \
2737  v16i8 dst1_m = { 0 }; \
2738  v16i8 zero_m = { 0 }; \
2739  \
2740  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
2741  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
2742  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
2743  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
2744  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
2745  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
2746  CLIP_SH2_0_255(res0_m, res1_m); \
2747  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
2748  \
2749  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \
2750  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \
2751  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \
2752  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \
2753  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2754 }
2755 
2756 /* Description : Dot product and addition of 3 pairs of signed byte vectors
2757  Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
2758  Outputs - out0_m
2759  Return Type - signed halfword
2760  Details : Dot product of 'in0' with 'coeff0'
2761  Dot product of 'in1' with 'coeff1'
2762  Dot product of 'in2' with 'coeff2'
2763  Addition of all the 3 vector results
2764 
2765  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
2766 */
2767 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
2768 ( { \
2769  v8i16 out0_m; \
2770  \
2771  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
2772  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
2773  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
2774  \
2775  out0_m; \
2776 } )
2777 
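/* Example : Illustrative sketch (hypothetical names). Three shuffled
             source byte vectors are combined with three coefficient
             byte vectors; each output halfword accumulates the three
             dot products.
*/
static inline v8i16 example_3x_dotp(v16i8 vec0, v16i8 vec1, v16i8 vec2,
                                    v16i8 cf0, v16i8 cf1, v16i8 cf2)
{
    return DPADD_SH3_SH(vec0, vec1, vec2, cf0, cf1, cf2);
}
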
2778 /* Description : Pack even elements of input vectors & xor with 128
2779  Arguments : Inputs - in0, in1
2780  Outputs - out_m
2781  Return Type - unsigned byte
2782  Details : Signed byte even elements from 'in0' and 'in1' are packed
2783  together in one vector and the resulting vector is xor'ed with
2784  128 to shift the range from signed to unsigned byte
2785 */
2786 #define PCKEV_XORI128_UB(in0, in1) \
2787 ( { \
2788  v16u8 out_m; \
2789  out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2790  out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \
2791  out_m; \
2792 } )
2793 
2794 /* Description : Convert inputs to unsigned bytes, interleave, average & store
2795  as 8x4 unsigned byte block
2796  Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
2797 */
2798 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
2799  dst0, dst1, pdst, stride) \
2800 { \
2801  v16u8 tmp0_m, tmp1_m; \
2802  uint8_t *pdst_m = (uint8_t *) (pdst); \
2803  \
2804  tmp0_m = PCKEV_XORI128_UB(in0, in1); \
2805  tmp1_m = PCKEV_XORI128_UB(in2, in3); \
2806  AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
2807  ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
2808 }
2809 
2810 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
2811  of results and store 4 words in destination memory as per
2812  stride
2813  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2814 */
2815 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2816 { \
2817  uint32_t out0_m, out1_m, out2_m, out3_m; \
2818  v16i8 tmp0_m, tmp1_m; \
2819  \
2820  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
2821  \
2822  out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
2823  out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
2824  out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
2825  out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
2826  \
2827  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2828 }
2829 
2830 /* Description : Pack even byte elements and store byte vector in destination
2831  memory
2832  Arguments : Inputs - in0, in1, pdst
2833 */
2834 #define PCKEV_ST_SB(in0, in1, pdst) \
2835 { \
2836  v16i8 tmp_m; \
2837  tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2838  ST_SB(tmp_m, (pdst)); \
2839 }
2840 
2841 /* Description : Horizontal 2 tap filter kernel code
2842  Arguments : Inputs - in0, in1, mask, coeff, shift
2843 */
2844 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
2845 ( { \
2846  v16i8 tmp0_m; \
2847  v8u16 tmp1_m; \
2848  \
2849  tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \
2850  tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \
2851  tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \
2852  tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
2853  \
2854  tmp1_m; \
2855 } )
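
/* Example : Illustrative sketch (hypothetical names; the shift value 7
             follows common FILTER_BITS usage and is an assumption here).
             One row is filtered with a 2 tap horizontal kernel: 'mask'
             selects adjacent byte pairs, 'coeff' holds the two taps.
*/
static inline v8u16 example_2tap_filter_row(const uint8_t *src,
                                            v16i8 mask, v16u8 coeff)
{
    v16u8 in = LD_UB(src);

    return HORIZ_2TAP_FILT_UH(in, in, mask, coeff, 7);
}
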
2856 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */