generic_macros_msa.h
/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>

#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)

#if (__mips_isa_rev >= 6)
    #define LW(psrc)                              \
    ( {                                           \
        uint8_t *psrc_m = (uint8_t *) (psrc);     \
        uint32_t val_m;                           \
                                                  \
        __asm__ volatile (                        \
            "lw  %[val_m],  %[psrc_m]  \n\t"      \
                                                  \
            : [val_m] "=r" (val_m)                \
            : [psrc_m] "m" (*psrc_m)              \
        );                                        \
                                                  \
        val_m;                                    \
    } )

    #if (__mips == 64)
        #define LD(psrc)                          \
        ( {                                       \
            uint8_t *psrc_m = (uint8_t *) (psrc); \
            uint64_t val_m = 0;                   \
                                                  \
            __asm__ volatile (                    \
                "ld  %[val_m],  %[psrc_m]  \n\t"  \
                                                  \
                : [val_m] "=r" (val_m)            \
                : [psrc_m] "m" (*psrc_m)          \
            );                                    \
                                                  \
            val_m;                                \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_ld_m);                                   \
            val1_m = LW(psrc_ld_m + 4);                               \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint16_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "sh  %[val_m],  %[pdst_m]  \n\t"      \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    #define SW(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint32_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "sw  %[val_m],  %[pdst_m]  \n\t"      \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    #define SD(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint64_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "sd  %[val_m],  %[pdst_m]  \n\t"      \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }
#else  // !(__mips_isa_rev >= 6)
    #define LW(psrc)                              \
    ( {                                           \
        uint8_t *psrc_m = (uint8_t *) (psrc);     \
        uint32_t val_m;                           \
                                                  \
        __asm__ volatile (                        \
            "ulw  %[val_m],  %[psrc_m]  \n\t"     \
                                                  \
            : [val_m] "=r" (val_m)                \
            : [psrc_m] "m" (*psrc_m)              \
        );                                        \
                                                  \
        val_m;                                    \
    } )

    #if (__mips == 64)
        #define LD(psrc)                          \
        ( {                                       \
            uint8_t *psrc_m = (uint8_t *) (psrc); \
            uint64_t val_m = 0;                   \
                                                  \
            __asm__ volatile (                    \
                "uld  %[val_m],  %[psrc_m]  \n\t" \
                                                  \
                : [val_m] "=r" (val_m)            \
                : [psrc_m] "m" (*psrc_m)          \
            );                                    \
                                                  \
            val_m;                                \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_ld_m);                                   \
            val1_m = LW(psrc_ld_m + 4);                               \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint16_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "ush  %[val_m],  %[pdst_m]  \n\t"     \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    #define SW(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint32_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "usw  %[val_m],  %[pdst_m]  \n\t"     \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    #define SD(val, pdst)                                          \
    {                                                               \
        uint8_t *pdst_m1 = (uint8_t *) (pdst);                      \
        uint32_t val0_m, val1_m;                                    \
                                                                    \
        val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);           \
        val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);   \
                                                                    \
        SW(val0_m, pdst_m1);                                        \
        SW(val1_m, pdst_m1 + 4);                                    \
    }
#endif // (__mips_isa_rev >= 6)
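
/* Usage sketch (illustrative, not part of the original header): copying one
   32-bit value through possibly unaligned byte pointers; 'src' and 'dst'
   are assumed caller-provided uint8_t pointers:

       uint32_t tmp;

       tmp = LW(src);    // unaligned-safe 32-bit load
       SW(tmp, dst);     // unaligned-safe 32-bit store
*/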

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
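
/* Usage sketch (illustrative, not part of the original header): copying an
   8x4 byte block with the scalar double-word helpers; 'src'/'dst' and the
   strides are assumed caller-provided:

       uint64_t row0, row1, row2, row3;

       LD4(src, src_stride, row0, row1, row2, row3);
       SD4(row0, row1, row2, row3, dst, dst_stride);
*/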

/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads 16 byte elements in 'out0' from (psrc)
                 Loads 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_B2(RTYPE, (psrc), stride, out0, out1);               \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
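
/* Usage sketch (illustrative, not part of the original header): loading the
   eight source rows needed by an 8-tap vertical filter; 'src' and 'stride'
   are assumed caller-provided:

       v16u8 s0, s1, s2, s3, s4, s5, s6, s7;

       LD_UB8(src, stride, s0, s1, s2, s3, s4, s5, s6, s7);
*/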

/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads 8 halfword elements in 'out0' from (psrc)
                 Loads 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_H(RTYPE, (psrc));                 \
    out1 = LD_H(RTYPE, (psrc) + (stride));      \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_H8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_H8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)

/* Description : Load a 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (each vector with 4 signed halfwords)
   Arguments   : Inputs  - psrc
                 Outputs - out0, out1, out2, out3
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
{                                                             \
    out0 = LD_SH(psrc);                                       \
    out2 = LD_SH(psrc + 8);                                   \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
}
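
/* Usage sketch (illustrative, not part of the original header): fetching a
   4x4 transform coefficient block stored contiguously; 'coeffs' is assumed
   to point at 16 contiguous int16_t values:

       v8i16 r0, r1, r2, r3;

       LD4x4_SH(coeffs, r0, r1, r2, r3);
       // each of r0..r3 holds one row of 4 coefficients in its low 64 bits
*/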

/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - signed word
*/
#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + stride);        \
}

/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores 16 byte elements from 'in0' to (pdst)
                 Stores 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride)                                         \
{                                                                   \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
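
/* Usage sketch (illustrative, not part of the original header): copying a
   16x4 byte block by pairing the strided vector loads and stores above;
   'src', 'dst' and the strides are assumed caller-provided:

       v16u8 b0, b1, b2, b3;

       LD_UB4(src, src_stride, b0, b1, b2, b3);
       ST_UB4(b0, b1, b2, b3, dst, dst_stride);
*/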

/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores 8 halfword elements from 'in0' to (pdst)
                 Stores 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_H(RTYPE, in0, (pdst));                 \
    ST_H(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_H2(RTYPE, in0, in1, (pdst), stride);               \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)

/* Description : Store vectors of word elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
                 Return Type - signed word
   Details     : Stores 4 word elements from 'in0' to (pdst)
                 Stores 4 word elements from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + stride);        \
}
#define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7,  \
               pdst, stride)                            \
{                                                       \
    ST_SW2(in0, in1, (pdst), stride);                   \
    ST_SW2(in2, in3, (pdst) + 2 * stride, stride);      \
    ST_SW2(in4, in5, (pdst) + 4 * stride, stride);      \
    ST_SW2(in6, in7, (pdst) + 6 * stride, stride);      \
}

/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
                 Return Type - unsigned byte
   Details     : Halfword element at index 'stidx' from vector 'in' is copied
                 and stored on the first line
                 Halfword element at index 'stidx+1' from vector 'in' is
                 copied and stored on the second line
                 Halfword element at index 'stidx+2' from vector 'in' is
                 copied and stored on the third line
                 Halfword element at index 'stidx+3' from vector 'in' is
                 copied and stored on the fourth line
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}

/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element at index 0 from the input vector is copied and
                 stored on the first line
                 Word element at index 1 from the input vector is copied and
                 stored on the second line
*/
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}

/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element at index 'idx0' from input vector 'in0' is
                 copied and stored on the first line
                 Word element at index 'idx1' from input vector 'in0' is
                 copied and stored on the second line
                 Word element at index 'idx2' from input vector 'in1' is
                 copied and stored on the third line
                 Word element at index 'idx3' from input vector 'in1' is
                 copied and stored on the fourth line
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}
#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
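
/* Usage sketch (illustrative, not part of the original header): writing an
   8-row, 4-byte-wide result (e.g. a 4x8 prediction block) from two vectors
   whose word elements hold consecutive output rows; 'dst' and 'stride' are
   assumed caller-provided:

       ST4x8_UB(res0, res1, dst, stride);
*/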

/* Description : Store 6x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element at index 0 from input vector 'in0' is copied
                 and stored on the first line, followed by the halfword
                 element at index 2
                 Word element at index 2 from input vector 'in0' is copied
                 and stored on the second line, followed by the halfword
                 element at index 6
                 Word element at index 0 from input vector 'in1' is copied
                 and stored on the third line, followed by the halfword
                 element at index 2
                 Word element at index 2 from input vector 'in1' is copied
                 and stored on the fourth line, followed by the halfword
                 element at index 6
*/
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}

/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Double word element at index 0 from input vector 'in' is
                 copied and stored to destination memory at (pdst)
*/
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}

/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Double word element at index 0 from input vector 'in' is
                 copied and stored to destination memory at (pdst)
                 Double word element at index 1 from input vector 'in' is
                 copied and stored to destination memory at (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}

/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Double word element at index 0 from input vector 'in0' is
                 copied and stored to destination memory at (pblk_8x4_m)
                 Double word element at index 1 from input vector 'in0' is
                 copied and stored to destination memory at (pblk_8x4_m + stride)
                 Double word element at index 0 from input vector 'in1' is
                 copied and stored to destination memory at (pblk_8x4_m + 2 * stride)
                 Double word element at index 1 from input vector 'in1' is
                 copied and stored to destination memory at (pblk_8x4_m + 3 * stride)
*/
#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}
#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    /* left 8x4 */                                            \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    /* right 4x4 */                                           \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
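
/* Usage sketch (illustrative, not part of the original header): storing an
   8x4 block from two vectors whose double word elements hold consecutive
   8-byte output rows; 'dst' and 'stride' are assumed caller-provided:

       ST8x4_UB(out0, out1, dst, stride);
*/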

/* Description : Store 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Double word element at index 0 from input vector 'in0' is
                 copied and stored to destination memory at (pblk_12x8_m),
                 followed by the word element at index 2 from the same input
                 vector 'in0' at (pblk_12x8_m + 8)
                 Similar for the remaining lines
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m  = __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m  = __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}

/* Description : Average with rounding: (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' is added to the
                 corresponding unsigned byte element from 'in1'. The sum
                 plus 1 (for rounding) is computed unsigned with full
                 precision, i.e. the intermediate result has one extra bit,
                 then divided by 2 (logical shift right by one bit) before
                 being written to 'out0'
                 Similar for the pair 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
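
/* Usage sketch (illustrative, not part of the original header): rounding
   average of four row pairs, e.g. blending two motion-compensated
   references for bi-prediction; all vectors are assumed v16u8:

       AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                   avg0, avg1, avg2, avg3);
*/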

/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'zero_m' vector are slid into 'in0'
                 by the number of elements specified by 'slide_val'
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
{                                                       \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)

/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 the number of elements specified by 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
                out0, out1, out2, slide_val)                               \
{                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);     \
    out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selected byte elements from 'in0' & 'in1' are copied to
                 'out0' as per control vector 'mask0'
                 Selected byte elements from 'in2' & 'in3' are copied to
                 'out1' as per control vector 'mask1'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
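
/* Usage sketch (illustrative, not part of the original header): gathering
   the four overlapping tap windows of a horizontal filter from one source
   row; 'src' is assumed v16i8 and 'mask0'..'mask3' suitable shuffle masks:

       v16i8 vec0, vec1, vec2, vec3;

       VSHF_B4_SB(src, src, mask0, mask1, mask2, mask3,
                  vec0, vec1, vec2, vec3);
*/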

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selected halfword elements from 'in0' & 'in1' are copied
                 to 'out0' as per control vector 'mask0'
                 Selected halfword elements from 'in2' & 'in3' are copied
                 to 'out1' as per control vector 'mask1'
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selected word elements from 'in0' & 'in1' are copied to
                 'out0' as per control vector 'mask0'
                 Selected word elements from 'in2' & 'in3' are copied to
                 'out1' as per control vector 'mask1'
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - unsigned halfword
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored in the corresponding output vector
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed halfword
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored in the corresponding output vector
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
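
/* Usage sketch (illustrative, not part of the original header): first stage
   of a signed-tap filter, multiplying shuffled source windows by a vector
   of filter coefficients; 'vec0'/'vec1' are assumed v16i8 data windows and
   'filt0' a v16i8 coefficient vector:

       v8i16 sum0, sum1;

       DOTP_SB2_SH(vec0, vec1, filt0, filt0, sum0, sum1);
*/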

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added together and stored in the corresponding output vector
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed halfword
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 added to the corresponding element of the output vector
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - unsigned halfword
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 added to the corresponding element of the output vector
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added to the corresponding element of the output vector
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
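
/* Usage sketch (illustrative, not part of the original header): accumulating
   the later taps of a multi-tap filter onto first-stage dot products;
   'vec0'..'vec3' are assumed v16i8 data windows and 'filt0'/'filt1' v16i8
   coefficient vectors:

       v8i16 sum0, sum1;

       DOTP_SB2_SH(vec0, vec1, filt0, filt0, sum0, sum1);   // taps 0-1
       DPADD_SB2_SH(vec2, vec3, filt1, filt1, sum0, sum1);  // += taps 2-3
*/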

/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - unsigned halfword
   Details     : The minimum of the unsigned halfword element values from
                 'in0' and 'min_vec' is written back to vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in     (input vector)
                         - min    (min threshold)
                         - max    (max threshold)
                 Outputs - out_m  (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max)                           \
( {                                                     \
    v8i16 out_m;                                        \
                                                        \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
    out_m;                                              \
} )

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in     (input vector)
                 Outputs - out_m  (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}
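
/* Usage sketch (illustrative, not part of the original header): clamping
   four vectors of filtered halfword results to the 8-bit pixel range and
   packing them to bytes with the MSA pack intrinsic; 'res0'..'res3' are
   assumed v8i16 filter outputs:

       v16i8 row01, row23;

       CLIP_SH4_0_255(res0, res1, res2, res3);
       row01 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
       row23 = __msa_pckev_b((v16i8) res3, (v16i8) res2);
*/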

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in     (input vector)
                 Outputs - out_m  (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )

/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in     (signed word vector)
                 Outputs - sum_m  (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m = res0_m + res1_m;                         \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )

/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in     (unsigned halfword vector)
                 Outputs - sum_m  (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m = res0_m + res1_m;                            \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd signed byte element from 'in0' is added to the
                 adjacent even signed byte element (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element from 'in0' is added to the
                 adjacent even unsigned byte element (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even unsigned byte element from 'in0' is subtracted
                 from the adjacent odd unsigned byte element (pairwise) and
                 the halfword result is stored in 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Differences)
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute differences of all byte elements from 'in0'
                 with 'ref0' are calculated and kept in 'diff0_m'. From the
                 16 unsigned absolute-difference values, even-odd pairs are
                 added together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
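
/* Usage sketch (illustrative, not part of the original header): computing
   the SAD of two 16-byte rows against a reference and reducing the partial
   sums to a scalar; all row vectors are assumed v16u8:

       v8u16 sad;
       uint32_t sad_val;

       sad = SAD_UB2_UH(src0, src1, ref0, ref1);
       sad_val = HADD_UH_U32(sad);
*/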

/* Description : Insert specified word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
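
/* Usage sketch (illustrative, not part of the original header): gathering
   four strided 4-byte rows into one vector, a common way to vectorize 4xN
   blocks; 'src' and 'stride' are assumed caller-provided:

       uint32_t w0, w1, w2, w3;
       v16u8 blk = { 0 };

       LW4(src, stride, w0, w1, w2, w3);
       INSERT_W4_UB(w0, w1, w2, w3, blk);
*/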
1356 
1357 /* Description : Insert specified double word elements from input vectors to 1
1358  destination vector
1359  Arguments : Inputs - in0, in1 (2 input vectors)
1360  Outputs - out (output vector)
1361  Return Type - as per RTYPE
1362 */
1363 #define INSERT_D2(RTYPE, in0, in1, out) \
1364 { \
1365  out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0); \
1366  out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1); \
1367 }
1368 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
1369 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
1370 #define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1371 
1372 /* Description : Interleave even byte elements from vectors
1373  Arguments : Inputs - in0, in1, in2, in3
1374  Outputs - out0, out1
1375  Return Type - as per RTYPE
1376  Details : Even byte elements of 'in0' and even byte
1377  elements of 'in1' are interleaved and copied to 'out0'
1378  Even byte elements of 'in2' and even byte
1379  elements of 'in3' are interleaved and copied to 'out1'
1380 */
1381 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1382 { \
1383  out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
1384  out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
1385 }
1386 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
1387 #define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
1388 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
1389 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1390 
1391 /* Description : Interleave even halfword elements from vectors
1392  Arguments : Inputs - in0, in1, in2, in3
1393  Outputs - out0, out1
1394  Return Type - as per RTYPE
1395  Details : Even halfword elements of 'in0' and even halfword
1396  elements of 'in1' are interleaved and copied to 'out0'
1397  Even halfword elements of 'in2' and even halfword
1398  elements of 'in3' are interleaved and copied to 'out1'
1399 */
1400 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1401 { \
1402  out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
1403  out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
1404 }
1405 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
1406 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
1407 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1408 
1409 /* Description : Interleave even word elements from vectors
1410  Arguments : Inputs - in0, in1, in2, in3
1411  Outputs - out0, out1
1412  Return Type - as per RTYPE
1413  Details : Even word elements of 'in0' and even word
1414  elements of 'in1' are interleaved and copied to 'out0'
1415  Even word elements of 'in2' and even word
1416  elements of 'in3' are interleaved and copied to 'out1'
1417 */
1418 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1419 { \
1420  out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
1421  out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
1422 }
1423 #define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
1424 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1425 #define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
1426 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1427 
1428 /* Description : Interleave even double word elements from vectors
1429  Arguments : Inputs - in0, in1, in2, in3
1430  Outputs - out0, out1
1431  Return Type - as per RTYPE
1432  Details : Even double word elements of 'in0' and even double word
1433  elements of 'in1' are interleaved and copied to 'out0'
1434  Even double word elements of 'in2' and even double word
1435  elements of 'in3' are interleaved and copied to 'out1'
1436 */
1437 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1438 { \
1439  out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
1440  out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
1441 }
1442 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1443 #define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
1444 #define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1445 
1446 /* Description : Interleave left half of byte elements from vectors
1447  Arguments : Inputs - in0, in1, in2, in3
1448  Outputs - out0, out1
1449  Return Type - as per RTYPE
1450  Details : Left half of byte elements of in0 and left half of byte
1451  elements of in1 are interleaved and copied to out0.
1452  Left half of byte elements of in2 and left half of byte
1453  elements of in3 are interleaved and copied to out1.
1454 */
1455 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1456 { \
1457  out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1458  out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
1459 }
1460 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
1461 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
1462 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
1463 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1464 
1465 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1466  out0, out1, out2, out3) \
1467 { \
1468  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1469  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1470 }
1471 #define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
1472 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
1473 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1474 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1475 
1476 /* Description : Interleave left half of halfword elements from vectors
1477  Arguments : Inputs - in0, in1, in2, in3
1478  Outputs - out0, out1
1479  Return Type - as per RTYPE
1480  Details : Left half of halfword elements of in0 and left half of halfword
1481  elements of in1 are interleaved and copied to out0.
1482  Left half of halfword elements of in2 and left half of halfword
1483  elements of in3 are interleaved and copied to out1.
1484 */
1485 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1486 { \
1487  out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1488  out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
1489 }
1490 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
1491 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1492 
1493 #define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1494  out0, out1, out2, out3) \
1495 { \
1496  ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1497  ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1498 }
1499 #define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
1500 #define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1501 
1502 /* Description : Interleave left half of word elements from vectors
1503  Arguments : Inputs - in0, in1, in2, in3
1504  Outputs - out0, out1
1505  Return Type - as per RTYPE
1506  Details : Left half of word elements of in0 and left half of word
1507  elements of in1 are interleaved and copied to out0.
1508  Left half of word elements of in2 and left half of word
1509  elements of in3 are interleaved and copied to out1.
1510 */
1511 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1512 { \
1513  out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1514  out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
1515 }
1516 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
1517 #define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
1518 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1519 
1520 /* Description : Interleave right half of byte elements from vectors
1521  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1522  Outputs - out0, out1, out2, out3
1523  Return Type - as per RTYPE
1524  Details : Right half of byte elements of in0 and right half of byte
1525  elements of in1 are interleaved and copied to out0.
1526  Right half of byte elements of in2 and right half of byte
1527  elements of in3 are interleaved and copied to out1.
1528  Similar for other pairs
1529 */
1530 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1531 { \
1532  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1533  out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
1534 }
1535 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
1536 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
1537 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
1538 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1539 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1540 
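/* Example (editorial sketch; hypothetical names) : the right-half
   interleave is a common first step when pairing pixels for a
   widening operation

       v16u8 src0, src1, ref0, ref1, vec0, vec1;
       ...
       ILVR_B2_UB(src0, ref0, src1, ref1, vec0, vec1);

   vec0 = { ref0[0], src0[0], ref0[1], src0[1], ... ref0[7], src0[7] },
   ready for a widening dot product or horizontal add.
*/
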
1541 #define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1542 { \
1543  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1544  out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
1545 }
1546 #define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
1547 #define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
1548 #define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1549 
1550 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1551  out0, out1, out2, out3) \
1552 { \
1553  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1554  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1555 }
1556 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
1557 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
1558 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
1559 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1560 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1561 
1562 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1563  in8, in9, in10, in11, in12, in13, in14, in15, \
1564  out0, out1, out2, out3, out4, out5, out6, out7) \
1565 { \
1566  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1567  out0, out1, out2, out3); \
1568  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
1569  out4, out5, out6, out7); \
1570 }
1571 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1572 
1573 /* Description : Interleave right half of halfword elements from vectors
1574  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1575  Outputs - out0, out1, out2, out3
1576  Return Type - as per RTYPE
1577  Details : Right half of halfword elements of in0 and right half of
1578  halfword elements of in1 are interleaved and copied to out0.
1579  Right half of halfword elements of in2 and right half of
1580  halfword elements of in3 are interleaved and copied to out1.
1581  Similar for other pairs
1582 */
1583 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1584 { \
1585  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1586  out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
1587 }
1588 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
1589 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1590 
1591 #define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1592 { \
1593  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1594  out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \
1595 }
1596 #define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1597 
1598 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1599  out0, out1, out2, out3) \
1600 { \
1601  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1602  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1603 }
1604 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1605 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1606 
1607 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1608 { \
1609  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1610  out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
1611 }
1612 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
1613 #define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
1614 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1615 
1616 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1617  out0, out1, out2, out3) \
1618 { \
1619  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
1620  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
1621 }
1622 #define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1623 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1624 
1625 /* Description : Interleave right half of double word elements from vectors
1626  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1627  Outputs - out0, out1, out2, out3
1628  Return Type - as per RTYPE
1629  Details : Right half of double word elements of in0 and right half of
1630  double word elements of in1 are interleaved and copied to out0.
1631  Right half of double word elements of in2 and right half of
1632  double word elements of in3 are interleaved and copied to out1.
1633 */
1634 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1635 { \
1636  out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \
1637  out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \
1638 }
1639 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
1640 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
1641 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1642 
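/* Example (editorial sketch; hypothetical names) : ILVR_D2 packs two
   8-byte rows into one 16-byte vector

       v16u8 row0, row1, row2, row3;   (valid data in the low 8 bytes)
       v16u8 out0, out1;
       ...
       ILVR_D2_UB(row1, row0, row3, row2, out0, out1);

   out0 holds row0 in its low 8 bytes and row1 in its high 8 bytes;
   out1 likewise combines row2 and row3.
*/
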
1643 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1644 { \
1645  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1646  out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5)); \
1647 }
1648 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1649 
1650 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1651  out0, out1, out2, out3) \
1652 { \
1653  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1654  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1655 }
1656 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1657 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1658 
1659 /* Description : Interleave both left and right half of input vectors
1660  Arguments : Inputs - in0, in1
1661  Outputs - out0, out1
1662  Return Type - as per RTYPE
1663  Details : Right half of byte elements from 'in0' and 'in1' are
1664  interleaved and stored to 'out0'
1665  Left half of byte elements from 'in0' and 'in1' are
1666  interleaved and stored to 'out1'
1667 */
1668 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
1669 { \
1670  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1671  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1672 }
1673 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1674 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1675 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1676 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1677 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1678 
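/* Example (editorial sketch; hypothetical names) : one ILVRL_B2 call
   produces both interleaves of a vector pair

       v16i8 a, b, lo, hi;
       ...
       ILVRL_B2_SB(a, b, lo, hi);

   lo = { b[0], a[0], ... b[7], a[7] } and
   hi = { b[8], a[8], ... b[15], a[15] }.
*/
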
1679 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
1680 { \
1681  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1682  out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1683 }
1684 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1685 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1686 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1687 
1688 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
1689 { \
1690  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1691  out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1692 }
1693 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1694 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1695 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1696 
1697 /* Description : Maximum values between signed elements of vector and
1698  5-bit signed immediate value are copied to the output vector
1699  Arguments : Inputs - in0, in1, in2, in3, max_val
1700  Outputs - in0, in1, in2, in3 (in place)
1701  Return Type - as per RTYPE
1702  Details : Maximum of signed halfword element values from 'in0' and
1703  'max_val' are written to output vector 'in0'
1704 */
1705 #define MAXI_SH2(RTYPE, in0, in1, max_val) \
1706 { \
1707  in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \
1708  in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \
1709 }
1710 #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
1711 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1712 
1713 #define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
1714 { \
1715  MAXI_SH2(RTYPE, in0, in1, max_val); \
1716  MAXI_SH2(RTYPE, in2, in3, max_val); \
1717 }
1718 #define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1719 
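/* Example (editorial sketch; hypothetical names) : clamping signed
   halfword results at zero, e.g. before packing them to unsigned
   bytes

       v8i16 res0, res1;
       ...
       MAXI_SH2_SH(res0, res1, 0);

   Every negative element of res0 and res1 is replaced by 0 in place.
*/
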
1720 /* Description : Saturate the halfword element values to the max
1721  unsigned value of (sat_val+1) bits
1722  The element data width remains unchanged
1723  Arguments : Inputs - in0, in1, in2, in3, sat_val
1724  Outputs - in0, in1, in2, in3 (in place)
1725  Return Type - unsigned halfword
1726  Details : Each unsigned halfword element from 'in0' is saturated to the
1727  maximum value representable in (sat_val+1) bits
1728  Results are placed back in the original vectors
1729 */
1730 #define SAT_UH2(RTYPE, in0, in1, sat_val) \
1731 { \
1732  in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
1733  in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
1734 }
1735 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1736 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1737 
1738 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
1739 { \
1740  SAT_UH2(RTYPE, in0, in1, sat_val); \
1741  SAT_UH2(RTYPE, in2, in3, sat_val); \
1742 }
1743 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1744 
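/* Example (editorial sketch; hypothetical names) : saturating
   unsigned halfword results to the 8-bit pixel range

       v8u16 res0, res1;
       ...
       SAT_UH2_UH(res0, res1, 7);

   With sat_val = 7 every element is clipped to at most 255, the
   largest value representable in sat_val + 1 = 8 bits.
*/
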
1745 /* Description : Saturate the halfword element values to the max
1746  signed value of (sat_val+1) bits
1747  The element data width remains unchanged
1748  Arguments : Inputs - in0, in1, in2, in3, sat_val
1749  Outputs - in0, in1, in2, in3 (in place)
1750  Return Type - signed halfword
1751  Details : Each signed halfword element from 'in0' is saturated to the
1752  maximum value representable in (sat_val+1) bits
1753  Results are placed back in the original vectors
1754 */
1755 #define SAT_SH2(RTYPE, in0, in1, sat_val) \
1756 { \
1757  in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
1758  in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
1759 }
1760 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1761 
1762 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
1763 { \
1764  SAT_SH2(RTYPE, in0, in1, sat_val); \
1765  in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
1766 }
1767 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1768 
1769 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1770 { \
1771  SAT_SH2(RTYPE, in0, in1, sat_val); \
1772  SAT_SH2(RTYPE, in2, in3, sat_val); \
1773 }
1774 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1775 
1776 /* Description : Saturate the word element values to the max
1777  signed value of (sat_val+1) bits
1778  The element data width remains unchanged
1779  Arguments : Inputs - in0, in1, in2, in3, sat_val
1780  Outputs - in0, in1, in2, in3 (in place)
1781  Return Type - signed word
1782  Details : Each signed word element from 'in0' is saturated to the
1783  maximum value representable in (sat_val+1) bits
1784  Results are placed back in the original vectors
1785 */
1786 #define SAT_SW2(RTYPE, in0, in1, sat_val) \
1787 { \
1788  in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
1789  in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
1790 }
1791 #define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1792 
1793 #define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
1794 { \
1795  SAT_SW2(RTYPE, in0, in1, sat_val); \
1796  SAT_SW2(RTYPE, in2, in3, sat_val); \
1797 }
1798 #define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1799 
1800 /* Description : Indexed halfword element values are replicated to all
1801  elements in output vector
1802  Arguments : Inputs - in, idx0, idx1
1803  Outputs - out0, out1
1804  Return Type - as per RTYPE
1805  Details : 'idx0' element value from 'in' vector is replicated to all
1806  elements in 'out0' vector
1807  Valid index range for halfword operation is 0-7
1808 */
1809 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1810 { \
1811  out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
1812  out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
1813 }
1814 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1815 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1816 
1817 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
1818  out0, out1, out2) \
1819 { \
1820  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1821  out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \
1822 }
1823 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1824 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1825 
1826 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
1827  out0, out1, out2, out3) \
1828 { \
1829  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1830  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
1831 }
1832 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1833 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1834 
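/* Example (editorial sketch; hypothetical names) : broadcasting
   filter taps held in one halfword vector into one vector per tap

       v8i16 filt = LD_SH(filter);
       v8i16 filt0, filt1, filt2, filt3;
       SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

   filt0 carries filter[0] replicated in all 8 elements, filt1
   carries filter[1], and so on.
*/
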
1835 /* Description : Indexed word element values are replicated to all
1836  elements in output vector
1837  Arguments : Inputs - in, stidx
1838  Outputs - out0, out1
1839  Return Type - as per RTYPE
1840  Details : 'stidx' element value from 'in' vector is replicated to all
1841  elements in 'out0' vector
1842  'stidx + 1' element value from 'in' vector is replicated to all
1843  elements in 'out1' vector
1844  Valid index range for word operation is 0-3
1845 */
1846 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
1847 { \
1848  out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
1849  out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
1850 }
1851 #define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
1852 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1853 
1854 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
1855 { \
1856  SPLATI_W2(RTYPE, in, 0, out0, out1); \
1857  SPLATI_W2(RTYPE, in, 2, out2, out3); \
1858 }
1859 #define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
1860 #define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1861 
1862 /* Description : Pack even byte elements of vector pairs
1863  Arguments : Inputs - in0, in1, in2, in3
1864  Outputs - out0, out1
1865  Return Type - as per RTYPE
1866  Details : Even byte elements of in0 are copied to the left half of
1867  out0 & even byte elements of in1 are copied to the right
1868  half of out0.
1869  Even byte elements of in2 are copied to the left half of
1870  out1 & even byte elements of in3 are copied to the right
1871  half of out1.
1872 */
1873 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1874 { \
1875  out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
1876  out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
1877 }
1878 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1879 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1880 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1881 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1882 
1883 #define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1884 { \
1885  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1886  out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
1887 }
1888 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1889 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1890 
1891 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1892  out0, out1, out2, out3) \
1893 { \
1894  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1895  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1896 }
1897 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1898 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1899 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1900 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1901 
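/* Example (editorial sketch; hypothetical names) : narrowing clipped
   halfword results back to bytes

       v8i16 res0, res1, res2, res3;   (elements already in 0..255)
       v16u8 out0, out1;
       ...
       PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);

   The even (low) bytes of each halfword are gathered, so out0 holds
   the 8 bytes from res0 followed by the 8 bytes from res1.
*/
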
1902 /* Description : Pack even halfword elements of vector pairs
1903  Arguments : Inputs - in0, in1, in2, in3
1904  Outputs - out0, out1
1905  Return Type - as per RTYPE
1906  Details : Even halfword elements of in0 are copied to the left half of
1907  out0 & even halfword elements of in1 are copied to the right
1908  half of out0.
1909  Even halfword elements of in2 are copied to the left half of
1910  out1 & even halfword elements of in3 are copied to the right
1911  half of out1.
1912 */
1913 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1914 { \
1915  out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
1916  out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
1917 }
1918 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1919 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1920 
1921 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1922  out0, out1, out2, out3) \
1923 { \
1924  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1925  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1926 }
1927 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1928 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1929 
1930 /* Description : Pack even double word elements of vector pairs
1931  Arguments : Inputs - in0, in1, in2, in3
1932  Outputs - out0, out1
1933  Return Type - as per RTYPE
1934  Details : Even double word elements of in0 are copied to the left half of
1935  out0 & even double word elements of in1 are copied to the right
1936  half of out0.
1937  Even double word elements of in2 are copied to the left half of
1938  out1 & even double word elements of in3 are copied to the right
1939  half of out1.
1940 */
1941 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1942 { \
1943  out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
1944  out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
1945 }
1946 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1947 #define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
1948 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1949 
1950 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1951  out0, out1, out2, out3) \
1952 { \
1953  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1954  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1955 }
1956 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1957 
1958 /* Description : Pack odd double word elements of vector pairs
1959  Arguments : Inputs - in0, in1, in2, in3
1960  Outputs - out0, out1
1961  Return Type - as per RTYPE
1962  Details : Odd double word elements of in0 and in1 are copied to the
1963  left and right halves of out0 respectively.
1964  Odd double word elements of in2 and in3 are copied to the
1965  left and right halves of out1 respectively.
1966 */
1967 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1968 { \
1969  out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
1970  out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \
1971 }
1972 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1973 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1974 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1975 
1976 /* Description : Each byte element is logically xor'ed with immediate 128
1977  Arguments : Inputs - in0, in1
1978  Outputs - in0, in1 (in-place)
1979  Return Type - as per RTYPE
1980  Details : Each unsigned byte element from input vector 'in0' is
1981  logically xor'ed with 128 and result is in-place stored in
1982  'in0' vector
1983  Each unsigned byte element from input vector 'in1' is
1984  logically xor'ed with 128 and result is in-place stored in
1985  'in1' vector
1986  Similar for other pairs
1987 */
1988 #define XORI_B2_128(RTYPE, in0, in1) \
1989 { \
1990  in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \
1991  in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \
1992 }
1993 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
1994 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1995 #define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1996 
1997 #define XORI_B3_128(RTYPE, in0, in1, in2) \
1998 { \
1999  XORI_B2_128(RTYPE, in0, in1); \
2000  in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \
2001 }
2002 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
2003 
2004 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
2005 { \
2006  XORI_B2_128(RTYPE, in0, in1); \
2007  XORI_B2_128(RTYPE, in2, in3); \
2008 }
2009 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
2010 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
2011 #define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
2012 
2013 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
2014 { \
2015  XORI_B3_128(RTYPE, in0, in1, in2); \
2016  XORI_B2_128(RTYPE, in3, in4); \
2017 }
2018 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
2019 
2020 #define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
2021 { \
2022  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
2023  XORI_B2_128(RTYPE, in4, in5); \
2024 }
2025 #define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
2026 
2027 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
2028 { \
2029  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
2030  XORI_B3_128(RTYPE, in4, in5, in6); \
2031 }
2032 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
2033 
2034 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
2035 { \
2036  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
2037  XORI_B4_128(RTYPE, in4, in5, in6, in7); \
2038 }
2039 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
2040 
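/* Example (editorial sketch; hypothetical names; LD_SB4 is assumed to
   be the 4-vector load macro defined earlier in this file) : moving
   unsigned pixels into the signed domain so signed multiplies can be
   used

       v16i8 src0, src1, src2, src3;
       LD_SB4(psrc, stride, src0, src1, src2, src3);
       XORI_B4_128_SB(src0, src1, src2, src3);

   The xor with 128 flips the sign bit and maps 0..255 onto -128..127;
   a second XORI_B*_128 restores the original values.
*/
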
2041 /* Description : Addition of signed halfword elements and signed saturation
2042  Arguments : Inputs - in0, in1, in2, in3
2043  Outputs - out0, out1
2044  Return Type - as per RTYPE
2045  Details : Signed halfword elements from 'in0' are added to signed
2046  halfword elements of 'in1'. The result is then signed saturated
2047  between -32768 and +32767 (as per halfword data type)
2048  Similar for other pairs
2049 */
2050 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
2051 { \
2052  out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \
2053  out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \
2054 }
2055 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
2056 
2057 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2058  out0, out1, out2, out3) \
2059 { \
2060  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
2061  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
2062 }
2063 #define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
2064 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
2065 
2066 /* Description : Shift left all elements of vector (generic for all data types)
2067  Arguments : Inputs - in0, in1, in2, in3, shift
2068  Outputs - in0, in1, in2, in3 (in place)
2069  Return Type - as per input vector RTYPE
2070  Details : Each element of vector 'in0' is left shifted by 'shift' and
2071  result is in place written to 'in0'
2072  Similar for other pairs
2073 */
2074 #define SLLI_4V(in0, in1, in2, in3, shift) \
2075 { \
2076  in0 = in0 << shift; \
2077  in1 = in1 << shift; \
2078  in2 = in2 << shift; \
2079  in3 = in3 << shift; \
2080 }
2081 
2082 /* Description : Arithmetic shift right all elements of vector
2083  (generic for all data types)
2084  Arguments : Inputs - in0, in1, in2, in3, shift
2085  Outputs - in0, in1, in2, in3 (in place)
2086  Return Type - as per input vector RTYPE
2087  Details : Each element of vector 'in0' is right shifted by 'shift' and
2088  result is in place written to 'in0'
2089  Here, 'shift' is a GP variable passed in
2090  Similar for other pairs
2091 */
2092 #define SRA_4V(in0, in1, in2, in3, shift) \
2093 { \
2094  in0 = in0 >> shift; \
2095  in1 = in1 >> shift; \
2096  in2 = in2 >> shift; \
2097  in3 = in3 >> shift; \
2098 }
2099 
2100 /* Description : Shift right logical all halfword elements of vector
2101  Arguments : Inputs - in0, in1, in2, in3, shift
2102  Outputs - in0, in1, in2, in3 (in place)
2103  Return Type - unsigned halfword
2104  Details : Each element of vector 'in0' is logically shifted right by the
2105  number of bits held in the corresponding element of vector
2106  'shift' and the result is written back to 'in0' in place
2107  Here, 'shift' is a vector passed in
2108  Similar for other pairs
2109 */
2110 #define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \
2111 { \
2112  in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \
2113  in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \
2114  in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \
2115  in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \
2116 }
2117 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
2118 
2119 /* Description : Shift right arithmetic rounded halfwords
2120  Arguments : Inputs - in0, in1, shift
2121  Outputs - in0, in1 (in place)
2122  Return Type - as per RTYPE
2123  Details : Each element of vector 'in0' is arithmetically shifted right by
2124  the number of bits held in the corresponding element of 'shift'.
2125  The last discarded bit is added to shifted value for rounding
2126  and the result is in place written to 'in0'
2127  Here, 'shift' is a vector passed in
2128  Similar for other pairs
2129 */
2130 #define SRAR_H2(RTYPE, in0, in1, shift) \
2131 { \
2132  in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift); \
2133  in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift); \
2134 }
2135 #define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
2136 #define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2137 
2138 #define SRAR_H3(RTYPE, in0, in1, in2, shift) \
2139 { \
2140  SRAR_H2(RTYPE, in0, in1, shift); \
2141  in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift); \
2142 }
2143 #define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2144 
2145 #define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
2146 { \
2147  SRAR_H2(RTYPE, in0, in1, shift); \
2148  SRAR_H2(RTYPE, in2, in3, shift); \
2149 }
2150 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
2151 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2152 
2153 /* Description : Shift right arithmetic rounded words
2154  Arguments : Inputs - in0, in1, shift
2155  Outputs - in0, in1 (in place)
2156  Return Type - as per RTYPE
2157  Details : Each element of vector 'in0' is arithmetically shifted right by
2158  the number of bits held in the corresponding element of 'shift'.
2159  The last discarded bit is added to shifted value for rounding
2160  and the result is in place written to 'in0'
2161  Here, 'shift' is a vector passed in
2162  Similar for other pairs
2163 */
2164 #define SRAR_W2(RTYPE, in0, in1, shift) \
2165 { \
2166  in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift); \
2167  in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift); \
2168 }
2169 #define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2170 
2171 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
2172 { \
2173  SRAR_W2(RTYPE, in0, in1, shift); \
2174  SRAR_W2(RTYPE, in2, in3, shift); \
2175 }
2176 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2177 
2178 /* Description : Shift right arithmetic rounded (immediate)
2179  Arguments : Inputs - in0, in1, in2, in3, shift
2180  Outputs - in0, in1, in2, in3 (in place)
2181  Return Type - as per RTYPE
2182  Details : Each element of vector 'in0' is arithmetically shifted right
2183  by the immediate value in 'shift'.
2184  The last discarded bit is added to shifted value for rounding
2185  and the result is in place written to 'in0'
2186  Similar for other pairs
2187 */
2188 #define SRARI_H2(RTYPE, in0, in1, shift) \
2189 { \
2190  in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
2191  in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
2192 }
2193 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
2194 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2195 
2196 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
2197 { \
2198  SRARI_H2(RTYPE, in0, in1, shift); \
2199  SRARI_H2(RTYPE, in2, in3, shift); \
2200 }
2201 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
2202 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2203 
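/* Example (editorial sketch; hypothetical names) : normalizing filter
   accumulators with a rounding shift

       v8i16 res0, res1;
       ...
       SRARI_H2_SH(res0, res1, 6);

   Each element becomes (element + 32) >> 6, i.e. a round-to-nearest
   division by 64 as used with 6-bit filter coefficients.
*/
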
2204 /* Description : Shift right arithmetic rounded (immediate)
2205  Arguments : Inputs - in0, in1, shift
2206  Outputs - in0, in1 (in place)
2207  Return Type - as per RTYPE
2208  Details : Each element of vector 'in0' is arithmetically shifted right
2209  by the immediate value in 'shift'.
2210  The last discarded bit is added to shifted value for rounding
2211  and the result is in place written to 'in0'
2212  Similar for other pairs
2213 */
2214 #define SRARI_W2(RTYPE, in0, in1, shift) \
2215 { \
2216  in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
2217  in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
2218 }
2219 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2220 
2221 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
2222 { \
2223  SRARI_W2(RTYPE, in0, in1, shift); \
2224  SRARI_W2(RTYPE, in2, in3, shift); \
2225 }
2226 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
2227 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2228 
2229 /* Description : Multiplication of pairs of vectors
2230  Arguments : Inputs - in0, in1, in2, in3
2231  Outputs - out0, out1
2232  Details : Each element of 'in0' is multiplied with the corresponding
2233  element of 'in1' and the result is written to 'out0'
2234  Similar for other pairs
2235 */
2236 #define MUL2(in0, in1, in2, in3, out0, out1) \
2237 { \
2238  out0 = in0 * in1; \
2239  out1 = in2 * in3; \
2240 }
2241 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2242 { \
2243  MUL2(in0, in1, in2, in3, out0, out1); \
2244  MUL2(in4, in5, in6, in7, out2, out3); \
2245 }
2246 
2247 /* Description : Addition of 2 pairs of vectors
2248  Arguments : Inputs - in0, in1, in2, in3
2249  Outputs - out0, out1
2250  Details : Corresponding elements of two pairs of vectors are added and
2251  two result vectors are produced
2252 */
2253 #define ADD2(in0, in1, in2, in3, out0, out1) \
2254 { \
2255  out0 = in0 + in1; \
2256  out1 = in2 + in3; \
2257 }
2258 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2259 { \
2260  ADD2(in0, in1, in2, in3, out0, out1); \
2261  ADD2(in4, in5, in6, in7, out2, out3); \
2262 }
2263 
2264 /* Description : Subtraction of 2 pairs of vectors
2265  Arguments : Inputs - in0, in1, in2, in3
2266  Outputs - out0, out1
2267  Details : Corresponding elements of two pairs of vectors are subtracted
2268  and two result vectors are produced
2269 */
2270 #define SUB2(in0, in1, in2, in3, out0, out1) \
2271 { \
2272  out0 = in0 - in1; \
2273  out1 = in2 - in3; \
2274 }
2275 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2276 { \
2277  out0 = in0 - in1; \
2278  out1 = in2 - in3; \
2279  out2 = in4 - in5; \
2280  out3 = in6 - in7; \
2281 }
2282 
2283 /* Description : Sign extend halfword elements from right half of the vector
2284  Arguments : Inputs - in (input halfword vector)
2285  Outputs - out (sign extended word vector)
2286  Return Type - signed word
2287  Details : Sign bit of halfword elements from input vector 'in' is
2288  extracted and interleaved with the same vector 'in' to generate
2289  4 word elements keeping sign intact
2290 */
2291 #define UNPCK_R_SH_SW(in, out) \
2292 { \
2293  v8i16 sign_m; \
2294  \
2295  sign_m = __msa_clti_s_h((v8i16) in, 0); \
2296  out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
2297 }
2298 
2299 /* Description : Sign extend byte elements from input vector and return
2300  halfword results in pair of vectors
2301  Arguments : Inputs - in (1 input byte vector)
2302  Outputs - out0, out1 (sign extended 2 halfword vectors)
2303  Return Type - signed halfword
2304  Details : Sign bit of byte elements from input vector 'in' is
2305  extracted and interleaved right with the same vector 'in' to
2306  generate 8 signed halfword elements in 'out0'
2307  Then interleaved left with the same vector 'in' to
2308  generate 8 signed halfword elements in 'out1'
2309 */
2310 #define UNPCK_SB_SH(in, out0, out1) \
2311 { \
2312  v16i8 tmp_m; \
2313  \
2314  tmp_m = __msa_clti_s_b((v16i8) in, 0); \
2315  ILVRL_B2_SH(tmp_m, in, out0, out1); \
2316 }
2317 
2318 /* Description : Zero extend unsigned byte elements to halfword elements
2319  Arguments : Inputs - in (1 input unsigned byte vector)
2320  Outputs - out0, out1 (2 zero-extended halfword vectors)
2321  Return Type - signed halfword
2322  Details : Zero extended right half of vector is returned in 'out0'
2323  Zero extended left half of vector is returned in 'out1'
2324 */
2325 #define UNPCK_UB_SH(in, out0, out1) \
2326 { \
2327  v16i8 zero_m = { 0 }; \
2328  \
2329  ILVRL_B2_SH(zero_m, in, out0, out1); \
2330 }
2331 
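/* Example (editorial sketch; hypothetical names) : widening 16 pixels
   to halfwords before arithmetic

       v16u8 src = LD_UB(psrc);
       v8i16 src_r, src_l;
       UNPCK_UB_SH(src, src_r, src_l);

   src_r holds pixels 0..7 and src_l holds pixels 8..15, each zero
   extended to 16 bits.
*/
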
2332 /* Description : Sign extend halfword elements from input vector and return
2333  result in pair of vectors
2334  Arguments : Inputs - in (1 input halfword vector)
2335  Outputs - out0, out1 (sign extended 2 word vectors)
2336  Return Type - signed word
2337  Details : Sign bit of halfword elements from input vector 'in' is
2338  extracted and interleaved right with the same vector 'in' to
2339  generate 4 signed word elements in 'out0'
2340  Then interleaved left with the same vector 'in' to
2341  generate 4 signed word elements in 'out1'
2342 */
2343 #define UNPCK_SH_SW(in, out0, out1) \
2344 { \
2345  v8i16 tmp_m; \
2346  \
2347  tmp_m = __msa_clti_s_h((v8i16) in, 0); \
2348  ILVRL_H2_SW(tmp_m, in, out0, out1); \
2349 }
2350 
2351 /* Description : Swap two variables
2352  Arguments : Inputs - in0, in1
2353  Outputs - in0, in1 (in-place)
2354  Details : Swapping of two input variables using xor
2355 */
2356 #define SWAP(in0, in1) \
2357 { \
2358  in0 = in0 ^ in1; \
2359  in1 = in0 ^ in1; \
2360  in0 = in0 ^ in1; \
2361 }
2362 
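/* Editorial note : the xor trick above only swaps correctly when
   'in0' and 'in1' are distinct objects; if both macro arguments name
   the same variable, the three xors operate on a single object and
   leave it zeroed.
*/
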
2363 /* Description : Butterfly of 4 input vectors
2364  Arguments : Inputs - in0, in1, in2, in3
2365  Outputs - out0, out1, out2, out3
2366  Details : Butterfly operation
2367 */
2368 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
2369 { \
2370  out0 = in0 + in3; \
2371  out1 = in1 + in2; \
2372  \
2373  out2 = in1 - in2; \
2374  out3 = in0 - in3; \
2375 }
2376 
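/* Example (editorial) : for scalar element values the 4-point
   butterfly computes
       out0 = in0 + in3        out3 = in0 - in3
       out1 = in1 + in2        out2 = in1 - in2
   so inputs (1, 2, 3, 4) give outputs (5, 5, -1, -3), the
   sum/difference pairs used by DCT-style transforms.
*/
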
2377 /* Description : Butterfly of 8 input vectors
2378  Arguments : Inputs - in0 ... in7
2379  Outputs - out0 .. out7
2380  Details : Butterfly operation
2381 */
2382 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
2383  out0, out1, out2, out3, out4, out5, out6, out7) \
2384 { \
2385  out0 = in0 + in7; \
2386  out1 = in1 + in6; \
2387  out2 = in2 + in5; \
2388  out3 = in3 + in4; \
2389  \
2390  out4 = in3 - in4; \
2391  out5 = in2 - in5; \
2392  out6 = in1 - in6; \
2393  out7 = in0 - in7; \
2394 }
2395 
2396 /* Description : Butterfly of 16 input vectors
2397  Arguments : Inputs - in0 ... in15
2398  Outputs - out0 .. out15
2399  Details : Butterfly operation
2400 */
2401 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
2402  in8, in9, in10, in11, in12, in13, in14, in15, \
2403  out0, out1, out2, out3, out4, out5, out6, out7, \
2404  out8, out9, out10, out11, out12, out13, out14, out15) \
2405 { \
2406  out0 = in0 + in15; \
2407  out1 = in1 + in14; \
2408  out2 = in2 + in13; \
2409  out3 = in3 + in12; \
2410  out4 = in4 + in11; \
2411  out5 = in5 + in10; \
2412  out6 = in6 + in9; \
2413  out7 = in7 + in8; \
2414  \
2415  out8 = in7 - in8; \
2416  out9 = in6 - in9; \
2417  out10 = in5 - in10; \
2418  out11 = in4 - in11; \
2419  out12 = in3 - in12; \
2420  out13 = in2 - in13; \
2421  out14 = in1 - in14; \
2422  out15 = in0 - in15; \
2423 }
2424 
2425 /* Description : Transposes input 4x4 byte block
2426  Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block)
2427  Outputs - out0, out1, out2, out3 (output 4x4 byte block)
2428  Return Type - unsigned byte
2429  Details :
2430 */
2431 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
2432 { \
2433  v16i8 zero_m = { 0 }; \
2434  v16i8 s0_m, s1_m, s2_m, s3_m; \
2435  \
2436  ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
2437  ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
2438  \
2439  out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
2440  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
2441  out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
2442  out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
2443 }
2444 
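/* Editorial note (assumption based on typical use of this macro) :
   each of in0..in3 is expected to carry one 4-byte row in its least
   significant word; after the macro, each of out0..out3 carries one
   4-byte column in its least significant word.

       v16u8 row0, row1, row2, row3, col0, col1, col2, col3;
       ...
       TRANSPOSE4x4_UB_UB(row0, row1, row2, row3,
                          col0, col1, col2, col3);
*/
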
2445 /* Description : Transposes input 8x4 byte block into 4x8
2446  Arguments : Inputs - in0 ... in7 (input 8x4 byte block)
2447  Outputs - out0, out1, out2, out3 (output 4x8 byte block)
2448  Return Type - unsigned byte
2449  Details :
2450 */
2451 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2452  out0, out1, out2, out3) \
2453 { \
2454  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2455  \
2456  ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
2457  tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2458  ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
2459  \
2460  tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2461  ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
2462  \
2463  ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
2464  out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
2465  out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2466 }
2467 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
2468 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2469 
2470 /* Description : Transposes input 8x8 byte block
2471  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2472  (input 8x8 byte block)
2473  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2474  (output 8x8 byte block)
2475  Return Type - unsigned byte
2476  Details :
2477 */
2478 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2479  out0, out1, out2, out3, out4, out5, out6, out7) \
2480 { \
2481  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2482  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2483  \
2484  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
2485  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2486  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
2487  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
2488  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
2489  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
2490  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
2491  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
2492 }
2493 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
2494 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2495 
2496 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
2497  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2498  in8, in9, in10, in11, in12, in13, in14, in15
2499  Outputs - out0, out1, out2, out3
2500  Return Type - unsigned byte
2501  Details :
2502 */
2503 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2504  in8, in9, in10, in11, in12, in13, in14, in15, \
2505  out0, out1, out2, out3) \
2506 { \
2507  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2508  \
2509  ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
2510  out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2511  \
2512  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
2513  out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2514  \
2515  ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
2516  \
2517  tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2518  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
2519  \
2520  tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2521  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
2522  out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2523  out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2524  \
2525  tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
2526  tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
2527  out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2528  out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2529 }
2530 
2531 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
2532  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2533  in8, in9, in10, in11, in12, in13, in14, in15
2534  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2535  Return Type - unsigned byte
2536  Details :
2537 */
2538 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2539  in8, in9, in10, in11, in12, in13, in14, in15, \
2540  out0, out1, out2, out3, out4, out5, out6, out7) \
2541 { \
2542  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2543  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2544  \
2545  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
2546  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
2547  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
2548  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
2549  \
2550  tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
2551  tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
2552  tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
2553  tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
2554  out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
2555  tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
2556  out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
2557  tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
2558  \
2559  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
2560  out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2561  out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2562  \
2563  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2564  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
2565  out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2566  out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2567  \
2568  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
2569  out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2570  out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2571  \
2572  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
2574  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
2576  out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2577  out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2578 }
2579 
2580 /* Description : Transposes 4x4 block with half word elements in vectors
2581  Arguments : Inputs - in0, in1, in2, in3
2582  Outputs - out0, out1, out2, out3
2583  Return Type - signed halfword
2584  Details :
2585 */
2586 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
2587 { \
2588  v8i16 s0_m, s1_m; \
2589  \
2590  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
2591  ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
2592  out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
2593  out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2594 }
2595 
2596 /* Description : Transposes 8x8 block with half word elements in vectors
2597  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2598  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2599  Return Type - signed halfword
2600  Details :
2601 */
2602 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2603  out0, out1, out2, out3, out4, out5, out6, out7) \
2604 { \
2605  v8i16 s0_m, s1_m; \
2606  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2607  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2608  \
2609  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2610  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
2611  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2612  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
2613  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2614  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
2615  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2616  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
2617  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
2618  tmp3_m, tmp7_m, out0, out2, out4, out6); \
2619  out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
2620  out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
2621  out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
2622  out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
2623 }
2624 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
2625 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2626 
2627 /* Description : Transposes 4x4 block with word elements in vectors
2628  Arguments : Inputs - in0, in1, in2, in3
2629  Outputs - out0, out1, out2, out3
2630  Return Type - signed word
2631  Details :
2632 */
2633 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
2634 { \
2635  v4i32 s0_m, s1_m, s2_m, s3_m; \
2636  \
2637  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
2638  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
2639  \
2640  out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
2641  out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
2642  out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
2643  out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
2644 }
2645 
2646 /* Description : Average byte elements from pair of vectors and store 8x4 byte
2647  block in destination memory
2648  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2649  Outputs -
2650  Return Type -
2651  Details : Each byte element from input vector pair 'in0' and 'in1' is
2652  averaged (a + b)/2 and stored in 'tmp0_m'
2653  Each byte element from input vector pair 'in2' and 'in3' is
2654  averaged (a + b)/2 and stored in 'tmp1_m'
2655  Each byte element from input vector pair 'in4' and 'in5' is
2656  averaged (a + b)/2 and stored in 'tmp2_m'
2657  Each byte element from input vector pair 'in6' and 'in7' is
2658  averaged (a + b)/2 and stored in 'tmp3_m'
2659  The half vector results from all 4 vectors are stored in
2660  destination memory as 8x4 byte block
2661 */
2662 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2663 { \
2664  uint64_t out0_m, out1_m, out2_m, out3_m; \
2665  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2666  \
2667  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2668  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2669  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2670  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2671  \
2672  out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
2673  out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
2674  out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
2675  out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
2676  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2677 }
2678 
2679 /* Description : Average byte elements from pair of vectors and store 16x4 byte
2680  block in destination memory
2681  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2682  Outputs -
2683  Return Type -
2684  Details : Each byte element from input vector pair 'in0' and 'in1' is
2685  averaged (a + b)/2 and stored in 'tmp0_m'
2686  Each byte element from input vector pair 'in2' and 'in3' is
2687  averaged (a + b)/2 and stored in 'tmp1_m'
2688  Each byte element from input vector pair 'in4' and 'in5' is
2689  averaged (a + b)/2 and stored in 'tmp2_m'
2690  Each byte element from input vector pair 'in6' and 'in7' is
2691  averaged (a + b)/2 and stored in 'tmp3_m'
2692  The results from all 4 vectors are stored in destination
2693  memory as 16x4 byte block
2694 */
2695 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2696 { \
2697  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2698  \
2699  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2700  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2701  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2702  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2703  \
2704  ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
2705 }
2706 
2707 /* Description : Average rounded byte elements from pair of vectors and store
2708  8x4 byte block in destination memory
2709  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2710  Outputs -
2711  Return Type -
2712  Details : Each byte element from input vector pair 'in0' and 'in1' is
2713  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2714  Each byte element from input vector pair 'in2' and 'in3' is
2715  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2716  Each byte element from input vector pair 'in4' and 'in5' is
2717  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2718  Each byte element from input vector pair 'in6' and 'in7' is
2719  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2720  The half vector results from all 4 vectors are stored in
2721  destination memory as 8x4 byte block
2722 */
2723 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2724 { \
2725  uint64_t out0_m, out1_m, out2_m, out3_m; \
2726  v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \
2727  \
2728  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2729  tp0_m, tp1_m, tp2_m, tp3_m); \
2730  \
2731  out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \
2732  out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \
2733  out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \
2734  out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \
2735  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2736 }
2737 
2738 /* Description : Average rounded byte elements from pair of vectors and store
2739  16x4 byte block in destination memory
2740  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2741  Outputs -
2742  Return Type -
2743  Details : Each byte element from input vector pair 'in0' and 'in1' is
2744  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2745  Each byte element from input vector pair 'in2' and 'in3' is
2746  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2747  Each byte element from input vector pair 'in4' and 'in5' is
2748  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2749  Each byte element from input vector pair 'in6' and 'in7' is
2750  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2751  The vector results from all 4 vectors are stored in
2752  destination memory as 16x4 byte block
2753 */
2754 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2755 { \
2756  v16u8 t0_m, t1_m, t2_m, t3_m; \
2757  \
2758  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2759  t0_m, t1_m, t2_m, t3_m); \
2760  ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \
2761 }
2762 
2763 /* Description : Average rounded byte elements from pair of vectors,
2764  average rounded with destination and store 8x4 byte block
2765  in destination memory
2766  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2767  Outputs -
2768  Return Type -
2769  Details : Each byte element from input vector pair 'in0' and 'in1' is
2770  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2771  Each byte element from input vector pair 'in2' and 'in3' is
2772  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2773  Each byte element from input vector pair 'in4' and 'in5' is
2774  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2775  Each byte element from input vector pair 'in6' and 'in7' is
2776  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2777  The results are then average rounded with the destination vectors
2778  and their low halves are stored in destination memory as 8x4 byte block
2779 */
2780 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2781  pdst, stride) \
2782 { \
2783  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2784  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2785  \
2786  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2787  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2788  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2789  AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2790  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2791 }
2792 
2793 /* Description : Average rounded byte elements from pair of vectors,
2794  average rounded with destination and store 16x4 byte block
2795  in destination memory
2796  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2797  Outputs -
2798  Return Type -
2799  Details : Each byte element from input vector pair 'in0' and 'in1' is
2800  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2801  Each byte element from input vector pair 'in2' and 'in3' is
2802  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2803  Each byte element from input vector pair 'in4' and 'in5' is
2804  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2805  Each byte element from input vector pair 'in6' and 'in7' is
2806  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2807  The results are then average rounded with the destination vectors
2808  and stored in destination memory as 16x4 byte block
2809 */
2810 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2811  pdst, stride) \
2812 { \
2813  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2814  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2815  \
2816  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2817  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2818  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2819  AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2820  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2821 }
2822 
2823 /* Description : Add block 4x4
2824  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2825  Outputs -
2826  Return Type - unsigned byte
2827  Details : Least significant 4 bytes from each input vector are added to
2828  the destination bytes, clipped to the range 0..255 and then stored.
2829 */
2830 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2831 { \
2832  uint32_t src0_m, src1_m, src2_m, src3_m; \
2833  uint32_t out0_m, out1_m, out2_m, out3_m; \
2834  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
2835  v16i8 dst0_m = { 0 }; \
2836  v16i8 dst1_m = { 0 }; \
2837  v16i8 zero_m = { 0 }; \
2838  \
2839  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
2840  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
2841  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
2842  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
2843  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
2844  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
2845  CLIP_SH2_0_255(res0_m, res1_m); \
2846  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
2847  \
2848  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \
2849  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \
2850  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \
2851  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \
2852  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2853 }
2854 
2855 /* Description : Dot product and addition of 3 signed halfword input vectors
2856  Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
2857  Outputs - out0_m
2858  Return Type - signed halfword
2859  Details : Dot product of 'in0' with 'coeff0'
2860  Dot product of 'in1' with 'coeff1'
2861  Dot product of 'in2' with 'coeff2'
2862  Addition of all the 3 vector results
2863 
2864  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
2865 */
2866 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
2867 ( { \
2868  v8i16 tmp1_m; \
2869  v8i16 out0_m; \
2870  \
2871  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
2872  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
2873  tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2); \
2874  out0_m = __msa_adds_s_h(out0_m, tmp1_m); \
2875  \
2876  out0_m; \
2877 } )
2878 
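/* Example (editorial sketch; hypothetical names) : evaluating a
   6-tap filter whose taps are paired into three byte coefficient
   vectors

       v16i8 vec0, vec1, vec2;      (interleaved source sample pairs)
       v16i8 filt0, filt1, filt2;   (replicated tap pairs)
       v8i16 out;
       ...
       out = DPADD_SH3_SH(vec0, vec1, vec2, filt0, filt1, filt2);

   Each halfword of 'out' is the saturated sum of the three byte dot
   products, matching the formula above.
*/
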
2879 /* Description : Pack even elements of input vectors & xor with 128
2880  Arguments : Inputs - in0, in1
2881  Outputs - out_m
2882  Return Type - unsigned byte
2883  Details : Signed byte even elements from 'in0' and 'in1' are packed
2884  together in one vector and the resulting vector is xor'ed with
2885  128 to shift the range from signed to unsigned byte
2886 */
2887 #define PCKEV_XORI128_UB(in0, in1) \
2888 ( { \
2889  v16u8 out_m; \
2890  out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2891  out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \
2892  out_m; \
2893 } )
2894 
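/* Example (editorial sketch; hypothetical names) : converting two
   signed-domain halfword result vectors into one unsigned byte
   vector

       v8i16 res0, res1;
       v16u8 out;
       ...
       out = PCKEV_XORI128_UB(res0, res1);

   The even (low) bytes of res0 and res1 are packed together and the
   xor with 128 undoes an earlier XORI_B*_128 conversion.
*/
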
2895 /* Description : Converts inputs to unsigned bytes, interleave, average & store
2896  as 8x4 unsigned byte block
2897  Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
2898  pdst, stride
2899 */
2900 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
2901  dst0, dst1, dst2, dst3, pdst, stride) \
2902 { \
2903  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2904  uint8_t *pdst_m = (uint8_t *) (pdst); \
2905  \
2906  tmp0_m = PCKEV_XORI128_UB(in0, in1); \
2907  tmp1_m = PCKEV_XORI128_UB(in2, in3); \
2908  ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
2909  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
2910  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
2911 }
2912 
2913 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
2914  of results and store 4 words in destination memory as per
2915  stride
2916  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2917 */
2918 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2919 { \
2920  uint32_t out0_m, out1_m, out2_m, out3_m; \
2921  v16i8 tmp0_m, tmp1_m; \
2922  \
2923  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
2924  \
2925  out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
2926  out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
2927  out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
2928  out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
2929  \
2930  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2931 }
2932 
2933 /* Description : Pack even byte elements and store byte vector in destination
2934  memory
2935  Arguments : Inputs - in0, in1, pdst
2936 */
2937 #define PCKEV_ST_SB(in0, in1, pdst) \
2938 { \
2939  v16i8 tmp_m; \
2940  tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2941  ST_SB(tmp_m, (pdst)); \
2942 }
2943 
2944 /* Description : Horizontal 2 tap filter kernel code
2945  Arguments : Inputs - in0, in1, mask, coeff, shift
2946 */
2947 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
2948 ( { \
2949  v16i8 tmp0_m; \
2950  v8u16 tmp1_m; \
2951  \
2952  tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \
2953  tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \
2954  tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \
2955  tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
2956  \
2957  tmp1_m; \
2958 } )
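
/* Example (editorial sketch; hypothetical names; the shift value 7 is
   illustrative) : applying the 2-tap kernel to one row of pixels

       v16u8 src0 = LD_UB(psrc);
       v8u16 res;
       ...
       res = HORIZ_2TAP_FILT_UH(src0, src0, mask, coeff_vec, 7);

   'mask' selects the pixel pairs for the shuffle, 'coeff_vec' holds
   the two taps replicated across the vector, and 'shift' both rounds
   the dot product and bounds the saturation width.
*/
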
2959 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */