hpeldsp_lasx.c
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hpeldsp_lasx.h"

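/*
 * put_pixels*_l2: average two 8-bit sources into dst with rounding.
 * vavgr.bu computes a rounded unsigned byte average, so each output
 * byte matches this scalar sketch (illustration only, not compiled):
 *
 *     for (int y = 0; y < h; y++) {
 *         for (int x = 0; x < width; x++)
 *             dst[x] = (src1[x] + src2[x] + 1) >> 1;
 *         dst  += dst_stride;
 *         src1 += src_stride1;
 *         src2 += src_stride2;
 *     }
 */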
static av_always_inline void
put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                     int dst_stride, int src_stride1, int src_stride2, int h)
{
    int stride1_2, stride1_3, stride1_4;
    int stride2_2, stride2_3, stride2_4;
    __asm__ volatile (
        "slli.d   %[stride1_2], %[srcStride1], 1          \n\t"
        "slli.d   %[stride2_2], %[srcStride2], 1          \n\t"
        "add.d    %[stride1_3], %[stride1_2], %[srcStride1] \n\t"
        "add.d    %[stride2_3], %[stride2_2], %[srcStride2] \n\t"
        "slli.d   %[stride1_4], %[stride1_2], 1           \n\t"
        "slli.d   %[stride2_4], %[stride2_2], 1           \n\t"
        "1:                                               \n\t"
        "vld      $vr0, %[src1], 0                        \n\t"
        "vldx     $vr1, %[src1], %[srcStride1]            \n\t"
        "vldx     $vr2, %[src1], %[stride1_2]             \n\t"
        "vldx     $vr3, %[src1], %[stride1_3]             \n\t"
        "add.d    %[src1], %[src1], %[stride1_4]          \n\t"

        "vld      $vr4, %[src2], 0                        \n\t"
        "vldx     $vr5, %[src2], %[srcStride2]            \n\t"
        "vldx     $vr6, %[src2], %[stride2_2]             \n\t"
        "vldx     $vr7, %[src2], %[stride2_3]             \n\t"
        "add.d    %[src2], %[src2], %[stride2_4]          \n\t"

        "addi.d   %[h], %[h], -4                          \n\t"

        "vavgr.bu $vr0, $vr4, $vr0                        \n\t"
        "vavgr.bu $vr1, $vr5, $vr1                        \n\t"
        "vavgr.bu $vr2, $vr6, $vr2                        \n\t"
        "vavgr.bu $vr3, $vr7, $vr3                        \n\t"
        "vstelm.d $vr0, %[dst], 0, 0                      \n\t"
        "add.d    %[dst], %[dst], %[dstStride]            \n\t"
        "vstelm.d $vr1, %[dst], 0, 0                      \n\t"
        "add.d    %[dst], %[dst], %[dstStride]            \n\t"
        "vstelm.d $vr2, %[dst], 0, 0                      \n\t"
        "add.d    %[dst], %[dst], %[dstStride]            \n\t"
        "vstelm.d $vr3, %[dst], 0, 0                      \n\t"
        "add.d    %[dst], %[dst], %[dstStride]            \n\t"
        "bnez     %[h], 1b                                \n\t"

        : [dst]"+&r"(dst), [src2]"+&r"(src2), [src1]"+&r"(src1),
          [h]"+&r"(h), [stride1_2]"=&r"(stride1_2),
          [stride1_3]"=&r"(stride1_3), [stride1_4]"=&r"(stride1_4),
          [stride2_2]"=&r"(stride2_2), [stride2_3]"=&r"(stride2_3),
          [stride2_4]"=&r"(stride2_4)
        : [dstStride]"r"(dst_stride), [srcStride1]"r"(src_stride1),
          [srcStride2]"r"(src_stride2)
        : "memory"
    );
}

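/*
 * 16-pixel-wide variant of the rounded two-source average; full-vector
 * stores (vst/vstx) replace the 8-byte element stores of the 8-wide path.
 */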
static av_always_inline void
put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                      int dst_stride, int src_stride1, int src_stride2, int h)
{
    int stride1_2, stride1_3, stride1_4;
    int stride2_2, stride2_3, stride2_4;
    int dststride2, dststride3, dststride4;
    __asm__ volatile (
        "slli.d   %[stride1_2], %[srcStride1], 1          \n\t"
        "slli.d   %[stride2_2], %[srcStride2], 1          \n\t"
        "slli.d   %[dststride2], %[dstStride], 1          \n\t"
        "add.d    %[stride1_3], %[stride1_2], %[srcStride1] \n\t"
        "add.d    %[stride2_3], %[stride2_2], %[srcStride2] \n\t"
        "add.d    %[dststride3], %[dststride2], %[dstStride] \n\t"
        "slli.d   %[stride1_4], %[stride1_2], 1           \n\t"
        "slli.d   %[stride2_4], %[stride2_2], 1           \n\t"
        "slli.d   %[dststride4], %[dststride2], 1         \n\t"
        "1:                                               \n\t"
        "vld      $vr0, %[src1], 0                        \n\t"
        "vldx     $vr1, %[src1], %[srcStride1]            \n\t"
        "vldx     $vr2, %[src1], %[stride1_2]             \n\t"
        "vldx     $vr3, %[src1], %[stride1_3]             \n\t"
        "add.d    %[src1], %[src1], %[stride1_4]          \n\t"

        "vld      $vr4, %[src2], 0                        \n\t"
        "vldx     $vr5, %[src2], %[srcStride2]            \n\t"
        "vldx     $vr6, %[src2], %[stride2_2]             \n\t"
        "vldx     $vr7, %[src2], %[stride2_3]             \n\t"
        "add.d    %[src2], %[src2], %[stride2_4]          \n\t"

        "addi.d   %[h], %[h], -4                          \n\t"

        "vavgr.bu $vr0, $vr4, $vr0                        \n\t"
        "vavgr.bu $vr1, $vr5, $vr1                        \n\t"
        "vavgr.bu $vr2, $vr6, $vr2                        \n\t"
        "vavgr.bu $vr3, $vr7, $vr3                        \n\t"
        "vst      $vr0, %[dst], 0                         \n\t"
        "vstx     $vr1, %[dst], %[dstStride]              \n\t"
        "vstx     $vr2, %[dst], %[dststride2]             \n\t"
        "vstx     $vr3, %[dst], %[dststride3]             \n\t"
        "add.d    %[dst], %[dst], %[dststride4]           \n\t"
        "bnez     %[h], 1b                                \n\t"

        : [dst]"+&r"(dst), [src2]"+&r"(src2), [src1]"+&r"(src1),
          [h]"+&r"(h), [stride1_2]"=&r"(stride1_2),
          [stride1_3]"=&r"(stride1_3), [stride1_4]"=&r"(stride1_4),
          [stride2_2]"=&r"(stride2_2), [stride2_3]"=&r"(stride2_3),
          [stride2_4]"=&r"(stride2_4), [dststride2]"=&r"(dststride2),
          [dststride3]"=&r"(dststride3), [dststride4]"=&r"(dststride4)
        : [dstStride]"r"(dst_stride), [srcStride1]"r"(src_stride1),
          [srcStride2]"r"(src_stride2)
        : "memory"
    );
}

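/*
 * Plain 8-byte-wide copy: rows travel through general-purpose
 * registers, eight rows per loop iteration, with a scalar tail loop
 * for the remaining h & 7 rows.
 */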
void ff_put_pixels8_8_lasx(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    uint64_t tmp[8];
    int h_8 = h >> 3;
    int res = h & 7;
    ptrdiff_t stride2, stride3, stride4;

    __asm__ volatile (
        "beqz     %[h_8], 2f                      \n\t"
        "slli.d   %[stride2], %[stride], 1        \n\t"
        "add.d    %[stride3], %[stride2], %[stride] \n\t"
        "slli.d   %[stride4], %[stride2], 1       \n\t"
        "1:                                       \n\t"
        "ld.d     %[tmp0], %[src], 0x0            \n\t"
        "ldx.d    %[tmp1], %[src], %[stride]      \n\t"
        "ldx.d    %[tmp2], %[src], %[stride2]     \n\t"
        "ldx.d    %[tmp3], %[src], %[stride3]     \n\t"
        "add.d    %[src], %[src], %[stride4]      \n\t"
        "ld.d     %[tmp4], %[src], 0x0            \n\t"
        "ldx.d    %[tmp5], %[src], %[stride]      \n\t"
        "ldx.d    %[tmp6], %[src], %[stride2]     \n\t"
        "ldx.d    %[tmp7], %[src], %[stride3]     \n\t"
        "add.d    %[src], %[src], %[stride4]      \n\t"

        "addi.d   %[h_8], %[h_8], -1              \n\t"

        "st.d     %[tmp0], %[dst], 0x0            \n\t"
        "stx.d    %[tmp1], %[dst], %[stride]      \n\t"
        "stx.d    %[tmp2], %[dst], %[stride2]     \n\t"
        "stx.d    %[tmp3], %[dst], %[stride3]     \n\t"
        "add.d    %[dst], %[dst], %[stride4]      \n\t"
        "st.d     %[tmp4], %[dst], 0x0            \n\t"
        "stx.d    %[tmp5], %[dst], %[stride]      \n\t"
        "stx.d    %[tmp6], %[dst], %[stride2]     \n\t"
        "stx.d    %[tmp7], %[dst], %[stride3]     \n\t"
        "add.d    %[dst], %[dst], %[stride4]      \n\t"
        "bnez     %[h_8], 1b                      \n\t"

        "2:                                       \n\t"
        "beqz     %[res], 4f                      \n\t"
        "3:                                       \n\t"
        "ld.d     %[tmp0], %[src], 0x0            \n\t"
        "add.d    %[src], %[src], %[stride]       \n\t"
        "addi.d   %[res], %[res], -1              \n\t"
        "st.d     %[tmp0], %[dst], 0x0            \n\t"
        "add.d    %[dst], %[dst], %[stride]       \n\t"
        "bnez     %[res], 3b                      \n\t"
        "4:                                       \n\t"
        : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
          [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
          [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
          [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
          [dst]"+&r"(block), [src]"+&r"(pixels),
          [h_8]"+&r"(h_8), [res]"+&r"(res),
          [stride2]"=&r"(stride2), [stride3]"=&r"(stride3),
          [stride4]"=&r"(stride4)
        : [stride]"r"(line_size)
        : "memory"
    );
}

/**
 * For width 16, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_pixels16_8_lsx(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    int h_4 = h >> 2;
    ptrdiff_t stride2 = line_size << 1;
    ptrdiff_t stride3 = stride2 + line_size;
    ptrdiff_t stride4 = line_size << 2;
    __m128i src0, src1, src2, src3;

    for (int i = 0; i < h_4; i++) {
        src0 = __lsx_vld(pixels, 0);
        src1 = __lsx_vldx(pixels, line_size);
        src2 = __lsx_vldx(pixels, stride2);
        src3 = __lsx_vldx(pixels, stride3);

        __lsx_vst(src0, block, 0);
        __lsx_vstx(src1, block, line_size);
        __lsx_vstx(src2, block, stride2);
        __lsx_vstx(src3, block, stride3);

        pixels += stride4;
        block  += stride4;
    }
}

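/*
 * The half-pel positions map directly onto the l2 helpers above:
 * x2 (horizontal half-pel) averages each row with the same row shifted
 * right by one byte (pixels + 1); y2 (vertical half-pel) averages each
 * row with the row below it (pixels + line_size).
 */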
void ff_put_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    put_pixels8_l2_8_lsx(block, pixels, pixels + 1, line_size, line_size,
                         line_size, h);
}

void ff_put_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    put_pixels8_l2_8_lsx(block, pixels, pixels + line_size, line_size,
                         line_size, line_size, h);
}

void ff_put_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    put_pixels16_l2_8_lsx(block, pixels, pixels + 1, line_size, line_size,
                          line_size, h);
}

void ff_put_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    put_pixels16_l2_8_lsx(block, pixels, pixels + line_size, line_size,
                          line_size, line_size, h);
}

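/*
 * The no_rnd variants below truncate instead of rounding:
 * __lasx_xvavg_bu computes (a + b) >> 1, while the rounded paths above
 * use vavgr.bu, i.e. (a + b + 1) >> 1.
 */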
/**
 * For width 16, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    int32_t h_4 = h >> 2;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t*)pixels + 1;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(pixels, 0);
        DUP2_ARG2(__lasx_xvldx, pixels, line_size, pixels, stride2x, src1, src2);
        src3 = __lasx_xvldx(pixels, stride3x);
        src4 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x, src5, src6);
        src7 = __lasx_xvldx(_src, stride3x);
        /* Pack two 16-byte rows per 256-bit register, then take the
         * truncating average of the aligned and the shifted rows. */
        DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4,
                  0x20, src7, src6, 0x20, src0, src1, src2, src3);
        src0 = __lasx_xvavg_bu(src0, src2);
        src1 = __lasx_xvavg_bu(src1, src3);
        __lasx_xvstelm_d(src0, block, 0, 0);
        __lasx_xvstelm_d(src0, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        __lasx_xvstelm_d(src0, block, 8, 3);
        block += line_size;
        __lasx_xvstelm_d(src1, block, 0, 0);
        __lasx_xvstelm_d(src1, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src1, block, 0, 2);
        __lasx_xvstelm_d(src1, block, 8, 3);
        block += line_size;

        _src   += stride4x;
        pixels += stride4x;
    }
}

/**
 * For width 16, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t*)pixels;
    int32_t h_4 = h >> 2;

    for (int i = 0; i < h_4; i++) {
        /* Load five consecutive rows; each output row is the average of
         * a row and the row below it. */
        src0 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x, src1, src2);
        src3 = __lasx_xvldx(_src, stride3x);
        _src += stride4x;
        src4 = __lasx_xvld(_src, 0);

        DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2,
                  0x20, src4, src3, 0x20, src0, src1, src2, src3);
        DUP2_ARG2(__lasx_xvavg_bu, src0, src1, src2, src3, src0, src2);

        __lasx_xvstelm_d(src0, block, 0, 0);
        __lasx_xvstelm_d(src0, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        __lasx_xvstelm_d(src0, block, 8, 3);
        block += line_size;
        __lasx_xvstelm_d(src2, block, 0, 0);
        __lasx_xvstelm_d(src2, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src2, block, 0, 2);
        __lasx_xvstelm_d(src2, block, 8, 3);
        block += line_size;
    }
}

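/*
 * no_rnd 16-wide xy2: bilinear average of four neighbours with a single
 * rounding constant, i.e. per output byte
 *     dst[x] = (src[x] + src[x + 1] +
 *               src[x + line_size] + src[x + line_size + 1] + 1) >> 2
 * sum0 carries the previous row's widened pair sums (plus the constant)
 * across iterations, so every source row is loaded and widened once.
 */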
void ff_put_no_rnd_pixels16_xy2_8_lasx(uint8_t *block,
                                       const uint8_t *pixels,
                                       ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3;
    __m256i sum0, sum1, sum2;

    src0 = __lasx_xvld(pixels, 0);
    src1 = __lasx_xvld(pixels, 1);
    src2 = __lasx_vext2xv_hu_bu(src0);
    src3 = __lasx_vext2xv_hu_bu(src1);
    sum0 = __lasx_xvadd_h(src2, src3);
    sum0 = __lasx_xvaddi_hu(sum0, 1);

    for (int i = 0; i < h; i++) {
        pixels += line_size;
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvld(pixels, 1);

        src2 = __lasx_vext2xv_hu_bu(src0);
        src3 = __lasx_vext2xv_hu_bu(src1);
        sum1 = __lasx_xvadd_h(src2, src3);
        sum2 = __lasx_xvadd_h(sum0, sum1);
        sum2 = __lasx_xvsrani_b_h(sum2, sum2, 2);

        sum0 = __lasx_xvaddi_hu(sum1, 1);
        __lasx_xvstelm_d(sum2, block, 0, 0);
        __lasx_xvstelm_d(sum2, block, 8, 3);

        block += line_size;
    }
}

/**
 * For width 8, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    int32_t stride2x = line_size << 1;
    int32_t stride3x = stride2x + line_size;
    int32_t stride4x = line_size << 2;
    uint8_t *_src = (uint8_t*)pixels + 1;
    int32_t h_4 = h >> 2;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(pixels, 0);
        DUP2_ARG2(__lasx_xvldx, pixels, line_size, pixels, stride2x, src1, src2);
        src3 = __lasx_xvldx(pixels, stride3x);
        src4 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x, src5, src6);
        src7 = __lasx_xvldx(_src, stride3x);
        /* Gather four 8-byte rows into each 256-bit register before the
         * truncating average. */
        DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src3, src2, src5, src4, src7, src6,
                  src0, src1, src2, src3);
        DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src1);
        src0 = __lasx_xvavg_bu(src0, src1);
        __lasx_xvstelm_d(src0, block, 0, 0);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 3);
        block += line_size;

        pixels += stride4x;
        _src   += stride4x;
    }
}

/**
 * For width 8, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t*)pixels;
    int32_t h_4 = h >> 2;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(_src, 0);
        /* src1..src4 are the next four rows; src4 (row 4) pairs with
         * row 3 for the last output row of this iteration. */
        DUP4_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x, _src,
                  stride3x, _src, stride4x, src1, src2, src3, src4);
        DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src2, src1, src3, src2, src4, src3,
                  src0, src1, src2, src3);
        DUP2_ARG3(__lasx_xvpermi_q, src2, src0, 0x20, src3, src1, 0x20, src0, src1);
        src0 = __lasx_xvavg_bu(src0, src1);
        __lasx_xvstelm_d(src0, block, 0, 0);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 3);
        block += line_size;

        _src += stride4x;
    }
}

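/*
 * 8-wide no_rnd xy2 on LSX: __lsx_vsllwil_hu_bu(v, 0) zero-extends the
 * low eight bytes to halfwords, so sum0/sum1 hold the horizontal pair
 * sums src[x] + src[x + 1] of one row; adding the previous row's sums
 * and narrowing by 2 yields (a + b + c + d + 1) >> 2 per output byte.
 */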
void ff_put_no_rnd_pixels8_xy2_8_lsx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    __m128i src0, src1, src2, src3;
    __m128i sum0, sum1, sum2;

    src0 = __lsx_vld(pixels, 0);
    src1 = __lsx_vld(pixels, 1);
    src2 = __lsx_vsllwil_hu_bu(src0, 0);
    src3 = __lsx_vsllwil_hu_bu(src1, 0);
    sum0 = __lsx_vadd_h(src2, src3);
    sum0 = __lsx_vaddi_hu(sum0, 1);

    for (int i = 0; i < h; i++) {
        pixels += line_size;
        src0 = __lsx_vld(pixels, 0);
        src1 = __lsx_vld(pixels, 1);
        src2 = __lsx_vsllwil_hu_bu(src0, 0);
        src3 = __lsx_vsllwil_hu_bu(src1, 0);
        sum1 = __lsx_vadd_h(src2, src3);
        sum2 = __lsx_vadd_h(sum0, sum1);
        sum2 = __lsx_vsrani_b_h(sum2, sum2, 2);

        sum0 = __lsx_vaddi_hu(sum1, 1);
        __lsx_vstelm_d(sum2, block, 0, 0);

        block += line_size;
    }
}

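/*
 * Rounded 16-wide xy2: same running-sum scheme as the no_rnd version
 * above, but with rounding constant 2: dst[x] = (a + b + c + d + 2) >> 2.
 */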
void ff_put_pixels16_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3;
    __m256i sum0, sum1, sum2;

    src0 = __lasx_xvld(pixels, 0);
    src1 = __lasx_xvld(pixels, 1);
    src2 = __lasx_vext2xv_hu_bu(src0);
    src3 = __lasx_vext2xv_hu_bu(src1);
    sum0 = __lasx_xvadd_h(src2, src3);
    sum0 = __lasx_xvaddi_hu(sum0, 2);

    for (int i = 0; i < h; i++) {
        pixels += line_size;
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvld(pixels, 1);

        src2 = __lasx_vext2xv_hu_bu(src0);
        src3 = __lasx_vext2xv_hu_bu(src1);
        sum1 = __lasx_xvadd_h(src2, src3);
        sum2 = __lasx_xvadd_h(sum0, sum1);
        sum2 = __lasx_xvsrani_b_h(sum2, sum2, 2);
        sum0 = __lasx_xvaddi_hu(sum1, 2);
        __lasx_xvstelm_d(sum2, block, 0, 0);
        __lasx_xvstelm_d(sum2, block, 8, 3);
        block += line_size;
    }
}

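/*
 * Bilinear h+v helper for the rounded 8-wide xy2 case: each row is
 * byte-interleaved with its one-pixel-shifted copy (xvilvl.b),
 * xvhaddw.hu.bu turns the interleaved pairs into per-pixel horizontal
 * sums, vertically adjacent rows are added, and xvsrarni.b.h applies
 * the rounding shift (sum + 2) >> 2 while narrowing back to bytes.
 */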
static void common_hv_bil_8w_lasx(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i src8, src9, sum0, sum1;
    uint8_t loop_cnt;
    int32_t src_stride_2x = src_stride << 1;
    int32_t src_stride_4x = src_stride << 2;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t *_src = (uint8_t*)src;

    DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src0, src5);
    _src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src1 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src2, src3);
        src4 = __lasx_xvldx(_src, src_stride_3x);
        _src += 1;
        src6 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src7, src8);
        src9 = __lasx_xvldx(_src, src_stride_3x);
        _src += (src_stride_4x - 1);
        DUP4_ARG2(__lasx_xvilvl_b, src5, src0, src6, src1, src7, src2, src8, src3,
                  src0, src1, src2, src3);
        src5 = __lasx_xvilvl_b(src9, src4);
        DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2,
                  0x20, src5, src3, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvhaddw_hu_bu, src0, src0, src1, src1, src2, src2,
                  src3, src3, src0, src1, src2, src3);
        DUP2_ARG2(__lasx_xvadd_h, src0, src1, src2, src3, sum0, sum1);
        sum0 = __lasx_xvsrarni_b_h(sum1, sum0, 2);
        __lasx_xvstelm_d(sum0, dst, 0, 0);
        __lasx_xvstelm_d(sum0, dst + dst_stride, 0, 2);
        __lasx_xvstelm_d(sum0, dst + dst_stride_2x, 0, 1);
        __lasx_xvstelm_d(sum0, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
        src0 = src4;
        src5 = src9;
    }
}

void ff_put_pixels8_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_lasx(pixels, line_size, block, line_size, h);
}