/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/snow.h"
#include "libavcodec/snow_dwt.h"

#if HAVE_INLINE_ASM

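/*
 * Note on data width: all of the routines below operate on IDWTELEM, a 16-bit
 * coefficient type, so one XMM register holds 8 samples and one MMX register
 * holds 4. The pcmpeq/psll/psrl prologues scattered through the code build the
 * per-word rounding offsets and pmulhw multipliers used by the lifting and
 * OBMC code (see the W_* macros in snow_dwt.h and the FRAC_BITS comments below).
 */
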
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0]; // By allowing the first entry, b[0], to be calculated twice
        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
        // The savings in code and time are well worth having to store this value and
        // calculate b[0] correctly afterwards.

        i = 0;
        __asm__ volatile(
            "pcmpeqd %%xmm7, %%xmm7 \n\t"
            "pcmpeqd %%xmm3, %%xmm3 \n\t"
            "psllw $1, %%xmm3 \n\t"
            "paddw %%xmm7, %%xmm3 \n\t"
            "psllw $13, %%xmm3 \n\t"
        ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "paddw %%xmm7, %%xmm2 \n\t"
                "paddw %%xmm7, %%xmm6 \n\t"
                "pmulhw %%xmm3, %%xmm2 \n\t"
                "pmulhw %%xmm3, %%xmm6 \n\t"
                "paddw (%0), %%xmm2 \n\t"
                "paddw 16(%0), %%xmm6 \n\t"
                "movdqa %%xmm2, (%0) \n\t"
                "movdqa %%xmm6, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubw %%xmm2, %%xmm0 \n\t"
                "psubw %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;
        IDWTELEM b_0 = b[0];

        i = 0;
        __asm__ volatile(
            "psllw $15, %%xmm7 \n\t"
            "pcmpeqw %%xmm6, %%xmm6 \n\t"
            "psrlw $13, %%xmm6 \n\t"
            "paddw %%xmm7, %%xmm6 \n\t"
        ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm0 \n\t"
                "movdqu 16(%1), %%xmm4 \n\t"
                "movdqu 2(%1), %%xmm1 \n\t"
                "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "paddw %%xmm7, %%xmm1 \n\t"
                "paddw %%xmm7, %%xmm5 \n\t"
                "pavgw %%xmm1, %%xmm0 \n\t"
                "pavgw %%xmm5, %%xmm4 \n\t"
                "psubw %%xmm7, %%xmm0 \n\t"
                "psubw %%xmm7, %%xmm4 \n\t"
                "psraw $1, %%xmm0 \n\t"
                "psraw $1, %%xmm4 \n\t"
                "movdqa (%0), %%xmm1 \n\t"
                "movdqa 16(%0), %%xmm5 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "psraw $2, %%xmm0 \n\t"
                "psraw $2, %%xmm4 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;

        i = 0;
        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw (%1), %%xmm2 \n\t"
                "paddw 16(%1), %%xmm6 \n\t"
                "movdqu (%0), %%xmm0 \n\t"
                "movdqu 16(%0), %%xmm4 \n\t"
                "paddw %%xmm2, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "psraw $1, %%xmm2 \n\t"
                "psraw $1, %%xmm6 \n\t"
                "paddw %%xmm0, %%xmm2 \n\t"
                "paddw %%xmm4, %%xmm6 \n\t"
                "movdqa %%xmm2, (%2) \n\t"
                "movdqa %%xmm6, 16(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x3E) != 0x3E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=62; i>=0; i-=64){
            __asm__ volatile(
                "movdqa (%1), %%xmm0 \n\t"
                "movdqa 16(%1), %%xmm2 \n\t"
                "movdqa 32(%1), %%xmm4 \n\t"
                "movdqa 48(%1), %%xmm6 \n\t"
                "movdqa (%1), %%xmm1 \n\t"
                "movdqa 16(%1), %%xmm3 \n\t"
                "movdqa 32(%1), %%xmm5 \n\t"
                "movdqa 48(%1), %%xmm7 \n\t"
                "punpcklwd (%2), %%xmm0 \n\t"
                "punpcklwd 16(%2), %%xmm2 \n\t"
                "punpcklwd 32(%2), %%xmm4 \n\t"
                "punpcklwd 48(%2), %%xmm6 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm2, 32(%0) \n\t"
                "movdqa %%xmm4, 64(%0) \n\t"
                "movdqa %%xmm6, 96(%0) \n\t"
                "punpckhwd (%2), %%xmm1 \n\t"
                "punpckhwd 16(%2), %%xmm3 \n\t"
                "punpckhwd 32(%2), %%xmm5 \n\t"
                "punpckhwd 48(%2), %%xmm7 \n\t"
                "movdqa %%xmm1, 16(%0) \n\t"
                "movdqa %%xmm3, 48(%0) \n\t"
                "movdqa %%xmm5, 80(%0) \n\t"
                "movdqa %%xmm7, 112(%0) \n\t"
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
                : "memory"
            );
        }
    }
}

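/*
 * For orientation: each "Lift" block above is one lifting step of the inverse
 * 9/7 integer wavelet. A generic scalar sketch of such a step (placeholder
 * names; edge mirroring and the per-step rounding offsets are omitted, and
 * lift_step is not an identifier from this file):
 *
 *     static void lift_step(IDWTELEM *dst, const IDWTELEM *ref, int n,
 *                           int mul, int add, int shift)
 *     {
 *         for (int i = 0; i < n; i++)
 *             dst[i] -= (mul * (ref[i] + ref[i + 1]) + add) >> shift;
 *     }
 *
 * with (mul, add, shift) taken from the W_[ABCD][MOS] macros in snow_dwt.h; the
 * scalar lead-in loop of ff_snow_vertical_compose97i_sse2() further down spells
 * out the exact formula each of the four steps uses.
 */
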
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;

        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "pcmpeqw %%mm3, %%mm3 \n\t"
            "psllw $1, %%mm3 \n\t"
            "paddw %%mm7, %%mm3 \n\t"
            "psllw $13, %%mm3 \n\t"
        ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "paddw %%mm7, %%mm2 \n\t"
                "paddw %%mm7, %%mm6 \n\t"
                "pmulhw %%mm3, %%mm2 \n\t"
                "pmulhw %%mm3, %%mm6 \n\t"
                "paddw (%0), %%mm2 \n\t"
                "paddw 8(%0), %%mm6 \n\t"
                "movq %%mm2, (%0) \n\t"
                "movq %%mm6, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubw %%mm2, %%mm0 \n\t"
                "psubw %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;

        i = 1;
        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
        __asm__ volatile(
            "psllw $15, %%mm7 \n\t"
            "pcmpeqw %%mm6, %%mm6 \n\t"
            "psrlw $13, %%mm6 \n\t"
            "paddw %%mm7, %%mm6 \n\t"
        ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm4 \n\t"
                "movq 2(%1), %%mm1 \n\t"
                "movq 10(%1), %%mm5 \n\t"
                "paddw %%mm6, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "paddw %%mm7, %%mm1 \n\t"
                "paddw %%mm7, %%mm5 \n\t"
                "pavgw %%mm1, %%mm0 \n\t"
                "pavgw %%mm5, %%mm4 \n\t"
                "psubw %%mm7, %%mm0 \n\t"
                "psubw %%mm7, %%mm4 \n\t"
                "psraw $1, %%mm0 \n\t"
                "psraw $1, %%mm4 \n\t"
                "movq (%0), %%mm1 \n\t"
                "movq 8(%0), %%mm5 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "psraw $2, %%mm0 \n\t"
                "psraw $2, %%mm4 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;
        i = 0;

        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq 2(%1), %%mm2 \n\t"
                "movq 10(%1), %%mm6 \n\t"
                "paddw (%1), %%mm2 \n\t"
                "paddw 8(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "paddw %%mm2, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "psraw $1, %%mm2 \n\t"
                "psraw $1, %%mm6 \n\t"
                "paddw %%mm0, %%mm2 \n\t"
                "paddw %%mm4, %%mm6 \n\t"
                "movq %%mm2, (%2) \n\t"
                "movq %%mm6, 8(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm2 \n\t"
                "movq 16(%1), %%mm4 \n\t"
                "movq 24(%1), %%mm6 \n\t"
                "movq (%1), %%mm1 \n\t"
                "movq 8(%1), %%mm3 \n\t"
                "movq 16(%1), %%mm5 \n\t"
                "movq 24(%1), %%mm7 \n\t"
                "punpcklwd (%2), %%mm0 \n\t"
                "punpcklwd 8(%2), %%mm2 \n\t"
                "punpcklwd 16(%2), %%mm4 \n\t"
                "punpcklwd 24(%2), %%mm6 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm2, 16(%0) \n\t"
                "movq %%mm4, 32(%0) \n\t"
                "movq %%mm6, 48(%0) \n\t"
                "punpckhwd (%2), %%mm1 \n\t"
                "punpckhwd 8(%2), %%mm3 \n\t"
                "punpckhwd 16(%2), %%mm5 \n\t"
                "punpckhwd 24(%2), %%mm7 \n\t"
                "movq %%mm1, 8(%0) \n\t"
                "movq %%mm3, 24(%0) \n\t"
                "movq %%mm5, 40(%0) \n\t"
                "movq %%mm7, 56(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory"
            );
        }
    }
}

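/*
 * The final block of both horizontal_compose97i versions re-interleaves the two
 * bands: after the lifting steps the low-pass samples sit in b[0..w2-1] and the
 * high-pass samples in temp[], and the punpcklwd/punpckhwd loops merge them so
 * that b[2*k] holds a low-pass and b[2*k+1] a high-pass sample. A scalar sketch
 * of the same store pattern, for even width (odd widths get their last sample
 * from snow_interleave_line_header()):
 *
 *     for (int k = w2 - 1; k >= 0; k--) {
 *         b[2 * k + 1] = temp[k];
 *         b[2 * k]     = b[k];
 *     }
 *
 * The loop runs downwards so that b[k] is read before the interleaved stores
 * can overwrite it.
 */
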
#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
    ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
    ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
    ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    "psubw %%"s0", %%"t0" \n\t"\
    "psubw %%"s1", %%"t1" \n\t"\
    "psubw %%"s2", %%"t2" \n\t"\
    "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
    "movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"

#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
    "psraw $"n", %%"t0" \n\t"\
    "psraw $"n", %%"t1" \n\t"\
    "psraw $"n", %%"t2" \n\t"\
    "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    "paddw %%"s0", %%"t0" \n\t"\
    "paddw %%"s1", %%"t1" \n\t"\
    "paddw %%"s2", %%"t2" \n\t"\
    "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
    "pmulhw %%"s0", %%"t0" \n\t"\
    "pmulhw %%"s1", %%"t1" \n\t"\
    "pmulhw %%"s2", %%"t2" \n\t"\
    "pmulhw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movdqa %%"s0", %%"t0" \n\t"\
    "movdqa %%"s1", %%"t1" \n\t"\
    "movdqa %%"s2", %%"t2" \n\t"\
    "movdqa %%"s3", %%"t3" \n\t"

static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;

    while(i & 0x1F)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i;

    __asm__ volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")


        "pcmpeqw %%xmm0, %%xmm0 \n\t"
        "pcmpeqw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "psllw $13, %%xmm2 \n\t"
        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "pcmpeqw %%xmm5, %%xmm5 \n\t"
        "psllw $15, %%xmm7 \n\t"
        "psrlw $13, %%xmm5 \n\t"
        "paddw %%xmm7, %%xmm5 \n\t"
        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
        "movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
        "movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm0 \n\t"
        "pavgw %%xmm3, %%xmm2 \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm4 \n\t"
        "pavgw %%xmm3, %%xmm6 \n\t"
        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")

        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")

        "2: \n\t"
        "sub $64, %%"FF_REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}

#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
    ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
    ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
    ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
    "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
    "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
    "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
    "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movq %%"s0", %%"t0" \n\t"\
    "movq %%"s1", %%"t1" \n\t"\
    "movq %%"s2", %%"t2" \n\t"\
    "movq %%"s3", %%"t3" \n\t"


static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;
    while(i & 15)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i;
    __asm__ volatile(
        "jmp 2f \n\t"
        "1: \n\t"

        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw %%mm0, %%mm0 \n\t"
        "pcmpeqw %%mm2, %%mm2 \n\t"
        "paddw %%mm2, %%mm2 \n\t"
        "paddw %%mm0, %%mm2 \n\t"
        "psllw $13, %%mm2 \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "pcmpeqw %%mm5, %%mm5 \n\t"
        "psllw $15, %%mm7 \n\t"
        "psrlw $13, %%mm5 \n\t"
        "paddw %%mm7, %%mm5 \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
        "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm0 \n\t"
        "pavgw %%mm3, %%mm2 \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm4 \n\t"
        "pavgw %%mm3, %%mm6 \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")

        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")

        "2: \n\t"
        "sub $32, %%"FF_REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
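/*
 * Loop shape shared by both vertical_compose97i versions above: the scalar
 * lead-in loop peels samples off the right edge until the remaining width is a
 * multiple of 32 (SSE2) or 16 (MMX) elements, "i += i" then turns the element
 * count into a byte offset (IDWTELEM is 16 bits wide), and the asm walks that
 * offset downwards ("sub $64" / "sub $32" at label 2), applying all four
 * lifting steps to one chunk of the rows b0..b5 per iteration.
 */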
#endif //HAVE_7REGS

#if HAVE_6REGS
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    x86_reg tmp;\
    __asm__ volatile(\
    "mov %7, %%"FF_REG_c" \n\t"\
    "mov %6, %2 \n\t"\
    "mov %4, %%"FF_REG_S" \n\t"\
    "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
    "pcmpeqd %%xmm3, %%xmm3 \n\t"\
    "psllw $15, %%xmm3 \n\t"\
    "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
    "1: \n\t"\
    "mov %1, %%"FF_REG_D" \n\t"\
    "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
    "add %3, %%"FF_REG_D" \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
    "add $32, %%"FF_REG_S" \n\t"\
    "add %%"FF_REG_c", %0 \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
    "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
    "sal $1, %%"FF_REG_c" \n\t"\
    "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "sar $1, %%"FF_REG_c" \n\t"\
    "sub $2, %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
    "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "dec %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

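/*
 * The inner_add_yblock_* routines below implement the OBMC accumulation used by
 * Snow motion compensation: for each output row, four obmc-weighted predictions
 * (block[3]..block[0], loaded and multiplied by the start/accum macros) are
 * summed, added to the IDWTELEM line fetched from the slice buffer together
 * with the FRAC_BITS rounding constant kept in xmm3/mm3, shifted back down,
 * clipped to 8 bits via the unsigned pack and stored to dst8. The _8 and _16
 * variants differ only in how many pixels are processed per iteration and in
 * the obmc table stride they assume.
 */
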
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                       int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")

        "mov %0, %%"FF_REG_d" \n\t"
        "movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
        "movdqa %%xmm1, %%xmm2 \n\t"

        "punpckhwd %%xmm7, %%xmm1 \n\t"
        "punpcklwd %%xmm7, %%xmm2 \n\t"
        "paddd %%xmm2, %%xmm0 \n\t"
        "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
        "paddd %%xmm1, %%xmm2 \n\t"
        "paddd %%xmm3, %%xmm0 \n\t"
        "paddd %%xmm3, %%xmm2 \n\t"

        "mov %1, %%"FF_REG_D" \n\t"
        "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
        "add %3, %%"FF_REG_D" \n\t"

        "movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
        "movdqa %%xmm5, %%xmm6 \n\t"
        "punpckhwd %%xmm7, %%xmm5 \n\t"
        "punpcklwd %%xmm7, %%xmm6 \n\t"
        "paddd %%xmm6, %%xmm4 \n\t"
        "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
        "paddd %%xmm5, %%xmm6 \n\t"
        "paddd %%xmm3, %%xmm4 \n\t"
        "paddd %%xmm3, %%xmm6 \n\t"

        "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
        "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
        "packssdw %%xmm2, %%xmm0 \n\t"
        "packuswb %%xmm7, %%xmm0 \n\t"
        "movq %%xmm0, (%%"FF_REG_d") \n\t"

        "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
        "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
        "packssdw %%xmm6, %%xmm4 \n\t"
        "packuswb %%xmm7, %%xmm4 \n\t"
        "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
snow_inner_add_yblock_sse2_end_8
}

static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")

        "mov %0, %%"FF_REG_d" \n\t"
        "psrlw $4, %%xmm1 \n\t"
        "psrlw $4, %%xmm5 \n\t"
        "paddw (%%"FF_REG_D"), %%xmm1 \n\t"
        "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
        "paddw %%xmm3, %%xmm1 \n\t"
        "paddw %%xmm3, %%xmm5 \n\t"
        "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
        "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
        "packuswb %%xmm5, %%xmm1 \n\t"

        "movdqu %%xmm1, (%%"FF_REG_d") \n\t"

snow_inner_add_yblock_sse2_end_16
}

#define snow_inner_add_yblock_mmx_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    x86_reg tmp;\
    __asm__ volatile(\
    "mov %7, %%"FF_REG_c" \n\t"\
    "mov %6, %2 \n\t"\
    "mov %4, %%"FF_REG_S" \n\t"\
    "pxor %%mm7, %%mm7 \n\t" /* 0 */\
    "pcmpeqd %%mm3, %%mm3 \n\t"\
    "psllw $15, %%mm3 \n\t"\
    "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
    "1: \n\t"\
    "mov %1, %%"FF_REG_D" \n\t"\
    "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
    "add %3, %%"FF_REG_D" \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%mm7, %%"out_reg1" \n\t"\
    "punpcklbw %%mm7, %%"out_reg2" \n\t"\
    "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
    "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "pmullw %%mm0, %%"out_reg1" \n\t"\
    "pmullw %%mm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
    snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
    "paddusw %%mm2, %%mm1 \n\t"\
    "paddusw %%mm6, %%mm5 \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
    "mov %0, %%"FF_REG_d" \n\t"\
    "psrlw $4, %%mm1 \n\t"\
    "psrlw $4, %%mm5 \n\t"\
    "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
    "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
    "paddw %%mm3, %%mm1 \n\t"\
    "paddw %%mm3, %%mm5 \n\t"\
    "psraw $4, %%mm1 \n\t"\
    "psraw $4, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm1 \n\t"\
    "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
    "add $"s_step", %%"FF_REG_S" \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
    "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
    "add %%"FF_REG_c", %0 \n\t"\
    "dec %2 \n\t"\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");

static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                              int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                               int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")

snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("16", "8")
snow_inner_add_yblock_mmx_end("32")
}

static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){

    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                         int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
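/*
 * Dispatch summary for the two wrappers above: the SSE2 entry point only covers
 * b_w == 16, and b_w == 8 with obmc_stride == 16 and an even b_h; the
 * odd-height 8-wide case drops to the MMX routine and every other geometry to
 * the generic C ff_snow_inner_add_yblock(). The MMX entry point handles
 * b_w == 16 and the 8-wide/obmc_stride == 16 case itself and likewise falls
 * back to the C version otherwise.
 */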
#endif /* HAVE_6REGS */

#endif /* HAVE_INLINE_ASM */

av_cold void ff_dwt_init_x86(SnowDWTContext *c)
{
#if HAVE_INLINE_ASM
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        if (mm_flags & AV_CPU_FLAG_SSE2 & 0) { // the "& 0" keeps this SSE2 branch disabled
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
#if HAVE_6REGS
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
#endif
        }
        else {
            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
                c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
            }
#if HAVE_6REGS
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
#endif
        }
    }
#endif /* HAVE_INLINE_ASM */
}
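
/*
 * Usage sketch (an assumption about the caller, not something defined in this
 * file): ff_dwt_init() in snow_dwt.c installs the portable C routines in the
 * SnowDWTContext first and then, on x86 builds, calls ff_dwt_init_x86() so that
 * the function pointers can be overridden with whichever of the versions above
 * the detected CPU flags permit.
 */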