/*
 * FFmpeg — snowdsp_init.c
 * (Doxygen page header from the scrape; original link text:
 *  "Go to the documentation of this file.")
 */
1 /*
2  * ASM optimized Snow DSP utils
3  * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <stdint.h>
23 #include "config.h"
24 #include "libavutil/attributes.h"
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/snow_dwt.h"
29 
/* OBMC inner block add, SSSE3 version implemented in external assembly
 * (wired up below under the HAVE_SSSE3_EXTERNAL / EXTERNAL_SSSE3 guard). */
void ff_snow_inner_add_yblock_ssse3(const uint8_t *obmc, const int obmc_stride,
                                    uint8_t **block, int b_w, int b_h, int src_x,
                                    int src_stride, IDWTELEM *const *lines,
                                    int add, uint8_t *dst8);
34 
35 #if HAVE_INLINE_ASM
36 
37 static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
38  const int w2= (width+1)>>1;
39  const int w_l= (width>>1);
40  const int w_r= w2 - 1;
41  int i;
42 
43  { // Lift 0
44  IDWTELEM * const ref = b + w2 - 1;
45 
46  i = 1;
47  b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
48  __asm__ volatile(
49  "pcmpeqw %%mm7, %%mm7 \n\t"
50  "pcmpeqw %%mm3, %%mm3 \n\t"
51  "psllw $1, %%mm3 \n\t"
52  "paddw %%mm7, %%mm3 \n\t"
53  "psllw $13, %%mm3 \n\t"
54  ::);
55  for(; i<w_l-7; i+=8){
56  __asm__ volatile(
57  "movq (%1), %%mm2 \n\t"
58  "movq 8(%1), %%mm6 \n\t"
59  "paddw 2(%1), %%mm2 \n\t"
60  "paddw 10(%1), %%mm6 \n\t"
61  "paddw %%mm7, %%mm2 \n\t"
62  "paddw %%mm7, %%mm6 \n\t"
63  "pmulhw %%mm3, %%mm2 \n\t"
64  "pmulhw %%mm3, %%mm6 \n\t"
65  "paddw (%0), %%mm2 \n\t"
66  "paddw 8(%0), %%mm6 \n\t"
67  "movq %%mm2, (%0) \n\t"
68  "movq %%mm6, 8(%0) \n\t"
69  :: "r"(&b[i]), "r"(&ref[i])
70  : "memory"
71  );
72  }
74  }
75 
76  { // Lift 1
77  IDWTELEM * const dst = b+w2;
78 
79  i = 0;
80  for(; i<w_r-7; i+=8){
81  __asm__ volatile(
82  "movq (%1), %%mm2 \n\t"
83  "movq 8(%1), %%mm6 \n\t"
84  "paddw 2(%1), %%mm2 \n\t"
85  "paddw 10(%1), %%mm6 \n\t"
86  "movq (%0), %%mm0 \n\t"
87  "movq 8(%0), %%mm4 \n\t"
88  "psubw %%mm2, %%mm0 \n\t"
89  "psubw %%mm6, %%mm4 \n\t"
90  "movq %%mm0, (%0) \n\t"
91  "movq %%mm4, 8(%0) \n\t"
92  :: "r"(&dst[i]), "r"(&b[i])
93  : "memory"
94  );
95  }
97  }
98 
99  { // Lift 2
100  IDWTELEM * const ref = b+w2 - 1;
101 
102  i = 1;
103  b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
104  __asm__ volatile(
105  "psllw $15, %%mm7 \n\t"
106  "pcmpeqw %%mm6, %%mm6 \n\t"
107  "psrlw $13, %%mm6 \n\t"
108  "paddw %%mm7, %%mm6 \n\t"
109  ::);
110  for(; i<w_l-7; i+=8){
111  __asm__ volatile(
112  "movq (%1), %%mm0 \n\t"
113  "movq 8(%1), %%mm4 \n\t"
114  "movq 2(%1), %%mm1 \n\t"
115  "movq 10(%1), %%mm5 \n\t"
116  "paddw %%mm6, %%mm0 \n\t"
117  "paddw %%mm6, %%mm4 \n\t"
118  "paddw %%mm7, %%mm1 \n\t"
119  "paddw %%mm7, %%mm5 \n\t"
120  "pavgw %%mm1, %%mm0 \n\t"
121  "pavgw %%mm5, %%mm4 \n\t"
122  "psubw %%mm7, %%mm0 \n\t"
123  "psubw %%mm7, %%mm4 \n\t"
124  "psraw $1, %%mm0 \n\t"
125  "psraw $1, %%mm4 \n\t"
126  "movq (%0), %%mm1 \n\t"
127  "movq 8(%0), %%mm5 \n\t"
128  "paddw %%mm1, %%mm0 \n\t"
129  "paddw %%mm5, %%mm4 \n\t"
130  "psraw $2, %%mm0 \n\t"
131  "psraw $2, %%mm4 \n\t"
132  "paddw %%mm1, %%mm0 \n\t"
133  "paddw %%mm5, %%mm4 \n\t"
134  "movq %%mm0, (%0) \n\t"
135  "movq %%mm4, 8(%0) \n\t"
136  :: "r"(&b[i]), "r"(&ref[i])
137  : "memory"
138  );
139  }
141  }
142 
143  { // Lift 3
144  IDWTELEM * const src = b+w2;
145  i = 0;
146 
147  for(; i<w_r-7; i+=8){
148  __asm__ volatile(
149  "movq 2(%1), %%mm2 \n\t"
150  "movq 10(%1), %%mm6 \n\t"
151  "paddw (%1), %%mm2 \n\t"
152  "paddw 8(%1), %%mm6 \n\t"
153  "movq (%0), %%mm0 \n\t"
154  "movq 8(%0), %%mm4 \n\t"
155  "paddw %%mm2, %%mm0 \n\t"
156  "paddw %%mm6, %%mm4 \n\t"
157  "psraw $1, %%mm2 \n\t"
158  "psraw $1, %%mm6 \n\t"
159  "paddw %%mm0, %%mm2 \n\t"
160  "paddw %%mm4, %%mm6 \n\t"
161  "movq %%mm2, (%2) \n\t"
162  "movq %%mm6, 8(%2) \n\t"
163  :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
164  : "memory"
165  );
166  }
168  }
169 
170  {
172 
173  for (; (i & 0x1E) != 0x1E; i-=2){
174  b[i+1] = temp[i>>1];
175  b[i] = b[i>>1];
176  }
177  for (i-=30; i>=0; i-=32){
178  __asm__ volatile(
179  "movq (%1), %%mm0 \n\t"
180  "movq 8(%1), %%mm2 \n\t"
181  "movq 16(%1), %%mm4 \n\t"
182  "movq 24(%1), %%mm6 \n\t"
183  "movq (%1), %%mm1 \n\t"
184  "movq 8(%1), %%mm3 \n\t"
185  "movq 16(%1), %%mm5 \n\t"
186  "movq 24(%1), %%mm7 \n\t"
187  "punpcklwd (%2), %%mm0 \n\t"
188  "punpcklwd 8(%2), %%mm2 \n\t"
189  "punpcklwd 16(%2), %%mm4 \n\t"
190  "punpcklwd 24(%2), %%mm6 \n\t"
191  "movq %%mm0, (%0) \n\t"
192  "movq %%mm2, 16(%0) \n\t"
193  "movq %%mm4, 32(%0) \n\t"
194  "movq %%mm6, 48(%0) \n\t"
195  "punpckhwd (%2), %%mm1 \n\t"
196  "punpckhwd 8(%2), %%mm3 \n\t"
197  "punpckhwd 16(%2), %%mm5 \n\t"
198  "punpckhwd 24(%2), %%mm7 \n\t"
199  "movq %%mm1, 8(%0) \n\t"
200  "movq %%mm3, 24(%0) \n\t"
201  "movq %%mm5, 40(%0) \n\t"
202  "movq %%mm7, 56(%0) \n\t"
203  :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
204  : "memory"
205  );
206  }
207  }
208 }
209 
210 #if HAVE_7REGS
/*
 * Building-block macros for the vertical compose asm below.  Each emits the
 * same MMX instruction four times so the loop handles four mm registers
 * (4 x 4 = 16 IDWTELEMs) per step.  s0..s3 / t0..t3 are mm-register name
 * strings; r / w are asm operand strings ("%1".."%6"), always indexed by the
 * byte counter kept in FF_REG_d.
 */

/* t -= s (packed 16-bit), four registers */
#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubw %%"s0", %%"t0" \n\t"\
        "psubw %%"s1", %%"t1" \n\t"\
        "psubw %%"s2", %%"t2" \n\t"\
        "psubw %%"s3", %%"t3" \n\t"

/* t >>= n (packed arithmetic shift), four registers */
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
        "psraw $"n", %%"t0" \n\t"\
        "psraw $"n", %%"t1" \n\t"\
        "psraw $"n", %%"t2" \n\t"\
        "psraw $"n", %%"t3" \n\t"

/* t += s (packed 16-bit), four registers */
#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddw %%"s0", %%"t0" \n\t"\
        "paddw %%"s1", %%"t1" \n\t"\
        "paddw %%"s2", %%"t2" \n\t"\
        "paddw %%"s3", %%"t3" \n\t"

/* t = (t * s) >> 16 (packed signed high multiply), four registers */
#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
        "pmulhw %%"s0", %%"t0" \n\t"\
        "pmulhw %%"s1", %%"t1" \n\t"\
        "pmulhw %%"s2", %%"t2" \n\t"\
        "pmulhw %%"s3", %%"t3" \n\t"

/* apply "op" (movq or paddw) from 32 consecutive bytes at r+FF_REG_d */
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"FF_REG_d"), %%"t0"   \n\t"\
        ""op" 8("r",%%"FF_REG_d"), %%"t1"  \n\t"\
        ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
        ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"

/* load 32 bytes from line r into four registers */
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

/* add 32 bytes from line r into four registers */
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

/* store four registers to 32 consecutive bytes at w+FF_REG_d */
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", ("w",%%"FF_REG_d")   \n\t"\
        "movq %%"s1", 8("w",%%"FF_REG_d")  \n\t"\
        "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
        "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"

/* register-to-register copy, four registers */
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"
258 
259 
/**
 * Vertical inverse lifting of the Snow 9/7 integer wavelet, MMX version.
 *
 * Applies all four lifting steps across six consecutive line pointers
 * b0..b5, 16 coefficients per asm iteration.  The scalar while-loop first
 * peels the tail so the remaining count is a multiple of 16.
 *
 * Requires 7 GP registers (guarded by HAVE_7REGS); the counter i lives in
 * edx/rdx via the "d" constraint and counts bytes downward.
 */
static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;
    /* scalar tail until width is a multiple of 16 coefficients */
    while(i & 15)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i; /* element count -> byte offset (IDWTELEM is a 2-byte short) */
    /* Single asm block: jumps to label 2 first, then loops backwards over
     * the lines 32 bytes (16 coefficients) at a time. */
    __asm__ volatile(
        "jmp 2f                                      \n\t"
        "1:                                          \n\t"

        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw    %%mm0, %%mm0                     \n\t"
        "pcmpeqw    %%mm2, %%mm2                     \n\t"
        "paddw      %%mm2, %%mm2                     \n\t"
        "paddw      %%mm0, %%mm2                     \n\t"
        "psllw        $13, %%mm2                     \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7                        \n\t"
        "pcmpeqw %%mm5, %%mm5                        \n\t"
        "psllw $15, %%mm7                            \n\t"
        "psrlw $13, %%mm5                            \n\t"
        "paddw %%mm7, %%mm5                          \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq   (%2,%%"FF_REG_d"), %%mm1             \n\t"
        "movq  8(%2,%%"FF_REG_d"), %%mm3             \n\t"
        "paddw %%mm7, %%mm1                          \n\t"
        "paddw %%mm7, %%mm3                          \n\t"
        "pavgw %%mm1, %%mm0                          \n\t"
        "pavgw %%mm3, %%mm2                          \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%mm1             \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%mm3             \n\t"
        "paddw %%mm7, %%mm1                          \n\t"
        "paddw %%mm7, %%mm3                          \n\t"
        "pavgw %%mm1, %%mm4                          \n\t"
        "pavgw %%mm3, %%mm6                          \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")

        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")

        "2:                                          \n\t"
        "sub $32, %%"FF_REG_d"                       \n\t"
        "jge 1b                                      \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
328 #endif //HAVE_7REGS
329 
330 #endif /* HAVE_INLINE_ASM */
331 
333 {
334  int cpuflags = av_get_cpu_flags();
335 
336 #if HAVE_INLINE_ASM
337  if (INLINE_MMXEXT(cpuflags)) {
338  c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
339 #if HAVE_7REGS
340  c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
341 #endif
342  }
343 #endif /* HAVE_INLINE_ASM */
344 #if HAVE_SSSE3_EXTERNAL
345  if (EXTERNAL_SSSE3(cpuflags)) {
346  c->inner_add_yblock = ff_snow_inner_add_yblock_ssse3;
347  }
348 #endif
349 }
/*
 * Doxygen cross-reference index (scraper residue, preserved as a comment):
 *   int x86_reg                                   -- asm.h:71
 *   short IDWTELEM                                -- dirac_dwt.h:27
 *   struct SnowDWTContext                         -- snow_dwt.h:56
 *   int av_get_cpu_flags(void)                    -- cpu.c:109
 *       (returns the flags which specify extensions supported by the CPU)
 *   #define av_cold                               -- attributes.h:111
 *   #define INLINE_MMXEXT(flags)                  -- cpu.h:81
 *   #define EXTERNAL_SSSE3(flags)                 -- cpu.h:59
 *   #define W_AM / W_AO / W_AS, W_BM / W_BO / W_BS,
 *           W_CM / W_CO / W_CS, W_DM / W_DO / W_DS -- snow_dwt.h:72-87
 *   static av_always_inline void snow_horizontal_compose_lift_lead_out(
 *       int i, IDWTELEM *dst, const IDWTELEM *src, const IDWTELEM *ref,
 *       int width, int w, int lift_high, int mul, int add, int shift)
 *                                                 -- snow_dwt.h:114
 *   static av_always_inline void snow_horizontal_compose_liftS_lead_out(
 *       int i, IDWTELEM *dst, const IDWTELEM *src, const IDWTELEM *ref,
 *       int width, int w)                         -- snow_dwt.h:123
 *   static av_always_inline void snow_interleave_line_header(
 *       int *i, int width, IDWTELEM *low, IDWTELEM *high)
 *                                                 -- snow_dwt.h:96
 *   av_cold void ff_dwt_init_x86(SnowDWTContext *c) -- snowdsp_init.c:332
 *   void ff_snow_inner_add_yblock_ssse3(const uint8_t *obmc,
 *       const int obmc_stride, uint8_t **block, int b_w, int b_h,
 *       int src_x, int src_stride, IDWTELEM *const *lines, int add,
 *       uint8_t *dst8)                            -- external asm
 */