31 uint8_t **
block,
int b_w,
int b_h,
int src_x,
32 int src_stride,
IDWTELEM *
const *lines,
33 int add, uint8_t *dst8);
38 const int w2= (
width+1)>>1;
39 const int w_l= (
width>>1);
40 const int w_r= w2 - 1;
49 "pcmpeqw %%mm7, %%mm7 \n\t"
50 "pcmpeqw %%mm3, %%mm3 \n\t"
51 "psllw $1, %%mm3 \n\t"
52 "paddw %%mm7, %%mm3 \n\t"
53 "psllw $13, %%mm3 \n\t"
57 "movq (%1), %%mm2 \n\t"
58 "movq 8(%1), %%mm6 \n\t"
59 "paddw 2(%1), %%mm2 \n\t"
60 "paddw 10(%1), %%mm6 \n\t"
61 "paddw %%mm7, %%mm2 \n\t"
62 "paddw %%mm7, %%mm6 \n\t"
63 "pmulhw %%mm3, %%mm2 \n\t"
64 "pmulhw %%mm3, %%mm6 \n\t"
65 "paddw (%0), %%mm2 \n\t"
66 "paddw 8(%0), %%mm6 \n\t"
67 "movq %%mm2, (%0) \n\t"
68 "movq %%mm6, 8(%0) \n\t"
69 ::
"r"(&
b[
i]),
"r"(&
ref[
i])
82 "movq (%1), %%mm2 \n\t"
83 "movq 8(%1), %%mm6 \n\t"
84 "paddw 2(%1), %%mm2 \n\t"
85 "paddw 10(%1), %%mm6 \n\t"
86 "movq (%0), %%mm0 \n\t"
87 "movq 8(%0), %%mm4 \n\t"
88 "psubw %%mm2, %%mm0 \n\t"
89 "psubw %%mm6, %%mm4 \n\t"
90 "movq %%mm0, (%0) \n\t"
91 "movq %%mm4, 8(%0) \n\t"
92 ::
"r"(&
dst[
i]),
"r"(&
b[
i])
105 "psllw $15, %%mm7 \n\t"
106 "pcmpeqw %%mm6, %%mm6 \n\t"
107 "psrlw $13, %%mm6 \n\t"
108 "paddw %%mm7, %%mm6 \n\t"
110 for(;
i<w_l-7;
i+=8){
112 "movq (%1), %%mm0 \n\t"
113 "movq 8(%1), %%mm4 \n\t"
114 "movq 2(%1), %%mm1 \n\t"
115 "movq 10(%1), %%mm5 \n\t"
116 "paddw %%mm6, %%mm0 \n\t"
117 "paddw %%mm6, %%mm4 \n\t"
118 "paddw %%mm7, %%mm1 \n\t"
119 "paddw %%mm7, %%mm5 \n\t"
120 "pavgw %%mm1, %%mm0 \n\t"
121 "pavgw %%mm5, %%mm4 \n\t"
122 "psubw %%mm7, %%mm0 \n\t"
123 "psubw %%mm7, %%mm4 \n\t"
124 "psraw $1, %%mm0 \n\t"
125 "psraw $1, %%mm4 \n\t"
126 "movq (%0), %%mm1 \n\t"
127 "movq 8(%0), %%mm5 \n\t"
128 "paddw %%mm1, %%mm0 \n\t"
129 "paddw %%mm5, %%mm4 \n\t"
130 "psraw $2, %%mm0 \n\t"
131 "psraw $2, %%mm4 \n\t"
132 "paddw %%mm1, %%mm0 \n\t"
133 "paddw %%mm5, %%mm4 \n\t"
134 "movq %%mm0, (%0) \n\t"
135 "movq %%mm4, 8(%0) \n\t"
136 ::
"r"(&
b[
i]),
"r"(&
ref[
i])
147 for(;
i<w_r-7;
i+=8){
149 "movq 2(%1), %%mm2 \n\t"
150 "movq 10(%1), %%mm6 \n\t"
151 "paddw (%1), %%mm2 \n\t"
152 "paddw 8(%1), %%mm6 \n\t"
153 "movq (%0), %%mm0 \n\t"
154 "movq 8(%0), %%mm4 \n\t"
155 "paddw %%mm2, %%mm0 \n\t"
156 "paddw %%mm6, %%mm4 \n\t"
157 "psraw $1, %%mm2 \n\t"
158 "psraw $1, %%mm6 \n\t"
159 "paddw %%mm0, %%mm2 \n\t"
160 "paddw %%mm4, %%mm6 \n\t"
161 "movq %%mm2, (%2) \n\t"
162 "movq %%mm6, 8(%2) \n\t"
173 for (; (
i & 0x1E) != 0x1E;
i-=2){
177 for (
i-=30;
i>=0;
i-=32){
179 "movq (%1), %%mm0 \n\t"
180 "movq 8(%1), %%mm2 \n\t"
181 "movq 16(%1), %%mm4 \n\t"
182 "movq 24(%1), %%mm6 \n\t"
183 "movq (%1), %%mm1 \n\t"
184 "movq 8(%1), %%mm3 \n\t"
185 "movq 16(%1), %%mm5 \n\t"
186 "movq 24(%1), %%mm7 \n\t"
187 "punpcklwd (%2), %%mm0 \n\t"
188 "punpcklwd 8(%2), %%mm2 \n\t"
189 "punpcklwd 16(%2), %%mm4 \n\t"
190 "punpcklwd 24(%2), %%mm6 \n\t"
191 "movq %%mm0, (%0) \n\t"
192 "movq %%mm2, 16(%0) \n\t"
193 "movq %%mm4, 32(%0) \n\t"
194 "movq %%mm6, 48(%0) \n\t"
195 "punpckhwd (%2), %%mm1 \n\t"
196 "punpckhwd 8(%2), %%mm3 \n\t"
197 "punpckhwd 16(%2), %%mm5 \n\t"
198 "punpckhwd 24(%2), %%mm7 \n\t"
199 "movq %%mm1, 8(%0) \n\t"
200 "movq %%mm3, 24(%0) \n\t"
201 "movq %%mm5, 40(%0) \n\t"
202 "movq %%mm7, 56(%0) \n\t"
203 ::
"r"(&
b[
i]),
"r"(&
b[
i>>1]),
"r"(&
temp[
i>>1])
/*
 * Inline-asm text helper: expands to four "psubw" instructions, one per
 * (s_k, t_k) pair, in AT&T operand order (s_k is the first operand, t_k the
 * second, i.e. the one that is modified).
 * Every argument must be a string literal holding a register name without
 * the '%' prefix (e.g. "mm1"); the macro adds the "%%" itself and relies on
 * adjacent string-literal concatenation.
 */
211 #define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
212 "psubw %%"s0", %%"t0" \n\t"\
213 "psubw %%"s1", %%"t1" \n\t"\
214 "psubw %%"s2", %%"t2" \n\t"\
215 "psubw %%"s3", %%"t3" \n\t"
/*
 * Inline-asm text helper: expands to four "psraw $n" instructions, applying
 * the same immediate shift count to each of t0..t3.
 * n is a string literal holding the shift count (e.g. "1"); t0..t3 are
 * string literals holding register names without the '%' prefix.
 */
217 #define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
218 "psraw $"n", %%"t0" \n\t"\
219 "psraw $"n", %%"t1" \n\t"\
220 "psraw $"n", %%"t2" \n\t"\
221 "psraw $"n", %%"t3" \n\t"
/*
 * Inline-asm text helper: expands to four "paddw" instructions, one per
 * (s_k, t_k) pair, in AT&T operand order (s_k first, t_k second/modified).
 * The same source register may be passed for all four s arguments.
 * Every argument must be a string literal holding a register name without
 * the '%' prefix.
 */
223 #define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
224 "paddw %%"s0", %%"t0" \n\t"\
225 "paddw %%"s1", %%"t1" \n\t"\
226 "paddw %%"s2", %%"t2" \n\t"\
227 "paddw %%"s3", %%"t3" \n\t"
/*
 * Inline-asm text helper: expands to four "pmulhw" instructions, one per
 * (s_k, t_k) pair, in AT&T operand order (s_k first, t_k second/modified).
 * Every argument must be a string literal holding a register name without
 * the '%' prefix.
 */
229 #define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
230 "pmulhw %%"s0", %%"t0" \n\t"\
231 "pmulhw %%"s1", %%"t1" \n\t"\
232 "pmulhw %%"s2", %%"t2" \n\t"\
233 "pmulhw %%"s3", %%"t3" \n\t"
/*
 * Inline-asm text helper: expands to four instructions "op mem, t_k" whose
 * memory operand is (r, FF_REG_d) at byte displacements 0, 8, 16 and 24.
 *   op      string literal with the instruction mnemonic (e.g. "movq", "paddw")
 *   r       string literal naming the base operand as written in the asm
 *           template (e.g. "%4")
 *   t0..t3  string literals with register names without the '%' prefix
 * FF_REG_d is a string macro defined outside this view and is used as the
 * index register.
 */
235 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
236 ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
237 ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
238 ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
239 ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
/*
 * Loads four quadwords from (r, FF_REG_d) at displacements 0/8/16/24 into
 * t0..t3 by instantiating snow_vertical_compose_mmx_load_add with "movq".
 */
241 #define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
242 snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
/*
 * Adds the four quadwords at (r, FF_REG_d), displacements 0/8/16/24, to
 * t0..t3 by instantiating snow_vertical_compose_mmx_load_add with "paddw".
 */
244 #define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
245 snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
/*
 * Inline-asm text helper: expands to four "movq" stores writing s0..s3 to
 * the memory operand (w, FF_REG_d) at byte displacements 0, 8, 16 and 24.
 *   w       string literal naming the base operand as written in the asm
 *           template (e.g. "%5")
 *   s0..s3  string literals with register names without the '%' prefix
 */
247 #define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
248 "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
249 "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
250 "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
251 "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
/*
 * Inline-asm text helper: expands to four register-to-register "movq"
 * instructions copying s_k into t_k for k = 0..3.
 * Every argument must be a string literal holding a register name without
 * the '%' prefix.
 */
253 #define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
254 "movq %%"s0", %%"t0" \n\t"\
255 "movq %%"s1", %%"t1" \n\t"\
256 "movq %%"s2", %%"t2" \n\t"\
257 "movq %%"s3", %%"t3" \n\t"
275 snow_vertical_compose_mmx_load(
"%4",
"mm1",
"mm3",
"mm5",
"mm7")
276 snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
277 "pcmpeqw %%mm0, %%mm0 \n\t"
278 "pcmpeqw %%mm2, %%mm2 \n\t"
279 "paddw %%mm2, %%mm2 \n\t"
280 "paddw %%mm0, %%mm2 \n\t"
281 "psllw $13, %%mm2 \n\t"
282 snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
283 snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
284 snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
285 snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
286 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
287 snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
288 snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
289 snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
290 "pcmpeqw %%mm7, %%mm7 \n\t"
291 "pcmpeqw %%mm5, %%mm5 \n\t"
292 "psllw $15, %%mm7 \n\t"
293 "psrlw $13, %%mm5 \n\t"
294 "paddw %%mm7, %%mm5 \n\t"
295 snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
296 "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
297 "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
298 "paddw %%mm7, %%mm1 \n\t"
299 "paddw %%mm7, %%mm3 \n\t"
300 "pavgw %%mm1, %%mm0 \n\t"
301 "pavgw %%mm3, %%mm2 \n\t"
302 "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
303 "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
304 "paddw %%mm7, %%mm1 \n\t"
305 "paddw %%mm7, %%mm3 \n\t"
306 "pavgw %%mm1, %%mm4 \n\t"
307 "pavgw %%mm3, %%mm6 \n\t"
308 snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
309 snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
310 snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
312 snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
313 snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
314 snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
315 snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
316 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
317 snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
318 snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
319 snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
320 snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
323 "sub $32, %%"FF_REG_d" \n\t"
338 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
340 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
344 #if HAVE_SSSE3_EXTERNAL