#define hadamard_func(cpu)                                                  \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                  uint8_t *src2, ptrdiff_t stride, int h);  \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,       \
                                    uint8_t *src2, ptrdiff_t stride, int h);
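/* The macro above only declares prototypes; the bodies live in external
 * assembly. As in the full source, it is expanded once per instruction set
 * that provides an implementation: */
hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)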
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;
    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) -
             ff_hf_noise16_mmx(pix2, stride, h);
    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = c ? c->mecc.sse[1](c, pix1, pix2, stride, h)
                   : ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);
    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2, %0\n"                              \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"
163 "pxor %%mm6, %%mm6\n"
164 "pxor %%mm7, %%mm7\n"
166 "movq 8(%0), %%mm1\n"
171 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
173 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
178 "movq %%mm6, %%mm0\n"
180 "paddw %%mm6, %%mm0\n"
181 "movq %%mm0, %%mm6\n"
183 "paddw %%mm6, %%mm0\n"
185 :
"+r" (pix),
"=r" (
tmp)
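/* Scalar sketch of the intra case above: the "vertical SAD" of a block is
 * the sum of absolute differences between each 16-byte row and the row
 * before it (illustrative only; the MMX code keeps the previous row in
 * registers and processes two rows per loop iteration): */
static int vsad_intra16_c_sketch(const uint8_t *pix, ptrdiff_t stride, int h)
{
    int score = 0;
    for (int y = 1; y < h; y++)
        for (int x = 0; x < 16; x++)
            score += FFABS(pix[y * stride + x] - pix[(y - 1) * stride + x]);
    return score;
}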
#undef SUM

static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq (%1), " #out0 "\n"                    \
    "movq 8(%0), %%mm3\n"                       \
    "movq 8(%1), " #out1 "\n"                   \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb " #out0 ", %%mm2\n"                  \
    "psubb " #out1 ", %%mm3\n"                  \
    "pxor %%mm7, %%mm2\n"                       \
    "pxor %%mm7, %%mm3\n"                       \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"
235 "pxor %%mm6, %%mm6\n"
236 "pcmpeqw %%mm7, %%mm7\n"
238 "packsswb %%mm7, %%mm7\n"
241 "movq 8(%0), %%mm1\n"
242 "movq 8(%1), %%mm3\n"
245 "psubb %%mm2, %%mm0\n"
246 "psubb %%mm3, %%mm1\n"
247 "pxor %%mm7, %%mm0\n"
248 "pxor %%mm7, %%mm1\n"
252 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
254 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
259 "movq %%mm6, %%mm0\n"
261 "paddw %%mm6, %%mm0\n"
262 "movq %%mm0, %%mm6\n"
264 "paddw %%mm6, %%mm0\n"
266 :
"+r" (pix1),
"+r" (pix2),
"=r" (
tmp)
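/* Scalar sketch of what the inline asm above accumulates: the inter
 * "vertical SAD" sums the absolute vertical change of the residual
 * pix1 - pix2. The pcmpeqw/psllw/packsswb preamble builds an all-0x80
 * constant in %%mm7: XORing the signed byte difference with 0x80 maps it
 * into unsigned range so the psubusb/por absolute-value trick applies. */
static int vsad16_c_sketch(const uint8_t *pix1, const uint8_t *pix2,
                           ptrdiff_t stride, int h)
{
    int score = 0;
    for (int y = 1; y < h; y++)
        for (int x = 0; x < 16; x++)
            score += FFABS((pix1[y * stride + x] - pix2[y * stride + x]) -
                           (pix1[(y - 1) * stride + x] -
                            pix2[(y - 1) * stride + x]));
    return score;
}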
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};
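/* round_tab holds the rounding bias for the half-pel averages, replicated
 * across four 16-bit lanes: round_tab[1] biases the two-tap average and
 * round_tab[2] the four-tap one. In scalar terms:
 *
 *     two-tap:  (a + b + 1) >> 1            -- x2/y2 half-pel
 *     four-tap: (a + b + c + d + 2) >> 2    -- xy2 half-pel
 */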
287 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t"
288 "movq (%2, %%"FF_REG_a
"), %%mm2 \n\t"
289 "movq (%2, %%"FF_REG_a
"), %%mm4 \n\t"
290 "add %3, %%"FF_REG_a
" \n\t"
291 "psubusb %%mm0, %%mm2 \n\t"
292 "psubusb %%mm4, %%mm0 \n\t"
293 "movq (%1, %%"FF_REG_a
"), %%mm1 \n\t"
294 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t"
295 "movq (%2, %%"FF_REG_a
"), %%mm5 \n\t"
296 "psubusb %%mm1, %%mm3 \n\t"
297 "psubusb %%mm5, %%mm1 \n\t"
298 "por %%mm2, %%mm0 \n\t"
299 "por %%mm1, %%mm3 \n\t"
300 "movq %%mm0, %%mm1 \n\t"
301 "movq %%mm3, %%mm2 \n\t"
302 "punpcklbw %%mm7, %%mm0 \n\t"
303 "punpckhbw %%mm7, %%mm1 \n\t"
304 "punpcklbw %%mm7, %%mm3 \n\t"
305 "punpckhbw %%mm7, %%mm2 \n\t"
306 "paddw %%mm1, %%mm0 \n\t"
307 "paddw %%mm3, %%mm2 \n\t"
308 "paddw %%mm2, %%mm0 \n\t"
309 "paddw %%mm0, %%mm6 \n\t"
310 "add %3, %%"FF_REG_a
" \n\t"
323 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t"
324 "movq (%2, %%"FF_REG_a
"), %%mm1 \n\t"
325 "movq (%1, %%"FF_REG_a
"), %%mm2 \n\t"
326 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t"
327 "punpcklbw %%mm7, %%mm0 \n\t"
328 "punpcklbw %%mm7, %%mm1 \n\t"
329 "punpckhbw %%mm7, %%mm2 \n\t"
330 "punpckhbw %%mm7, %%mm3 \n\t"
331 "paddw %%mm0, %%mm1 \n\t"
332 "paddw %%mm2, %%mm3 \n\t"
333 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t"
334 "movq (%3, %%"FF_REG_a
"), %%mm2 \n\t"
335 "paddw %%mm5, %%mm1 \n\t"
336 "paddw %%mm5, %%mm3 \n\t"
337 "psrlw $1, %%mm1 \n\t"
338 "psrlw $1, %%mm3 \n\t"
339 "packuswb %%mm3, %%mm1 \n\t"
340 "psubusb %%mm1, %%mm4 \n\t"
341 "psubusb %%mm2, %%mm1 \n\t"
342 "por %%mm4, %%mm1 \n\t"
343 "movq %%mm1, %%mm0 \n\t"
344 "punpcklbw %%mm7, %%mm0 \n\t"
345 "punpckhbw %%mm7, %%mm1 \n\t"
346 "paddw %%mm1, %%mm0 \n\t"
347 "paddw %%mm0, %%mm6 \n\t"
348 "add %4, %%"FF_REG_a
" \n\t"
351 :
"r" (blk1a -
len),
"r" (blk1b -
len),
"r" (blk2 -
len),
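/* sad8_2 scores SAD against the rounded average of two predictions (the
 * x2/y2 half-pel cases); %%mm5 carries round_tab[1], preloaded by the
 * caller. One sample in scalar form: */
static inline int sad_halfpel2_sketch(int a, int b, int ref)
{
    int avg = (a + b + 1) >> 1;               /* paddw %%mm5 / psrlw $1 */
    return avg > ref ? avg - ref : ref - avg; /* psubusb / por */
}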
360 "movq (%1, %%"FF_REG_a
"), %%mm0\n\t"
361 "movq 1(%1, %%"FF_REG_a
"), %%mm2\n\t"
362 "movq %%mm0, %%mm1 \n\t"
363 "movq %%mm2, %%mm3 \n\t"
364 "punpcklbw %%mm7, %%mm0 \n\t"
365 "punpckhbw %%mm7, %%mm1 \n\t"
366 "punpcklbw %%mm7, %%mm2 \n\t"
367 "punpckhbw %%mm7, %%mm3 \n\t"
368 "paddw %%mm2, %%mm0 \n\t"
369 "paddw %%mm3, %%mm1 \n\t"
372 "movq (%2, %%"FF_REG_a
"), %%mm2\n\t"
373 "movq 1(%2, %%"FF_REG_a
"), %%mm4\n\t"
374 "movq %%mm2, %%mm3 \n\t"
375 "movq %%mm4, %%mm5 \n\t"
376 "punpcklbw %%mm7, %%mm2 \n\t"
377 "punpckhbw %%mm7, %%mm3 \n\t"
378 "punpcklbw %%mm7, %%mm4 \n\t"
379 "punpckhbw %%mm7, %%mm5 \n\t"
380 "paddw %%mm4, %%mm2 \n\t"
381 "paddw %%mm5, %%mm3 \n\t"
382 "movq %5, %%mm5 \n\t"
383 "paddw %%mm2, %%mm0 \n\t"
384 "paddw %%mm3, %%mm1 \n\t"
385 "paddw %%mm5, %%mm0 \n\t"
386 "paddw %%mm5, %%mm1 \n\t"
387 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t"
388 "movq (%3, %%"FF_REG_a
"), %%mm5 \n\t"
389 "psrlw $2, %%mm0 \n\t"
390 "psrlw $2, %%mm1 \n\t"
391 "packuswb %%mm1, %%mm0 \n\t"
392 "psubusb %%mm0, %%mm4 \n\t"
393 "psubusb %%mm5, %%mm0 \n\t"
394 "por %%mm4, %%mm0 \n\t"
395 "movq %%mm0, %%mm4 \n\t"
396 "punpcklbw %%mm7, %%mm0 \n\t"
397 "punpckhbw %%mm7, %%mm4 \n\t"
398 "paddw %%mm0, %%mm6 \n\t"
399 "paddw %%mm4, %%mm6 \n\t"
400 "movq %%mm2, %%mm0 \n\t"
401 "movq %%mm3, %%mm1 \n\t"
402 "add %4, %%"FF_REG_a
" \n\t"
406 "r" (
stride),
"m" (round_tab[2]));
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6  \n\t"
        "paddw %%mm0, %%mm6\n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6  \n\t"
        "paddw %%mm0, %%mm6\n\t"
        "movd %%mm6, %0    \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
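/* The matching y2 helper averages each pixel with the one below it; a
 * sketch consistent with the x2 wrapper above: */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}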
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, ptrdiff_t stride, int h)         \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        :);                                                             \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, ptrdiff_t stride, int h)        \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, ptrdiff_t stride, int h)    \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        :);                                                             \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}
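/* As in the full source, one instantiation generates the whole inline-MMX
 * SAD family registered below: */
PIX_SAD(mmx)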
    c->pix_abs[0][0] = sad16_mmx;
    c->pix_abs[0][1] = sad16_x2_mmx;
    c->pix_abs[0][2] = sad16_y2_mmx;
    c->pix_abs[0][3] = sad16_xy2_mmx;
    c->pix_abs[1][0] = sad8_mmx;
    c->pix_abs[1][1] = sad8_x2_mmx;
    c->pix_abs[1][2] = sad8_y2_mmx;
    c->pix_abs[1][3] = sad8_xy2_mmx;

    c->sad[0] = sad16_mmx;
    c->sad[1] = sad8_mmx;

    c->vsad[4] = vsad_intra16_mmx;

    c->vsad[0] = vsad16_mmx;

    c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
    c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

    c->nsse[0] = nsse16_mmx;
    c->nsse[1] = nsse8_mmx;

    c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
    c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;

#if HAVE_ALIGNED_STACK
    c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
    c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif

#if HAVE_ALIGNED_STACK
    c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
    c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
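/* In the full source these assignments sit inside ff_me_cmp_init_x86() and
 * each group is gated on runtime CPU detection (and, for the approximate
 * functions, on the absence of AV_CODEC_FLAG_BITEXACT). A minimal sketch of
 * that dispatch shape, under a hypothetical helper name -- the exact
 * grouping in the real function may differ: */
av_cold void me_cmp_init_x86_sketch(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags)) {
        c->sad[0] = sad16_mmx;                   /* inline-asm MMX versions */
        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT))
            c->vsad[0] = vsad16_mmx;             /* approximate, not bitexact */
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
    }
}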