    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
 
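    /* Deinterleaved layout: b[0..w2-1] holds the low band and b[w2..width-1]
       the high band, so w_l and w_r bound the loops over the two halves. */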
   46             "pcmpeqd   %%xmm7, %%xmm7         \n\t" 
   47             "pcmpeqd   %%xmm3, %%xmm3         \n\t" 
   48             "psllw         $1, %%xmm3         \n\t" 
   49             "paddw     %%xmm7, %%xmm3         \n\t" 
   50             "psllw        $13, %%xmm3         \n\t" 
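        /* Constant setup, as I read it: xmm7 is -1 in every word and xmm3
           is -3<<13 = -24576, so a later pmulhw by xmm3 yields
           (x*-24576)>>16, i.e. roughly -(3*x)/8 without a real multiply. */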
        for(; i<w_l-15; i+=16){
   54                 "movdqu   (%1), %%xmm1        \n\t" 
   55                 "movdqu 16(%1), %%xmm5        \n\t" 
   56                 "movdqu  2(%1), %%xmm2        \n\t" 
   57                 "movdqu 18(%1), %%xmm6        \n\t" 
   58                 "paddw  %%xmm1, %%xmm2        \n\t" 
   59                 "paddw  %%xmm5, %%xmm6        \n\t" 
   60                 "paddw  %%xmm7, %%xmm2        \n\t" 
   61                 "paddw  %%xmm7, %%xmm6        \n\t" 
   62                 "pmulhw %%xmm3, %%xmm2        \n\t" 
   63                 "pmulhw %%xmm3, %%xmm6        \n\t" 
   64                 "paddw    (%0), %%xmm2        \n\t" 
   65                 "paddw  16(%0), %%xmm6        \n\t" 
   66                 "movdqa %%xmm2, (%0)          \n\t" 
   67                 "movdqa %%xmm6, 16(%0)        \n\t" 
                :: "r"(&b[i]), "r"(&ref[i])
 
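        /* The loop above is the SSE2 form of the first lifting step on the
           low band, 16 coefficients per iteration; in scalar terms (the
           W_D* coefficients from snow.h, my reading):
               b[i] -= (3*(ref[i] + ref[i+1]) + 4) >> 3;
         */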
        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-15; i+=16){
   85                 "movdqu   (%1), %%xmm1        \n\t" 
   86                 "movdqu 16(%1), %%xmm5        \n\t" 
   87                 "movdqu  2(%1), %%xmm2        \n\t" 
   88                 "movdqu 18(%1), %%xmm6        \n\t" 
   89                 "paddw  %%xmm1, %%xmm2        \n\t" 
   90                 "paddw  %%xmm5, %%xmm6        \n\t" 
   91                 "movdqa   (%0), %%xmm0        \n\t" 
   92                 "movdqa 16(%0), %%xmm4        \n\t" 
   93                 "psubw  %%xmm2, %%xmm0        \n\t" 
   94                 "psubw  %%xmm6, %%xmm4        \n\t" 
   95                 "movdqa %%xmm0, (%0)          \n\t" 
   96                 "movdqa %%xmm4, 16(%0)        \n\t" 
                :: "r"(&dst[i]), "r"(&b[i])
 
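        /* Same update as the scalar lead-in above, dst[i] -= b[i] + b[i+1],
           16 coefficients at a time; the preceding loop advances i until
           dst is 32-byte aligned so the movdqa stores are legal. */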
  110             "psllw         $15, %%xmm7        \n\t" 
  111             "pcmpeqw    %%xmm6, %%xmm6        \n\t" 
  112             "psrlw         $13, %%xmm6        \n\t" 
  113             "paddw      %%xmm7, %%xmm6        \n\t" 
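        /* xmm7 (still all ones from the first block, as I read it) becomes
           0x8000 per word and xmm6 becomes 0x8007: biasing both pavgw inputs
           by 0x8000 turns the unsigned average into a signed (x+y+1)>>1,
           and the extra 7 supplies the rounding constant. */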
        for(; i<w_l-15; i+=16){
  117                 "movdqu   (%1), %%xmm0        \n\t" 
  118                 "movdqu 16(%1), %%xmm4        \n\t" 
  119                 "movdqu  2(%1), %%xmm1        \n\t" 
  120                 "movdqu 18(%1), %%xmm5        \n\t"  
  121                 "paddw  %%xmm6, %%xmm0        \n\t" 
  122                 "paddw  %%xmm6, %%xmm4        \n\t" 
  123                 "paddw  %%xmm7, %%xmm1        \n\t" 
  124                 "paddw  %%xmm7, %%xmm5        \n\t" 
  125                 "pavgw  %%xmm1, %%xmm0        \n\t" 
  126                 "pavgw  %%xmm5, %%xmm4        \n\t" 
  127                 "psubw  %%xmm7, %%xmm0        \n\t" 
  128                 "psubw  %%xmm7, %%xmm4        \n\t" 
  129                 "psraw      $1, %%xmm0        \n\t" 
  130                 "psraw      $1, %%xmm4        \n\t" 
  131                 "movdqa   (%0), %%xmm1        \n\t" 
  132                 "movdqa 16(%0), %%xmm5        \n\t" 
  133                 "paddw  %%xmm1, %%xmm0        \n\t" 
  134                 "paddw  %%xmm5, %%xmm4        \n\t" 
  135                 "psraw      $2, %%xmm0        \n\t" 
  136                 "psraw      $2, %%xmm4        \n\t" 
  137                 "paddw  %%xmm1, %%xmm0        \n\t" 
  138                 "paddw  %%xmm5, %%xmm4        \n\t" 
  139                 "movdqa %%xmm0, (%0)          \n\t" 
  140                 "movdqa %%xmm4, 16(%0)        \n\t" 
                :: "r"(&b[i]), "r"(&ref[i])
 
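        /* In scalar terms this lifting step computes roughly
               b[i] += (4*b[i] + ref[i] + ref[i+1] + 8) >> 4;
           via the biased average plus two arithmetic shifts (my reading). */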
        for(; i<w_r-7; i+=8){
  158                 "movdqu  2(%1), %%xmm2        \n\t" 
  159                 "movdqu 18(%1), %%xmm6        \n\t" 
  160                 "paddw    (%1), %%xmm2        \n\t" 
  161                 "paddw  16(%1), %%xmm6        \n\t" 
  162                 "movdqu   (%0), %%xmm0        \n\t" 
  163                 "movdqu 16(%0), %%xmm4        \n\t" 
  164                 "paddw  %%xmm2, %%xmm0        \n\t" 
  165                 "paddw  %%xmm6, %%xmm4        \n\t" 
  166                 "psraw      $1, %%xmm2        \n\t" 
  167                 "psraw      $1, %%xmm6        \n\t" 
  168                 "paddw  %%xmm0, %%xmm2        \n\t" 
  169                 "paddw  %%xmm4, %%xmm6        \n\t" 
  170                 "movdqa %%xmm2, (%2)          \n\t" 
  171                 "movdqa %%xmm6, 16(%2)        \n\t" 
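        /* Final lifting step into temp: with s = b[i] + b[i+1] loaded via
           %1, the value stored through %2 is x + s + (s>>1), i.e. x plus
           3*s/2 with the multiply by 3 decomposed into an add and a shift
           (my reading). */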
        for (; (i & 0x3E) != 0x3E; i-=2){
        for (i-=62; i>=0; i-=64){
  188                 "movdqa      (%1), %%xmm0       \n\t" 
  189                 "movdqa    16(%1), %%xmm2       \n\t" 
  190                 "movdqa    32(%1), %%xmm4       \n\t" 
  191                 "movdqa    48(%1), %%xmm6       \n\t" 
  192                 "movdqa      (%1), %%xmm1       \n\t" 
  193                 "movdqa    16(%1), %%xmm3       \n\t" 
  194                 "movdqa    32(%1), %%xmm5       \n\t" 
  195                 "movdqa    48(%1), %%xmm7       \n\t" 
  196                 "punpcklwd   (%2), %%xmm0       \n\t" 
  197                 "punpcklwd 16(%2), %%xmm2       \n\t" 
  198                 "punpcklwd 32(%2), %%xmm4       \n\t" 
  199                 "punpcklwd 48(%2), %%xmm6       \n\t" 
  200                 "movdqa    %%xmm0, (%0)         \n\t" 
  201                 "movdqa    %%xmm2, 32(%0)       \n\t" 
  202                 "movdqa    %%xmm4, 64(%0)       \n\t" 
  203                 "movdqa    %%xmm6, 96(%0)       \n\t" 
  204                 "punpckhwd   (%2), %%xmm1       \n\t" 
  205                 "punpckhwd 16(%2), %%xmm3       \n\t" 
  206                 "punpckhwd 32(%2), %%xmm5       \n\t" 
  207                 "punpckhwd 48(%2), %%xmm7       \n\t" 
  208                 "movdqa    %%xmm1, 16(%0)       \n\t" 
  209                 "movdqa    %%xmm3, 48(%0)       \n\t" 
  210                 "movdqa    %%xmm5, 80(%0)       \n\t" 
  211                 "movdqa    %%xmm7, 112(%0)      \n\t" 
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
 
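    /* Re-interleave: punpcklwd/punpckhwd merge the low band b[i>>1] with
       the high band temp[i>>1] back into b[i] as alternating coefficients,
       64 output words per iteration, walking backwards so the in-place
       expansion never overwrites unread input. */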
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
 
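    /* The MMX routine below mirrors the SSE2 lifts above one for one, just
       at half width: 8 coefficients per mm-register pair and iteration. */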
  231             "pcmpeqw    %%mm7, %%mm7         \n\t" 
  232             "pcmpeqw    %%mm3, %%mm3         \n\t" 
  233             "psllw         $1, %%mm3         \n\t" 
  234             "paddw      %%mm7, %%mm3         \n\t" 
  235             "psllw        $13, %%mm3         \n\t" 
        for(; i<w_l-7; i+=8){
  239                 "movq     (%1), %%mm2        \n\t" 
  240                 "movq    8(%1), %%mm6        \n\t" 
  241                 "paddw   2(%1), %%mm2        \n\t" 
  242                 "paddw  10(%1), %%mm6        \n\t" 
  243                 "paddw   %%mm7, %%mm2        \n\t" 
  244                 "paddw   %%mm7, %%mm6        \n\t" 
  245                 "pmulhw  %%mm3, %%mm2        \n\t" 
  246                 "pmulhw  %%mm3, %%mm6        \n\t" 
  247                 "paddw    (%0), %%mm2        \n\t" 
  248                 "paddw   8(%0), %%mm6        \n\t" 
  249                 "movq    %%mm2, (%0)         \n\t" 
  250                 "movq    %%mm6, 8(%0)        \n\t" 
                :: "r"(&b[i]), "r"(&ref[i])
 
        for(; i<w_r-7; i+=8){
  264                 "movq     (%1), %%mm2        \n\t" 
  265                 "movq    8(%1), %%mm6        \n\t" 
  266                 "paddw   2(%1), %%mm2        \n\t" 
  267                 "paddw  10(%1), %%mm6        \n\t" 
  268                 "movq     (%0), %%mm0        \n\t" 
  269                 "movq    8(%0), %%mm4        \n\t" 
  270                 "psubw   %%mm2, %%mm0        \n\t" 
  271                 "psubw   %%mm6, %%mm4        \n\t" 
  272                 "movq    %%mm0, (%0)         \n\t" 
  273                 "movq    %%mm4, 8(%0)        \n\t" 
                :: "r"(&dst[i]), "r"(&b[i])
 
  287             "psllw         $15, %%mm7        \n\t" 
  288             "pcmpeqw     %%mm6, %%mm6        \n\t" 
  289             "psrlw         $13, %%mm6        \n\t" 
  290             "paddw       %%mm7, %%mm6        \n\t" 
        for(; i<w_l-7; i+=8){
  294                 "movq     (%1), %%mm0        \n\t" 
  295                 "movq    8(%1), %%mm4        \n\t" 
  296                 "movq    2(%1), %%mm1        \n\t" 
  297                 "movq   10(%1), %%mm5        \n\t" 
  298                 "paddw   %%mm6, %%mm0        \n\t" 
  299                 "paddw   %%mm6, %%mm4        \n\t" 
  300                 "paddw   %%mm7, %%mm1        \n\t" 
  301                 "paddw   %%mm7, %%mm5        \n\t" 
  302                 "pavgw   %%mm1, %%mm0        \n\t" 
  303                 "pavgw   %%mm5, %%mm4        \n\t" 
  304                 "psubw   %%mm7, %%mm0        \n\t" 
  305                 "psubw   %%mm7, %%mm4        \n\t" 
  306                 "psraw      $1, %%mm0        \n\t" 
  307                 "psraw      $1, %%mm4        \n\t" 
  308                 "movq     (%0), %%mm1        \n\t" 
  309                 "movq    8(%0), %%mm5        \n\t" 
  310                 "paddw   %%mm1, %%mm0        \n\t" 
  311                 "paddw   %%mm5, %%mm4        \n\t" 
  312                 "psraw      $2, %%mm0        \n\t" 
  313                 "psraw      $2, %%mm4        \n\t" 
  314                 "paddw   %%mm1, %%mm0        \n\t" 
  315                 "paddw   %%mm5, %%mm4        \n\t" 
  316                 "movq    %%mm0, (%0)         \n\t" 
  317                 "movq    %%mm4, 8(%0)        \n\t" 
                :: "r"(&b[i]), "r"(&ref[i])
 
        for(; i<w_r-7; i+=8){
  331                 "movq    2(%1), %%mm2        \n\t" 
  332                 "movq   10(%1), %%mm6        \n\t" 
  333                 "paddw    (%1), %%mm2        \n\t" 
  334                 "paddw   8(%1), %%mm6        \n\t" 
  335                 "movq     (%0), %%mm0        \n\t" 
  336                 "movq    8(%0), %%mm4        \n\t" 
  337                 "paddw   %%mm2, %%mm0        \n\t" 
  338                 "paddw   %%mm6, %%mm4        \n\t" 
  339                 "psraw      $1, %%mm2        \n\t" 
  340                 "psraw      $1, %%mm6        \n\t" 
  341                 "paddw   %%mm0, %%mm2        \n\t" 
  342                 "paddw   %%mm4, %%mm6        \n\t" 
  343                 "movq    %%mm2, (%2)         \n\t" 
  344                 "movq    %%mm6, 8(%2)        \n\t" 
        for (; (i & 0x1E) != 0x1E; i-=2){
        for (i-=30; i>=0; i-=32){
  361                 "movq        (%1), %%mm0       \n\t" 
  362                 "movq       8(%1), %%mm2       \n\t" 
  363                 "movq      16(%1), %%mm4       \n\t" 
  364                 "movq      24(%1), %%mm6       \n\t" 
  365                 "movq        (%1), %%mm1       \n\t" 
  366                 "movq       8(%1), %%mm3       \n\t" 
  367                 "movq      16(%1), %%mm5       \n\t" 
  368                 "movq      24(%1), %%mm7       \n\t" 
  369                 "punpcklwd   (%2), %%mm0       \n\t" 
  370                 "punpcklwd  8(%2), %%mm2       \n\t" 
  371                 "punpcklwd 16(%2), %%mm4       \n\t" 
  372                 "punpcklwd 24(%2), %%mm6       \n\t" 
  373                 "movq       %%mm0, (%0)        \n\t" 
  374                 "movq       %%mm2, 16(%0)      \n\t" 
  375                 "movq       %%mm4, 32(%0)      \n\t" 
  376                 "movq       %%mm6, 48(%0)      \n\t" 
  377                 "punpckhwd   (%2), %%mm1       \n\t" 
  378                 "punpckhwd  8(%2), %%mm3       \n\t" 
  379                 "punpckhwd 16(%2), %%mm5       \n\t" 
  380                 "punpckhwd 24(%2), %%mm7       \n\t" 
  381                 "movq       %%mm1, 8(%0)       \n\t" 
  382                 "movq       %%mm3, 24(%0)      \n\t" 
  383                 "movq       %%mm5, 40(%0)      \n\t" 
  384                 "movq       %%mm7, 56(%0)      \n\t" 
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
 
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"FF_REG_d"), %%"t0"      \n\t"\
        ""op" 16("r",%%"FF_REG_d"), %%"t1"    \n\t"\
        ""op" 32("r",%%"FF_REG_d"), %%"t2"    \n\t"\
        ""op" 48("r",%%"FF_REG_d"), %%"t3"    \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubw %%"s0", %%"t0" \n\t"\
        "psubw %%"s1", %%"t1" \n\t"\
        "psubw %%"s2", %%"t2" \n\t"\
        "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", ("w",%%"FF_REG_d")    \n\t"\
        "movdqa %%"s1", 16("w",%%"FF_REG_d")  \n\t"\
        "movdqa %%"s2", 32("w",%%"FF_REG_d")  \n\t"\
        "movdqa %%"s3", 48("w",%%"FF_REG_d")  \n\t"

#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
        "psraw $"n", %%"t0" \n\t"\
        "psraw $"n", %%"t1" \n\t"\
        "psraw $"n", %%"t2" \n\t"\
        "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddw %%"s0", %%"t0" \n\t"\
        "paddw %%"s1", %%"t1" \n\t"\
        "paddw %%"s2", %%"t2" \n\t"\
        "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
        "pmulhw %%"s0", %%"t0" \n\t"\
        "pmulhw %%"s1", %%"t1" \n\t"\
        "pmulhw %%"s2", %%"t2" \n\t"\
        "pmulhw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movdqa %%"s0", %%"t0" \n\t"\
        "movdqa %%"s1", %%"t1" \n\t"\
        "movdqa %%"s2", %%"t2" \n\t"\
        "movdqa %%"s3", %%"t3" \n\t"
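/* Helper macros for the vertical compose: each expands to four SSE2
   instructions covering 32 IDWTELEMs of one row, addressed relative to
   FF_REG_d, which holds the descending byte offset into the rows. */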
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
  461         "pcmpeqw    %%xmm0, %%xmm0                   \n\t"
 
  462         "pcmpeqw    %%xmm2, %%xmm2                   \n\t"
 
  463         "paddw      %%xmm2, %%xmm2                   \n\t"
 
  464         "paddw      %%xmm0, %%xmm2                   \n\t"
 
  465         "psllw         $13, %%xmm2                   \n\t"
 
  466         snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
 
  467         snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
 
  468         snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
 
  469         snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
 
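        /* First lifting step (my reading of the row operands as %1=b0 ...
           %6=b5): b4[i] -= (3*(b3[i]+b5[i])+4)>>3, using the same
           pmulhw-by-(-3<<13) trick as the horizontal code. */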
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
 
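        /* Second lifting step: b3[i] -= b2[i] + b4[i], reusing the freshly
           updated b4 still held in xmm1/xmm3/xmm5/xmm7. */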
  475         "pcmpeqw %%xmm7, %%xmm7                      \n\t"
 
  476         "pcmpeqw %%xmm5, %%xmm5                      \n\t"
 
  477         "psllw $15, %%xmm7                           \n\t"
 
  478         "psrlw $13, %%xmm5                           \n\t"
 
  479         "paddw %%xmm7, %%xmm5                        \n\t"
 
  480         snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
 
  481         "movq   (%2,%%"FF_REG_d"), %%xmm1            \n\t"
 
  482         "movq  8(%2,%%"FF_REG_d"), %%xmm3            \n\t"
 
  483         "paddw %%xmm7, %%xmm1                        \n\t"
 
  484         "paddw %%xmm7, %%xmm3                        \n\t"
 
  485         "pavgw %%xmm1, %%xmm0                        \n\t"
 
  486         "pavgw %%xmm3, %%xmm2                        \n\t"
 
  487         "movq 16(%2,%%"FF_REG_d"), %%xmm1            \n\t"
 
  488         "movq 24(%2,%%"FF_REG_d"), %%xmm3            \n\t"
 
  489         "paddw %%xmm7, %%xmm1                        \n\t"
 
  490         "paddw %%xmm7, %%xmm3                        \n\t"
 
  491         "pavgw %%xmm1, %%xmm4                        \n\t"
 
  492         "pavgw %%xmm3, %%xmm6                        \n\t"
 
        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
 
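        /* Third lifting step via the 0x8000-biased pavgw, roughly
           b2[i] += (4*b2[i] + b1[i] + b3[i] + 8) >> 4 in scalar terms
           (my reading). */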
        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
 
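        /* Fourth lifting step: b1[i] += (3*(b0[i]+b2[i]))>>1, the multiply
           by 3/2 computed exactly as x + (x>>1). */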
  508         "sub $64, %%"FF_REG_d"                       \n\t"
 
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"FF_REG_d"), %%"t0"   \n\t"\
        ""op" 8("r",%%"FF_REG_d"), %%"t1"  \n\t"\
        ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
        ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", ("w",%%"FF_REG_d")   \n\t"\
        "movq %%"s1", 8("w",%%"FF_REG_d")  \n\t"\
        "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
        "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"
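/* The MMX vertical compose below repeats the same four lifting steps with
   mm-register quads, 16 IDWTELEMs per iteration (FF_REG_d steps down by
   32 bytes). */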
        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw    %%mm0, %%mm0                     \n\t"
        "pcmpeqw    %%mm2, %%mm2                     \n\t"
        "paddw      %%mm2, %%mm2                     \n\t"
        "paddw      %%mm0, %%mm2                     \n\t"
        "psllw        $13, %%mm2                     \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7                        \n\t"
        "pcmpeqw %%mm5, %%mm5                        \n\t"
        "psllw $15, %%mm7                            \n\t"
        "psrlw $13, %%mm5                            \n\t"
        "paddw %%mm7, %%mm5                          \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq   (%2,%%"FF_REG_d"), %%mm1             \n\t"
        "movq  8(%2,%%"FF_REG_d"), %%mm3             \n\t"
        "paddw %%mm7, %%mm1                          \n\t"
        "paddw %%mm7, %%mm3                          \n\t"
        "pavgw %%mm1, %%mm0                          \n\t"
        "pavgw %%mm3, %%mm2                          \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%mm1             \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%mm3             \n\t"
        "paddw %%mm7, %%mm1                          \n\t"
        "paddw %%mm7, %%mm3                          \n\t"
        "pavgw %%mm1, %%mm4                          \n\t"
        "pavgw %%mm3, %%mm6                          \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
        "sub $32, %%"FF_REG_d"                       \n\t"
 
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
             "mov  %7, %%"FF_REG_c"          \n\t"\
             "mov  %4, %%"FF_REG_S"          \n\t"\
             "pxor %%xmm7, %%xmm7            \n\t"\
             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
             "psllw $15, %%xmm3              \n\t"\
             "psrlw $12, %%xmm3              \n\t"\
             "mov %1, %%"FF_REG_D"           \n\t"\
             "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
             "add %3, %%"FF_REG_D"           \n\t"
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
             "movq (%%"FF_REG_d"), %%"out_reg1"                           \n\t"\
             "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2"             \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"FF_REG_S"), %%xmm0    \n\t"\
             "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
             "movq (%%"FF_REG_d"), %%"out_reg1"                           \n\t"\
             "movq 8(%%"FF_REG_d"), %%"out_reg2"                          \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"FF_REG_S"), %%xmm0   \n\t"\
             "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"
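/* The start_8/start_16 macros zero-extend 8 block pixels and the matching
   8 OBMC window weights (punpcklbw against the zeroed xmm7) and multiply
   them with pmullw; the accum variants sum the contributions of the four
   overlapping blocks with unsigned saturation (paddusw). FF_REG_a holds the
   array of block pointers, FF_REG_S walks the OBMC table. */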
#define snow_inner_add_yblock_sse2_end_common1\
             "add $32, %%"FF_REG_S"                            \n\t"\
             "add %%"FF_REG_c", %0                             \n\t"\
             "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
             "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
             "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
             "add %%"FF_REG_c", (%%"FF_REG_a")                 \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
             XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
             "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
             "sal $1, %%"FF_REG_c"                \n\t"\
             "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "sar $1, %%"FF_REG_c"           \n\t"\
             snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
             "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             snow_inner_add_yblock_sse2_end_common2
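/* 8-wide OBMC, even block height: each pass handles two source rows (the
   second addressed at FF_REG_d + FF_REG_c), widens the 16-bit weighted sums
   to 32 bits, adds the destination line plus the rounding bias in xmm3,
   shifts right by 8 and packs to clipped bytes (my reading). */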
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
 
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")
 
  700              "mov %0, %%"FF_REG_d"           \n\t"
 
  701              "movdqa (%%"FF_REG_D"), %%xmm0  \n\t"
 
  702              "movdqa %%xmm1, %%xmm2          \n\t"
 
  704              "punpckhwd %%xmm7, %%xmm1       \n\t"
 
  705              "punpcklwd %%xmm7, %%xmm2       \n\t"
 
  706              "paddd %%xmm2, %%xmm0           \n\t"
 
  707              "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
 
  708              "paddd %%xmm1, %%xmm2           \n\t"
 
  709              "paddd %%xmm3, %%xmm0           \n\t"
 
  710              "paddd %%xmm3, %%xmm2           \n\t"
 
  712              "mov %1, %%"FF_REG_D"           \n\t"
 
  713              "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
 
  714              "add %3, %%"FF_REG_D"           \n\t"
 
  716              "movdqa (%%"FF_REG_D"), %%xmm4  \n\t"
 
  717              "movdqa %%xmm5, %%xmm6          \n\t"
 
  718              "punpckhwd %%xmm7, %%xmm5       \n\t"
 
  719              "punpcklwd %%xmm7, %%xmm6       \n\t"
 
  720              "paddd %%xmm6, %%xmm4           \n\t"
 
  721              "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
 
  722              "paddd %%xmm5, %%xmm6           \n\t"
 
  723              "paddd %%xmm3, %%xmm4           \n\t"
 
  724              "paddd %%xmm3, %%xmm6           \n\t"
 
  726              "psrad $8, %%xmm0               \n\t" 
 
  727              "psrad $8, %%xmm2               \n\t" 
 
  728              "packssdw %%xmm2, %%xmm0        \n\t"
 
  729              "packuswb %%xmm7, %%xmm0        \n\t"
 
  730              "movq %%xmm0, (%%"FF_REG_d")    \n\t"
 
  732              "psrad $8, %%xmm4               \n\t" 
 
  733              "psrad $8, %%xmm6               \n\t" 
 
  734              "packssdw %%xmm6, %%xmm4        \n\t"
 
  735              "packuswb %%xmm7, %%xmm4        \n\t"
 
  736              "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
 
snow_inner_add_yblock_sse2_end_8
}
 
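/* 16-wide OBMC: the weighted sums stay 16-bit; they are pre-scaled down by
   4 bits (psrlw), added to the destination line, rounded via xmm3 and
   shifted by 4 more, then packed to clipped bytes, one full row per pass. */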
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
 
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")
 
  748              "mov %0, %%"FF_REG_d"           \n\t"
 
  749              "psrlw $4, %%xmm1               \n\t"
 
  750              "psrlw $4, %%xmm5               \n\t"
 
  751              "paddw   (%%"FF_REG_D"), %%xmm1 \n\t"
 
  752              "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
 
  753              "paddw %%xmm3, %%xmm1           \n\t"
 
  754              "paddw %%xmm3, %%xmm5           \n\t"
 
  755              "psraw $4, %%xmm1               \n\t" 
 
  756              "psraw $4, %%xmm5               \n\t" 
 
  757              "packuswb %%xmm5, %%xmm1        \n\t"
 
  759              "movdqu %%xmm1, (%%"FF_REG_d")  \n\t"
 
  761 snow_inner_add_yblock_sse2_end_16
 
#define snow_inner_add_yblock_mmx_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
             "mov  %7, %%"FF_REG_c"          \n\t"\
             "mov  %4, %%"FF_REG_S"          \n\t"\
             "pxor %%mm7, %%mm7              \n\t"\
             "pcmpeqd %%mm3, %%mm3           \n\t"\
             "psllw $15, %%mm3               \n\t"\
             "psrlw $12, %%mm3               \n\t"\
             "mov %1, %%"FF_REG_D"           \n\t"\
             "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
             "add %3, %%"FF_REG_D"           \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
             "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
             "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1"                 \n\t"\
             "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2"               \n\t"\
             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
             "movd "s_offset"(%%"FF_REG_S"), %%mm0   \n\t"\
             "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
             "punpcklbw %%mm7, %%mm0       \n\t"\
             "punpcklbw %%mm7, %%mm4       \n\t"\
             "pmullw %%mm0, %%"out_reg1"   \n\t"\
             "pmullw %%mm4, %%"out_reg2"   \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
             "paddusw %%mm2, %%mm1         \n\t"\
             "paddusw %%mm6, %%mm5         \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
             "mov %0, %%"FF_REG_d"           \n\t"\
             "psrlw $4, %%mm1                \n\t"\
             "psrlw $4, %%mm5                \n\t"\
             "paddw "read_offset"(%%"FF_REG_D"), %%mm1   \n\t"\
             "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
             "paddw %%mm3, %%mm1             \n\t"\
             "paddw %%mm3, %%mm5             \n\t"\
             "psraw $4, %%mm1                \n\t"\
             "psraw $4, %%mm5                \n\t"\
             "packuswb %%mm5, %%mm1          \n\t"\
             "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
             "add $"s_step", %%"FF_REG_S"                      \n\t"\
             "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
             "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
             "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
             "add %%"FF_REG_c", (%%"FF_REG_a")                 \n\t"\
             "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1             \n\t"\
             "add %%"FF_REG_c", %0                             \n\t"\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
             "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
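/* The MMX variants below mirror the SSE2 routines using movd (4-pixel)
   loads; the 16-wide version therefore runs the start/accum/mix sequence
   twice per row, once per 8-pixel half, before stepping the row pointers. */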
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
 
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}
 
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
 
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("16", "8")
snow_inner_add_yblock_mmx_end("32")
}
 
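/* Runtime dispatch: use the specialised kernels when the block geometry
   matches what they were written for, otherwise fall back to the generic
   C ff_snow_inner_add_yblock(). */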
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
 
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
 
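            /* Context hookups: behind the corresponding CPU-flag checks,
               these replace the C snow DSP entry points with the SSE2 and
               MMX implementations above. */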
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;

            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;