/* X8(x): expands to the argument written eight times, separated by commas. */
54 #define X8(x) x, x, x, x, x, x, x, x 
   63     0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
 
   64     0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
 
   65     0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
 
   66     0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
 
   70     0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
 
   71     0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
 
   72     0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
 
   73     0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
 
   77     0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
 
   78     0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
 
   79     0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
 
   80     0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
 
   84     0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
 
   85     0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
 
   86     0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
 
   87     0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
 
   91     65536, 65536, 65536, 65536,
 
   92      3597,  3597,  3597,  3597,
 
   93      2260,  2260,  2260,  2260,
 
   94      1203,  1203,  1203,  1203,
 
  /*
   * Operand strings for the XMM registers that hold the odd rows:
   * row 1 -> xmm6, row 3 -> xmm4, row 5 -> xmm5, row 7 -> xmm7.
   * The doubled '%' is the extended-asm escape for a literal '%'
   * (single '%' introduces operands such as %0 elsewhere in this file).
   */
  100 #define ROW1 "%%xmm6" 
  101 #define ROW3 "%%xmm4" 
  102 #define ROW5 "%%xmm5" 
  103 #define ROW7 "%%xmm7" 
  /* CLEAR_ODD(r): emits `pxor r,r`, i.e. sets register r to all zero bits. */
  105 #define CLEAR_ODD(r) "pxor  "r","r" \n\t" 
  /*
   * PUT_ODD(dst): emits `pshufhw $0x1B, %%xmm2, dst`. dst receives xmm2 with
   * its low 64 bits copied unchanged and its four high words in reversed
   * order (0x1B selects words 3,2,1,0). xmm2 itself is not modified unless
   * dst is xmm2.
   */
  106 #define PUT_ODD(dst) "pshufhw  $0x1B, %%xmm2, "dst"   \n\t" 
  /*
   * Register assignment that keeps the even rows in registers
   * (row 0 -> xmm8, row 2 -> xmm9, row 4 -> xmm10, row 6 -> xmm11).
   * xmm8 and above can only be encoded in 64-bit mode.
   * NOTE(review): presumably selected by an x86-64 build conditional - confirm.
   */
  110 # define ROW0 "%%xmm8" 
  112 # define ROW2 "%%xmm9" 
  114 # define ROW4 "%%xmm10" 
  116 # define ROW6 "%%xmm11" 
  /* Even rows live in registers here, so they are cleared and written
   * exactly like the odd rows. */
  118 # define CLEAR_EVEN(r) CLEAR_ODD(r) 
  119 # define PUT_EVEN(dst) PUT_ODD(dst) 
  /* Scratch register used by the pass macros to save a value around a
   * psubsw/paddsw pair. */
  120 # define XMMS "%%xmm12" 
  /* Every line built as MOV_32_ONLY ... starts with '#', which the GNU
   * assembler for x86 treats as a comment: those moves are not emitted. */
  121 # define MOV_32_ONLY "#" 
  /* Registers the pass macros read as TAN3 / TAN1 (distinct from XMMS here). */
  123 # define TAN3 "%%xmm13" 
  124 # define TAN1 "%%xmm14" 
  /*
   * Register assignment that keeps the even rows in memory.
   * ROWn is the memory operand n*16(%0) (byte offset n*16 from asm operand 0)
   * and REGn is the register a pass macro loads that row into.
   * Registers are shared: REG0 and REG2 are both xmm4, REG4 and REG6 are both
   * xmm6 (the same registers ROW3 and ROW1 name), so the order of loads and
   * uses inside the pass macros matters.
   */
  129 # define REG0 "%%xmm4" 
  130 # define ROW2 "2*16(%0)" 
  131 # define REG2 "%%xmm4" 
  132 # define ROW4 "4*16(%0)" 
  133 # define REG4 "%%xmm6" 
  134 # define ROW6 "6*16(%0)" 
  135 # define REG6 "%%xmm6" 
  /* Expands to nothing: no instruction is emitted to clear an even row. */
  136 # define CLEAR_EVEN(r) 
  /* PUT_EVEN(dst): reverses the four high words of xmm2 in place (same 0x1B
   * shuffle as PUT_ODD), then stores xmm2 to dst with an aligned movdqa. */
  137 # define PUT_EVEN(dst) \ 
  138     "pshufhw  $0x1B, %%xmm2, %%xmm2   \n\t" \ 
  139     "movdqa          %%xmm2, "dst"    \n\t" 
  /* Scratch register for the pass macros; note it is the same register as
   * TAN1 below. */
  140 # define XMMS "%%xmm2" 
  /* Lines built as MOV_32_ONLY ... are real movdqa instructions here. */
  141 # define MOV_32_ONLY "movdqa " 
  /* Register iLLM_PASS_SPARSE loads row 2 into. */
  142 # define SREG2 "%%xmm7" 
  143 # define TAN3 "%%xmm0" 
  /* TAN1 shares xmm2 with XMMS; the pass macros store TAN1 to (%0) before
   * using XMMS and load it back afterwards (their MOV_32_ONLY lines). */
  144 # define TAN1 "%%xmm2" 
  /*
   * ROUND(x): builds the opening of a `paddd x, ...` instruction, where x is
   * a string naming the source operand. The destination is not part of this
   * macro; iMTX_MULT appends ", %%xmm0" to whatever it is given as rounder.
   */
  148 #define ROUND(x) "paddd   "x 
  /*
   * JZ(reg, to) / JNZ(reg, to): each begins with `testl reg,reg`, which ANDs
   * the 32-bit register with itself and sets ZF exactly when it is zero.
   * NOTE(review): `to` is presumably the label operand of the jz / jnz that
   * completes each macro - confirm against the full definitions.
   */
  150 #define JZ(reg, to)                         \ 
  151     "testl     "reg","reg"            \n\t" \ 
  154 #define JNZ(reg, to)                        \ 
  155     "testl     "reg","reg"            \n\t" \ 
  /*
   * TEST_ONE_ROW(src, reg, clear): summarises one 16-byte row in a
   * general-purpose register.
   *   movq / por - OR the two 8-byte halves of the row (src and 8+src) in mm1
   *   paddusb    - unsigned-saturating byte add of mm0 to that OR
   *   pmovmskb   - copy the top bit of each of the 8 bytes of mm1 into reg
   * mm0 is not set here; it must be loaded before the macro is used.
   * Overwrites mm1 and reg.
   * NOTE(review): mm0 is loaded from the m127 symbol elsewhere in this file;
   * presumably every byte is 127, so reg ends up non-zero exactly when the
   * row contains a non-zero byte - confirm m127's value.
   * NOTE(review): `clear` is presumably an instruction string (call sites
   * pass CLEAR_ODD(...)) emitted as part of this macro - confirm.
   */
  158 #define TEST_ONE_ROW(src, reg, clear)       \ 
  160     "movq     "src", %%mm1            \n\t" \ 
  161     "por    8+"src", %%mm1            \n\t" \ 
  162     "paddusb  %%mm0, %%mm1            \n\t" \ 
  163     "pmovmskb %%mm1, "reg"            \n\t" 
  /*
   * TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): the same
   * summary as TEST_ONE_ROW, done for two 16-byte rows with the two
   * instruction streams interleaved.
   *   row1 -> mm1 -> reg1,  row2 -> mm2 -> reg2
   * For each row: OR its two 8-byte halves, add mm0 with unsigned byte
   * saturation, then collect the top bit of every byte with pmovmskb.
   * mm0 must already be loaded. Overwrites mm1, mm2, reg1 and reg2.
   * NOTE(review): clear1 / clear2 are presumably instruction strings
   * (call sites pass CLEAR_ODD(...) / CLEAR_EVEN(...)) emitted as part of
   * this macro - confirm.
   */
  165 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ 
  168     "movq     "row1", %%mm1           \n\t"                   \ 
  169     "por    8+"row1", %%mm1           \n\t"                   \ 
  170     "movq     "row2", %%mm2           \n\t"                   \ 
  171     "por    8+"row2", %%mm2           \n\t"                   \ 
  172     "paddusb   %%mm0, %%mm1           \n\t"                   \ 
  173     "paddusb   %%mm0, %%mm2           \n\t"                   \ 
  174     "pmovmskb  %%mm1, "reg1"          \n\t"                   \ 
  175     "pmovmskb  %%mm2, "reg2"          \n\t" 
  /*
   * iMTX_MULT(src, table, rounder, put)
   *
   * Multiply-accumulates one row of eight 16-bit values against a table.
   *   src     - memory operand string of the 16-byte input row (aligned load)
   *   table   - symbol string of the table; read at +0, +16, +32 and +48
   *   rounder - opening of the rounding instruction (e.g. ROUND(sym)); this
   *             macro appends ", %%xmm0" to form the complete line
   *   put     - NOTE(review): presumably the PUT_EVEN / PUT_ODD string that
   *             moves the result out of xmm2 - confirm
   *
   * Data flow (d0..d3 are the dwords of the loaded row, lowest first):
   *   xmm0 = {d0,d1,d0,d1}  (punpcklqdq)    pmaddwd with table+0
   *   xmm1 = {d1,d0,d1,d0}  (pshufd $0x11)  pmaddwd with table+16
   *   xmm2 = {d3,d2,d3,d2}  (pshufd $0xBB)  pmaddwd with table+32
   *   xmm3 = {d2,d3,d2,d3}  (punpckhqdq)    pmaddwd with table+48
   *   A = xmm0 + xmm1, then the rounder line is applied to A
   *   B = xmm2 + xmm3
   *   xmm2 = (A + B) >> 11,  xmm0 = (A - B) >> 11   (arithmetic shifts)
   *   packssdw: xmm2 = four words from A+B (low half) followed by four
   *             words from A-B (high half), saturated to signed 16 bits
   *
   * Overwrites xmm0-xmm3; the packed result is left in xmm2.
   */
  178 #define iMTX_MULT(src, table, rounder, put)            \ 
  179     "movdqa        "src", %%xmm3      \n\t"            \ 
  180     "movdqa       %%xmm3, %%xmm0      \n\t"            \ 
  181     "pshufd   $0x11, %%xmm3, %%xmm1   \n\t"  \ 
  182     "punpcklqdq   %%xmm0, %%xmm0      \n\t"  \ 
  183     "pmaddwd     "table", %%xmm0      \n\t"            \ 
  184     "pmaddwd  16+"table", %%xmm1      \n\t"            \ 
  185     "pshufd   $0xBB, %%xmm3, %%xmm2   \n\t"  \ 
  186     "punpckhqdq   %%xmm3, %%xmm3      \n\t"  \ 
  187     "pmaddwd  32+"table", %%xmm2      \n\t"            \ 
  188     "pmaddwd  48+"table", %%xmm3      \n\t"            \ 
  189     "paddd        %%xmm1, %%xmm0      \n\t"            \ 
  190     "paddd        %%xmm3, %%xmm2      \n\t"            \ 
  191     rounder",     %%xmm0              \n\t"            \ 
  192     "movdqa       %%xmm2, %%xmm3      \n\t"            \ 
  193     "paddd        %%xmm0, %%xmm2      \n\t"            \ 
  194     "psubd        %%xmm3, %%xmm0      \n\t"            \ 
  195     "psrad           $11, %%xmm2      \n\t"            \ 
  196     "psrad           $11, %%xmm0      \n\t"            \ 
  197     "packssdw     %%xmm0, %%xmm2      \n\t"            \ 
  202     "movdqa   "MANGLE(tan3)", "TAN3"  \n\t" \ 
  203     "movdqa   "MANGLE(tan1)", "TAN1"  \n\t" \ 
  /*
   * iLLM_PASS(dct)
   *
   * Combines all eight rows into eight output vectors and stores them.
   *   dct - string naming the base register of the destination (call sites
   *         pass "%0"); results go to n*16(dct) for n = 0..7 with movdqa.
   *
   * Inputs:
   *   xmm4, xmm5, xmm6, xmm7  - the registers named by ROW3, ROW5, ROW1, ROW7
   *   ROW0/ROW2/ROW4/ROW6     - even rows, used through REG0/REG2/REG4/REG6
   *   TAN3, TAN1              - must be loaded beforehand; both are copied
   *                             (to xmm1 / xmm3) and then overwritten
   *   MANGLE(sqrt2), MANGLE(tan2) - loaded by the macro itself
   *
   * Arithmetic is on packed signed 16-bit words: pmulhw keeps the high
   * 16 bits of each product, paddsw / psubsw saturate. Every result is
   * shifted right arithmetically by 6 before it is stored.
   *
   * The MOV_32_ONLY lines are real movdqa instructions when MOV_32_ONLY is
   * "movdqa " and assembler comments when it is "#":
   *   - ROWn -> REGn loads fetch the even rows from memory;
   *   - TAN1 is stored to (%0) before XMMS is first used and loaded back
   *     after its last use, which preserves TAN1 when XMMS and TAN1 name the
   *     same register. That slot is always (%0), independent of dct.
   *
   * Store order: rows 1, 2, 5, 6 first (from TAN3, xmm3, REG0, xmm5), then
   * rows 0, 3, 4, 7 (from xmm1, TAN1, REG4, xmm7).
   * Overwrites xmm0, xmm1, xmm3-xmm7, TAN1, TAN3, XMMS, REG0 and REG4.
   * NOTE(review): presumably the column pass of the 8x8 IDCT - confirm.
   */
  206 #define iLLM_PASS(dct)                      \ 
  207     "movdqa   "TAN3", %%xmm1          \n\t" \ 
  208     "movdqa   "TAN1", %%xmm3          \n\t" \ 
  209     "pmulhw   %%xmm4, "TAN3"          \n\t" \ 
  210     "pmulhw   %%xmm5, %%xmm1          \n\t" \ 
  211     "paddsw   %%xmm4, "TAN3"          \n\t" \ 
  212     "paddsw   %%xmm5, %%xmm1          \n\t" \ 
  213     "psubsw   %%xmm5, "TAN3"          \n\t" \ 
  214     "paddsw   %%xmm4, %%xmm1          \n\t" \ 
  215     "pmulhw   %%xmm7, %%xmm3          \n\t" \ 
  216     "pmulhw   %%xmm6, "TAN1"          \n\t" \ 
  217     "paddsw   %%xmm6, %%xmm3          \n\t" \ 
  218     "psubsw   %%xmm7, "TAN1"          \n\t" \ 
  219     "movdqa   %%xmm3, %%xmm7          \n\t" \ 
  220     "movdqa   "TAN1", %%xmm6          \n\t" \ 
  221     "psubsw   %%xmm1, %%xmm3          \n\t" \ 
  222     "psubsw   "TAN3", "TAN1"          \n\t" \ 
  223     "paddsw   %%xmm7, %%xmm1          \n\t" \ 
  224     "paddsw   %%xmm6, "TAN3"          \n\t" \ 
  225     "movdqa   %%xmm3, %%xmm6          \n\t" \ 
  226     "psubsw   "TAN3", %%xmm3          \n\t" \ 
  227     "paddsw   %%xmm6, "TAN3"          \n\t" \ 
  228     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \ 
  229     "pmulhw   %%xmm4, %%xmm3          \n\t" \ 
  230     "pmulhw   %%xmm4, "TAN3"          \n\t" \ 
  231     "paddsw   "TAN3", "TAN3"          \n\t" \ 
  232     "paddsw   %%xmm3, %%xmm3          \n\t" \ 
  233     "movdqa   "MANGLE(tan2)", %%xmm7  \n\t" \ 
  234     MOV_32_ONLY ROW2", "REG2"         \n\t" \ 
  235     MOV_32_ONLY ROW6", "REG6"         \n\t" \ 
  236     "movdqa   %%xmm7, %%xmm5          \n\t" \ 
  237     "pmulhw   "REG6", %%xmm7          \n\t" \ 
  238     "pmulhw   "REG2", %%xmm5          \n\t" \ 
  239     "paddsw   "REG2", %%xmm7          \n\t" \ 
  240     "psubsw   "REG6", %%xmm5          \n\t" \ 
  241     MOV_32_ONLY ROW0", "REG0"         \n\t" \ 
  242     MOV_32_ONLY ROW4", "REG4"         \n\t" \ 
  243     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \ 
  244     "movdqa   "REG0", "XMMS"          \n\t" \ 
  245     "psubsw   "REG4", "REG0"          \n\t" \ 
  246     "paddsw   "XMMS", "REG4"          \n\t" \ 
  247     "movdqa   "REG4", "XMMS"          \n\t" \ 
  248     "psubsw   %%xmm7, "REG4"          \n\t" \ 
  249     "paddsw   "XMMS", %%xmm7          \n\t" \ 
  250     "movdqa   "REG0", "XMMS"          \n\t" \ 
  251     "psubsw   %%xmm5, "REG0"          \n\t" \ 
  252     "paddsw   "XMMS", %%xmm5          \n\t" \ 
  253     "movdqa   %%xmm5, "XMMS"          \n\t" \ 
  254     "psubsw   "TAN3", %%xmm5          \n\t" \ 
  255     "paddsw   "XMMS", "TAN3"          \n\t" \ 
  256     "movdqa   "REG0", "XMMS"          \n\t" \ 
  257     "psubsw   %%xmm3, "REG0"          \n\t" \ 
  258     "paddsw   "XMMS", %%xmm3          \n\t" \ 
  259     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \ 
  260     "psraw        $6, %%xmm5          \n\t" \ 
  261     "psraw        $6, "REG0"          \n\t" \ 
  262     "psraw        $6, "TAN3"          \n\t" \ 
  263     "psraw        $6, %%xmm3          \n\t" \ 
  264     "movdqa   "TAN3", 1*16("dct")     \n\t" \ 
  265     "movdqa   %%xmm3, 2*16("dct")     \n\t" \ 
  266     "movdqa   "REG0", 5*16("dct")     \n\t" \ 
  267     "movdqa   %%xmm5, 6*16("dct")     \n\t" \ 
  268     "movdqa   %%xmm7, %%xmm0          \n\t" \ 
  269     "movdqa   "REG4", %%xmm4          \n\t" \ 
  270     "psubsw   %%xmm1, %%xmm7          \n\t" \ 
  271     "psubsw   "TAN1", "REG4"          \n\t" \ 
  272     "paddsw   %%xmm0, %%xmm1          \n\t" \ 
  273     "paddsw   %%xmm4, "TAN1"          \n\t" \ 
  274     "psraw        $6, %%xmm1          \n\t" \ 
  275     "psraw        $6, %%xmm7          \n\t" \ 
  276     "psraw        $6, "TAN1"          \n\t" \ 
  277     "psraw        $6, "REG4"          \n\t" \ 
  278     "movdqa   %%xmm1, ("dct")         \n\t" \ 
  279     "movdqa   "TAN1", 3*16("dct")     \n\t" \ 
  280     "movdqa   "REG4", 4*16("dct")     \n\t" \ 
  281     "movdqa   %%xmm7, 7*16("dct")     \n\t" 
  /*
   * iLLM_PASS_SPARSE(dct)
   *
   * Reduced form of iLLM_PASS that only reads rows 0-3: xmm4 and xmm6 (the
   * registers named by ROW3 and ROW1) plus ROW0 and ROW2. Rows 4-7 are never
   * read; the instruction sequence is what iLLM_PASS reduces to when those
   * four rows are zero.
   * NOTE(review): presumably reached only after the TEST_* checks found
   * rows 4-7 empty - confirm at the call site.
   *
   *   dct - string naming the base register of the destination; results go
   *         to n*16(dct) for n = 0..7 with movdqa.
   *
   * TAN3 and TAN1 must be loaded beforehand and are overwritten.
   * MANGLE(sqrt2) and MANGLE(tan2) are loaded by the macro. Row 2 is used
   * through SREG2 and row 0 through REG0. As in iLLM_PASS, the MOV_32_ONLY
   * lines are either real movdqa instructions or assembler comments, and
   * TAN1 is parked in (%0) (independent of dct) while XMMS is in use.
   *
   * Arithmetic is saturating signed 16-bit (pmulhw / paddsw / psubsw) and
   * every result is shifted right arithmetically by 6 before being stored.
   * Store order: rows 1, 2, 5, 6 (from TAN3, xmm3, REG0, xmm5), then rows
   * 0, 3, 4, 7 (from xmm1, TAN1, xmm6, SREG2).
   * Overwrites xmm0, xmm1, xmm3-xmm6, TAN1, TAN3, XMMS, REG0 and SREG2.
   */
  284 #define iLLM_PASS_SPARSE(dct)               \ 
  285     "pmulhw   %%xmm4, "TAN3"          \n\t" \ 
  286     "paddsw   %%xmm4, "TAN3"          \n\t" \ 
  287     "movdqa   %%xmm6, %%xmm3          \n\t" \ 
  288     "pmulhw   %%xmm6, "TAN1"          \n\t" \ 
  289     "movdqa   %%xmm4, %%xmm1          \n\t" \ 
  290     "psubsw   %%xmm1, %%xmm3          \n\t" \ 
  291     "paddsw   %%xmm6, %%xmm1          \n\t" \ 
  292     "movdqa   "TAN1", %%xmm6          \n\t" \ 
  293     "psubsw   "TAN3", "TAN1"          \n\t" \ 
  294     "paddsw   %%xmm6, "TAN3"          \n\t" \ 
  295     "movdqa   %%xmm3, %%xmm6          \n\t" \ 
  296     "psubsw   "TAN3", %%xmm3          \n\t" \ 
  297     "paddsw   %%xmm6, "TAN3"          \n\t" \ 
  298     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \ 
  299     "pmulhw   %%xmm4, %%xmm3          \n\t" \ 
  300     "pmulhw   %%xmm4, "TAN3"          \n\t" \ 
  301     "paddsw   "TAN3", "TAN3"          \n\t" \ 
  302     "paddsw   %%xmm3, %%xmm3          \n\t" \ 
  303     "movdqa   "MANGLE(tan2)", %%xmm5  \n\t" \ 
  304     MOV_32_ONLY ROW2", "SREG2"        \n\t" \ 
  305     "pmulhw   "SREG2", %%xmm5         \n\t" \ 
  306     MOV_32_ONLY ROW0", "REG0"         \n\t" \ 
  307     "movdqa   "REG0", %%xmm6          \n\t" \ 
  308     "psubsw   "SREG2", %%xmm6         \n\t" \ 
  309     "paddsw   "REG0", "SREG2"         \n\t" \ 
  310     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \ 
  311     "movdqa   "REG0", "XMMS"          \n\t" \ 
  312     "psubsw   %%xmm5, "REG0"          \n\t" \ 
  313     "paddsw   "XMMS", %%xmm5          \n\t" \ 
  314     "movdqa   %%xmm5, "XMMS"          \n\t" \ 
  315     "psubsw   "TAN3", %%xmm5          \n\t" \ 
  316     "paddsw   "XMMS", "TAN3"          \n\t" \ 
  317     "movdqa   "REG0", "XMMS"          \n\t" \ 
  318     "psubsw   %%xmm3, "REG0"          \n\t" \ 
  319     "paddsw   "XMMS", %%xmm3          \n\t" \ 
  320     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \ 
  321     "psraw        $6, %%xmm5          \n\t" \ 
  322     "psraw        $6, "REG0"          \n\t" \ 
  323     "psraw        $6, "TAN3"          \n\t" \ 
  324     "psraw        $6, %%xmm3          \n\t" \ 
  325     "movdqa   "TAN3", 1*16("dct")     \n\t" \ 
  326     "movdqa   %%xmm3, 2*16("dct")     \n\t" \ 
  327     "movdqa   "REG0", 5*16("dct")     \n\t" \ 
  328     "movdqa   %%xmm5, 6*16("dct")     \n\t" \ 
  329     "movdqa   "SREG2", %%xmm0         \n\t" \ 
  330     "movdqa   %%xmm6, %%xmm4          \n\t" \ 
  331     "psubsw   %%xmm1, "SREG2"         \n\t" \ 
  332     "psubsw   "TAN1", %%xmm6          \n\t" \ 
  333     "paddsw   %%xmm0, %%xmm1          \n\t" \ 
  334     "paddsw   %%xmm4, "TAN1"          \n\t" \ 
  335     "psraw        $6, %%xmm1          \n\t" \ 
  336     "psraw        $6, "SREG2"         \n\t" \ 
  337     "psraw        $6, "TAN1"          \n\t" \ 
  338     "psraw        $6, %%xmm6          \n\t" \ 
  339     "movdqa   %%xmm1, ("dct")         \n\t" \ 
  340     "movdqa   "TAN1", 3*16("dct")     \n\t" \ 
  341     "movdqa   %%xmm6, 4*16("dct")     \n\t" \ 
  342     "movdqa   "SREG2", 7*16("dct")    \n\t" 
  347         "movq     "MANGLE (m127) 
", %%mm0                              \n\t" 
  348         iMTX_MULT(
"(%0)",     
MANGLE(iTab1), ROUND(
MANGLE(walkenIdctRounders)),        PUT_EVEN(ROW0))
 
  349         iMTX_MULT("1*16(%0)", 
MANGLE(iTab2), ROUND("1*16+"
MANGLE(walkenIdctRounders)), PUT_ODD(ROW1))
 
  350         iMTX_MULT("2*16(%0)", 
MANGLE(iTab3), ROUND("2*16+"
MANGLE(walkenIdctRounders)), PUT_EVEN(ROW2))
 
  352         TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
 
  354         iMTX_MULT("3*16(%0)", 
MANGLE(iTab4), ROUND("3*16+"
MANGLE(walkenIdctRounders)), PUT_ODD(ROW3))
 
  356         TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
 
  357         TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
 
  364         iLLM_PASS_SPARSE("%0")
 
  367         iMTX_MULT("4*16(%0)", 
MANGLE(iTab1), "
#", PUT_EVEN(ROW4)) 
  369         iMTX_MULT(
"5*16(%0)", 
MANGLE(iTab4), ROUND(
"4*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW5))
 
  372         iMTX_MULT("6*16(%0)", 
MANGLE(iTab3), ROUND("5*16+"
MANGLE(walkenIdctRounders)), PUT_EVEN(ROW6))
 
  375         iMTX_MULT("7*16(%0)", 
MANGLE(iTab2), ROUND("5*16+"
MANGLE(walkenIdctRounders)), PUT_ODD(ROW7))
 
  384                        "%xmm4", 
"%xmm5", 
"%xmm6", 
"%xmm7", )
 
  387                        "%xmm12", 
"%xmm13", 
"%xmm14", )
 
  389           "%eax", 
"%ecx", 
"%edx", 
"%esi", 
"memory");