00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/avcodec.h"
00028 #include "libavcodec/dsputil.h"
00029 #include "libavcodec/mpegvideo.h"
00030 #include "dsputil_mmx.h"
00031 
/* Externally defined table of 64 16-bit entries. Nothing in the code visible
 * here references it directly; presumably it is consumed by the included
 * mpegvideo_mmx_template.c -- TODO confirm. */
00032 extern uint16_t ff_inv_zigzag_direct16[64];
00033 
00034 
00035 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00036                                   DCTELEM *block, int n, int qscale)
00037 {
00038     x86_reg level, qmul, qadd, nCoeffs;
00039 
00040     qmul = qscale << 1;
00041 
00042     assert(s->block_last_index[n]>=0 || s->h263_aic);
00043 
00044     if (!s->h263_aic) {
00045         if (n < 4)
00046             level = block[0] * s->y_dc_scale;
00047         else
00048             level = block[0] * s->c_dc_scale;
00049         qadd = (qscale - 1) | 1;
00050     }else{
00051         qadd = 0;
00052         level= block[0];
00053     }
00054     if(s->ac_pred)
00055         nCoeffs=63;
00056     else
00057         nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00058 
00059 __asm__ volatile(
00060                 "movd %1, %%mm6                 \n\t" 
00061                 "packssdw %%mm6, %%mm6          \n\t"
00062                 "packssdw %%mm6, %%mm6          \n\t"
00063                 "movd %2, %%mm5                 \n\t" 
00064                 "pxor %%mm7, %%mm7              \n\t"
00065                 "packssdw %%mm5, %%mm5          \n\t"
00066                 "packssdw %%mm5, %%mm5          \n\t"
00067                 "psubw %%mm5, %%mm7             \n\t"
00068                 "pxor %%mm4, %%mm4              \n\t"
00069                 ".p2align 4                     \n\t"
00070                 "1:                             \n\t"
00071                 "movq (%0, %3), %%mm0           \n\t"
00072                 "movq 8(%0, %3), %%mm1          \n\t"
00073 
00074                 "pmullw %%mm6, %%mm0            \n\t"
00075                 "pmullw %%mm6, %%mm1            \n\t"
00076 
00077                 "movq (%0, %3), %%mm2           \n\t"
00078                 "movq 8(%0, %3), %%mm3          \n\t"
00079 
00080                 "pcmpgtw %%mm4, %%mm2           \n\t" 
00081                 "pcmpgtw %%mm4, %%mm3           \n\t" 
00082 
00083                 "pxor %%mm2, %%mm0              \n\t"
00084                 "pxor %%mm3, %%mm1              \n\t"
00085 
00086                 "paddw %%mm7, %%mm0             \n\t"
00087                 "paddw %%mm7, %%mm1             \n\t"
00088 
00089                 "pxor %%mm0, %%mm2              \n\t"
00090                 "pxor %%mm1, %%mm3              \n\t"
00091 
00092                 "pcmpeqw %%mm7, %%mm0           \n\t" 
00093                 "pcmpeqw %%mm7, %%mm1           \n\t" 
00094 
00095                 "pandn %%mm2, %%mm0             \n\t"
00096                 "pandn %%mm3, %%mm1             \n\t"
00097 
00098                 "movq %%mm0, (%0, %3)           \n\t"
00099                 "movq %%mm1, 8(%0, %3)          \n\t"
00100 
00101                 "add $16, %3                    \n\t"
00102                 "jng 1b                         \n\t"
00103                 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00104                 : "memory"
00105         );
00106         block[0]= level;
00107 }
00108 
00109 
00110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00111                                   DCTELEM *block, int n, int qscale)
00112 {
00113     x86_reg qmul, qadd, nCoeffs;
00114 
00115     qmul = qscale << 1;
00116     qadd = (qscale - 1) | 1;
00117 
00118     assert(s->block_last_index[n]>=0 || s->h263_aic);
00119 
00120     nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00121 
00122 __asm__ volatile(
00123                 "movd %1, %%mm6                 \n\t" 
00124                 "packssdw %%mm6, %%mm6          \n\t"
00125                 "packssdw %%mm6, %%mm6          \n\t"
00126                 "movd %2, %%mm5                 \n\t" 
00127                 "pxor %%mm7, %%mm7              \n\t"
00128                 "packssdw %%mm5, %%mm5          \n\t"
00129                 "packssdw %%mm5, %%mm5          \n\t"
00130                 "psubw %%mm5, %%mm7             \n\t"
00131                 "pxor %%mm4, %%mm4              \n\t"
00132                 ".p2align 4                     \n\t"
00133                 "1:                             \n\t"
00134                 "movq (%0, %3), %%mm0           \n\t"
00135                 "movq 8(%0, %3), %%mm1          \n\t"
00136 
00137                 "pmullw %%mm6, %%mm0            \n\t"
00138                 "pmullw %%mm6, %%mm1            \n\t"
00139 
00140                 "movq (%0, %3), %%mm2           \n\t"
00141                 "movq 8(%0, %3), %%mm3          \n\t"
00142 
00143                 "pcmpgtw %%mm4, %%mm2           \n\t" 
00144                 "pcmpgtw %%mm4, %%mm3           \n\t" 
00145 
00146                 "pxor %%mm2, %%mm0              \n\t"
00147                 "pxor %%mm3, %%mm1              \n\t"
00148 
00149                 "paddw %%mm7, %%mm0             \n\t"
00150                 "paddw %%mm7, %%mm1             \n\t"
00151 
00152                 "pxor %%mm0, %%mm2              \n\t"
00153                 "pxor %%mm1, %%mm3              \n\t"
00154 
00155                 "pcmpeqw %%mm7, %%mm0           \n\t" 
00156                 "pcmpeqw %%mm7, %%mm1           \n\t" 
00157 
00158                 "pandn %%mm2, %%mm0             \n\t"
00159                 "pandn %%mm3, %%mm1             \n\t"
00160 
00161                 "movq %%mm0, (%0, %3)           \n\t"
00162                 "movq %%mm1, 8(%0, %3)          \n\t"
00163 
00164                 "add $16, %3                    \n\t"
00165                 "jng 1b                         \n\t"
00166                 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00167                 : "memory"
00168         );
00169 }
00170 
00171 
00172 
00173 
00174 
00175 
00176 
00177 
00178 
00179 
00180 
00181 
00182 
00183 
00184 
00185 
00186 
00187 
00188 
00189 
00190 
00191 
00192 
00193 
00194 
00195 
00196 
00197 
00198 
00199 
00200 
00201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00202                                      DCTELEM *block, int n, int qscale)
00203 {
00204     x86_reg nCoeffs;
00205     const uint16_t *quant_matrix;
00206     int block0;
00207 
00208     assert(s->block_last_index[n]>=0);
00209 
00210     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00211 
00212     if (n < 4)
00213         block0 = block[0] * s->y_dc_scale;
00214     else
00215         block0 = block[0] * s->c_dc_scale;
00216     
00217     quant_matrix = s->intra_matrix;
00218 __asm__ volatile(
00219                 "pcmpeqw %%mm7, %%mm7           \n\t"
00220                 "psrlw $15, %%mm7               \n\t"
00221                 "movd %2, %%mm6                 \n\t"
00222                 "packssdw %%mm6, %%mm6          \n\t"
00223                 "packssdw %%mm6, %%mm6          \n\t"
00224                 "mov %3, %%"REG_a"              \n\t"
00225                 ".p2align 4                     \n\t"
00226                 "1:                             \n\t"
00227                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00228                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00229                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00230                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00231                 "pmullw %%mm6, %%mm4            \n\t" 
00232                 "pmullw %%mm6, %%mm5            \n\t" 
00233                 "pxor %%mm2, %%mm2              \n\t"
00234                 "pxor %%mm3, %%mm3              \n\t"
00235                 "pcmpgtw %%mm0, %%mm2           \n\t" 
00236                 "pcmpgtw %%mm1, %%mm3           \n\t" 
00237                 "pxor %%mm2, %%mm0              \n\t"
00238                 "pxor %%mm3, %%mm1              \n\t"
00239                 "psubw %%mm2, %%mm0             \n\t" 
00240                 "psubw %%mm3, %%mm1             \n\t" 
00241                 "pmullw %%mm4, %%mm0            \n\t" 
00242                 "pmullw %%mm5, %%mm1            \n\t" 
00243                 "pxor %%mm4, %%mm4              \n\t"
00244                 "pxor %%mm5, %%mm5              \n\t" 
00245                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" 
00246                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" 
00247                 "psraw $3, %%mm0                \n\t"
00248                 "psraw $3, %%mm1                \n\t"
00249                 "psubw %%mm7, %%mm0             \n\t"
00250                 "psubw %%mm7, %%mm1             \n\t"
00251                 "por %%mm7, %%mm0               \n\t"
00252                 "por %%mm7, %%mm1               \n\t"
00253                 "pxor %%mm2, %%mm0              \n\t"
00254                 "pxor %%mm3, %%mm1              \n\t"
00255                 "psubw %%mm2, %%mm0             \n\t"
00256                 "psubw %%mm3, %%mm1             \n\t"
00257                 "pandn %%mm0, %%mm4             \n\t"
00258                 "pandn %%mm1, %%mm5             \n\t"
00259                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00260                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00261 
00262                 "add $16, %%"REG_a"             \n\t"
00263                 "js 1b                          \n\t"
00264                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00265                 : "%"REG_a, "memory"
00266         );
00267     block[0]= block0;
00268 }
00269 
00270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00271                                      DCTELEM *block, int n, int qscale)
00272 {
00273     x86_reg nCoeffs;
00274     const uint16_t *quant_matrix;
00275 
00276     assert(s->block_last_index[n]>=0);
00277 
00278     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00279 
00280         quant_matrix = s->inter_matrix;
00281 __asm__ volatile(
00282                 "pcmpeqw %%mm7, %%mm7           \n\t"
00283                 "psrlw $15, %%mm7               \n\t"
00284                 "movd %2, %%mm6                 \n\t"
00285                 "packssdw %%mm6, %%mm6          \n\t"
00286                 "packssdw %%mm6, %%mm6          \n\t"
00287                 "mov %3, %%"REG_a"              \n\t"
00288                 ".p2align 4                     \n\t"
00289                 "1:                             \n\t"
00290                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00291                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00292                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00293                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00294                 "pmullw %%mm6, %%mm4            \n\t" 
00295                 "pmullw %%mm6, %%mm5            \n\t" 
00296                 "pxor %%mm2, %%mm2              \n\t"
00297                 "pxor %%mm3, %%mm3              \n\t"
00298                 "pcmpgtw %%mm0, %%mm2           \n\t" 
00299                 "pcmpgtw %%mm1, %%mm3           \n\t" 
00300                 "pxor %%mm2, %%mm0              \n\t"
00301                 "pxor %%mm3, %%mm1              \n\t"
00302                 "psubw %%mm2, %%mm0             \n\t" 
00303                 "psubw %%mm3, %%mm1             \n\t" 
00304                 "paddw %%mm0, %%mm0             \n\t" 
00305                 "paddw %%mm1, %%mm1             \n\t" 
00306                 "paddw %%mm7, %%mm0             \n\t" 
00307                 "paddw %%mm7, %%mm1             \n\t" 
00308                 "pmullw %%mm4, %%mm0            \n\t" 
00309                 "pmullw %%mm5, %%mm1            \n\t" 
00310                 "pxor %%mm4, %%mm4              \n\t"
00311                 "pxor %%mm5, %%mm5              \n\t" 
00312                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" 
00313                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" 
00314                 "psraw $4, %%mm0                \n\t"
00315                 "psraw $4, %%mm1                \n\t"
00316                 "psubw %%mm7, %%mm0             \n\t"
00317                 "psubw %%mm7, %%mm1             \n\t"
00318                 "por %%mm7, %%mm0               \n\t"
00319                 "por %%mm7, %%mm1               \n\t"
00320                 "pxor %%mm2, %%mm0              \n\t"
00321                 "pxor %%mm3, %%mm1              \n\t"
00322                 "psubw %%mm2, %%mm0             \n\t"
00323                 "psubw %%mm3, %%mm1             \n\t"
00324                 "pandn %%mm0, %%mm4             \n\t"
00325                 "pandn %%mm1, %%mm5             \n\t"
00326                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00327                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00328 
00329                 "add $16, %%"REG_a"             \n\t"
00330                 "js 1b                          \n\t"
00331                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00332                 : "%"REG_a, "memory"
00333         );
00334 }
00335 
00336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00337                                      DCTELEM *block, int n, int qscale)
00338 {
00339     x86_reg nCoeffs;
00340     const uint16_t *quant_matrix;
00341     int block0;
00342 
00343     assert(s->block_last_index[n]>=0);
00344 
00345     if(s->alternate_scan) nCoeffs= 63; 
00346     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00347 
00348     if (n < 4)
00349         block0 = block[0] * s->y_dc_scale;
00350     else
00351         block0 = block[0] * s->c_dc_scale;
00352     quant_matrix = s->intra_matrix;
00353 __asm__ volatile(
00354                 "pcmpeqw %%mm7, %%mm7           \n\t"
00355                 "psrlw $15, %%mm7               \n\t"
00356                 "movd %2, %%mm6                 \n\t"
00357                 "packssdw %%mm6, %%mm6          \n\t"
00358                 "packssdw %%mm6, %%mm6          \n\t"
00359                 "mov %3, %%"REG_a"              \n\t"
00360                 ".p2align 4                     \n\t"
00361                 "1:                             \n\t"
00362                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00363                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00364                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00365                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00366                 "pmullw %%mm6, %%mm4            \n\t" 
00367                 "pmullw %%mm6, %%mm5            \n\t" 
00368                 "pxor %%mm2, %%mm2              \n\t"
00369                 "pxor %%mm3, %%mm3              \n\t"
00370                 "pcmpgtw %%mm0, %%mm2           \n\t" 
00371                 "pcmpgtw %%mm1, %%mm3           \n\t" 
00372                 "pxor %%mm2, %%mm0              \n\t"
00373                 "pxor %%mm3, %%mm1              \n\t"
00374                 "psubw %%mm2, %%mm0             \n\t" 
00375                 "psubw %%mm3, %%mm1             \n\t" 
00376                 "pmullw %%mm4, %%mm0            \n\t" 
00377                 "pmullw %%mm5, %%mm1            \n\t" 
00378                 "pxor %%mm4, %%mm4              \n\t"
00379                 "pxor %%mm5, %%mm5              \n\t" 
00380                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" 
00381                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" 
00382                 "psraw $3, %%mm0                \n\t"
00383                 "psraw $3, %%mm1                \n\t"
00384                 "pxor %%mm2, %%mm0              \n\t"
00385                 "pxor %%mm3, %%mm1              \n\t"
00386                 "psubw %%mm2, %%mm0             \n\t"
00387                 "psubw %%mm3, %%mm1             \n\t"
00388                 "pandn %%mm0, %%mm4             \n\t"
00389                 "pandn %%mm1, %%mm5             \n\t"
00390                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00391                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00392 
00393                 "add $16, %%"REG_a"             \n\t"
00394                 "jng 1b                         \n\t"
00395                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00396                 : "%"REG_a, "memory"
00397         );
00398     block[0]= block0;
00399         
00400 }
00401 
00402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00403                                      DCTELEM *block, int n, int qscale)
00404 {
00405     x86_reg nCoeffs;
00406     const uint16_t *quant_matrix;
00407 
00408     assert(s->block_last_index[n]>=0);
00409 
00410     if(s->alternate_scan) nCoeffs= 63; 
00411     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00412 
00413         quant_matrix = s->inter_matrix;
00414 __asm__ volatile(
00415                 "pcmpeqw %%mm7, %%mm7           \n\t"
00416                 "psrlq $48, %%mm7               \n\t"
00417                 "movd %2, %%mm6                 \n\t"
00418                 "packssdw %%mm6, %%mm6          \n\t"
00419                 "packssdw %%mm6, %%mm6          \n\t"
00420                 "mov %3, %%"REG_a"              \n\t"
00421                 ".p2align 4                     \n\t"
00422                 "1:                             \n\t"
00423                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00424                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00425                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00426                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00427                 "pmullw %%mm6, %%mm4            \n\t" 
00428                 "pmullw %%mm6, %%mm5            \n\t" 
00429                 "pxor %%mm2, %%mm2              \n\t"
00430                 "pxor %%mm3, %%mm3              \n\t"
00431                 "pcmpgtw %%mm0, %%mm2           \n\t" 
00432                 "pcmpgtw %%mm1, %%mm3           \n\t" 
00433                 "pxor %%mm2, %%mm0              \n\t"
00434                 "pxor %%mm3, %%mm1              \n\t"
00435                 "psubw %%mm2, %%mm0             \n\t" 
00436                 "psubw %%mm3, %%mm1             \n\t" 
00437                 "paddw %%mm0, %%mm0             \n\t" 
00438                 "paddw %%mm1, %%mm1             \n\t" 
00439                 "pmullw %%mm4, %%mm0            \n\t" 
00440                 "pmullw %%mm5, %%mm1            \n\t" 
00441                 "paddw %%mm4, %%mm0             \n\t" 
00442                 "paddw %%mm5, %%mm1             \n\t" 
00443                 "pxor %%mm4, %%mm4              \n\t"
00444                 "pxor %%mm5, %%mm5              \n\t" 
00445                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" 
00446                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" 
00447                 "psrlw $4, %%mm0                \n\t"
00448                 "psrlw $4, %%mm1                \n\t"
00449                 "pxor %%mm2, %%mm0              \n\t"
00450                 "pxor %%mm3, %%mm1              \n\t"
00451                 "psubw %%mm2, %%mm0             \n\t"
00452                 "psubw %%mm3, %%mm1             \n\t"
00453                 "pandn %%mm0, %%mm4             \n\t"
00454                 "pandn %%mm1, %%mm5             \n\t"
00455                 "pxor %%mm4, %%mm7              \n\t"
00456                 "pxor %%mm5, %%mm7              \n\t"
00457                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00458                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00459 
00460                 "add $16, %%"REG_a"             \n\t"
00461                 "jng 1b                         \n\t"
00462                 "movd 124(%0, %3), %%mm0        \n\t"
00463                 "movq %%mm7, %%mm6              \n\t"
00464                 "psrlq $32, %%mm7               \n\t"
00465                 "pxor %%mm6, %%mm7              \n\t"
00466                 "movq %%mm7, %%mm6              \n\t"
00467                 "psrlq $16, %%mm7               \n\t"
00468                 "pxor %%mm6, %%mm7              \n\t"
00469                 "pslld $31, %%mm7               \n\t"
00470                 "psrlq $15, %%mm7               \n\t"
00471                 "pxor %%mm7, %%mm0              \n\t"
00472                 "movd %%mm0, 124(%0, %3)        \n\t"
00473 
00474                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
00475                 : "%"REG_a, "memory"
00476         );
00477 }
00478 
00479 static void  denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00480     const int intra= s->mb_intra;
00481     int *sum= s->dct_error_sum[intra];
00482     uint16_t *offset= s->dct_offset[intra];
00483 
00484     s->dct_count[intra]++;
00485 
00486     __asm__ volatile(
00487         "pxor %%mm7, %%mm7                      \n\t"
00488         "1:                                     \n\t"
00489         "pxor %%mm0, %%mm0                      \n\t"
00490         "pxor %%mm1, %%mm1                      \n\t"
00491         "movq (%0), %%mm2                       \n\t"
00492         "movq 8(%0), %%mm3                      \n\t"
00493         "pcmpgtw %%mm2, %%mm0                   \n\t"
00494         "pcmpgtw %%mm3, %%mm1                   \n\t"
00495         "pxor %%mm0, %%mm2                      \n\t"
00496         "pxor %%mm1, %%mm3                      \n\t"
00497         "psubw %%mm0, %%mm2                     \n\t"
00498         "psubw %%mm1, %%mm3                     \n\t"
00499         "movq %%mm2, %%mm4                      \n\t"
00500         "movq %%mm3, %%mm5                      \n\t"
00501         "psubusw (%2), %%mm2                    \n\t"
00502         "psubusw 8(%2), %%mm3                   \n\t"
00503         "pxor %%mm0, %%mm2                      \n\t"
00504         "pxor %%mm1, %%mm3                      \n\t"
00505         "psubw %%mm0, %%mm2                     \n\t"
00506         "psubw %%mm1, %%mm3                     \n\t"
00507         "movq %%mm2, (%0)                       \n\t"
00508         "movq %%mm3, 8(%0)                      \n\t"
00509         "movq %%mm4, %%mm2                      \n\t"
00510         "movq %%mm5, %%mm3                      \n\t"
00511         "punpcklwd %%mm7, %%mm4                 \n\t"
00512         "punpckhwd %%mm7, %%mm2                 \n\t"
00513         "punpcklwd %%mm7, %%mm5                 \n\t"
00514         "punpckhwd %%mm7, %%mm3                 \n\t"
00515         "paddd (%1), %%mm4                      \n\t"
00516         "paddd 8(%1), %%mm2                     \n\t"
00517         "paddd 16(%1), %%mm5                    \n\t"
00518         "paddd 24(%1), %%mm3                    \n\t"
00519         "movq %%mm4, (%1)                       \n\t"
00520         "movq %%mm2, 8(%1)                      \n\t"
00521         "movq %%mm5, 16(%1)                     \n\t"
00522         "movq %%mm3, 24(%1)                     \n\t"
00523         "add $16, %0                            \n\t"
00524         "add $32, %1                            \n\t"
00525         "add $16, %2                            \n\t"
00526         "cmp %3, %0                             \n\t"
00527             " jb 1b                             \n\t"
00528         : "+r" (block), "+r" (sum), "+r" (offset)
00529         : "r"(block+64)
00530     );
00531 }
00532 
00533 static void  denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00534     const int intra= s->mb_intra;
00535     int *sum= s->dct_error_sum[intra];
00536     uint16_t *offset= s->dct_offset[intra];
00537 
00538     s->dct_count[intra]++;
00539 
00540     __asm__ volatile(
00541         "pxor %%xmm7, %%xmm7                    \n\t"
00542         "1:                                     \n\t"
00543         "pxor %%xmm0, %%xmm0                    \n\t"
00544         "pxor %%xmm1, %%xmm1                    \n\t"
00545         "movdqa (%0), %%xmm2                    \n\t"
00546         "movdqa 16(%0), %%xmm3                  \n\t"
00547         "pcmpgtw %%xmm2, %%xmm0                 \n\t"
00548         "pcmpgtw %%xmm3, %%xmm1                 \n\t"
00549         "pxor %%xmm0, %%xmm2                    \n\t"
00550         "pxor %%xmm1, %%xmm3                    \n\t"
00551         "psubw %%xmm0, %%xmm2                   \n\t"
00552         "psubw %%xmm1, %%xmm3                   \n\t"
00553         "movdqa %%xmm2, %%xmm4                  \n\t"
00554         "movdqa %%xmm3, %%xmm5                  \n\t"
00555         "psubusw (%2), %%xmm2                   \n\t"
00556         "psubusw 16(%2), %%xmm3                 \n\t"
00557         "pxor %%xmm0, %%xmm2                    \n\t"
00558         "pxor %%xmm1, %%xmm3                    \n\t"
00559         "psubw %%xmm0, %%xmm2                   \n\t"
00560         "psubw %%xmm1, %%xmm3                   \n\t"
00561         "movdqa %%xmm2, (%0)                    \n\t"
00562         "movdqa %%xmm3, 16(%0)                  \n\t"
00563         "movdqa %%xmm4, %%xmm6                  \n\t"
00564         "movdqa %%xmm5, %%xmm0                  \n\t"
00565         "punpcklwd %%xmm7, %%xmm4               \n\t"
00566         "punpckhwd %%xmm7, %%xmm6               \n\t"
00567         "punpcklwd %%xmm7, %%xmm5               \n\t"
00568         "punpckhwd %%xmm7, %%xmm0               \n\t"
00569         "paddd (%1), %%xmm4                     \n\t"
00570         "paddd 16(%1), %%xmm6                   \n\t"
00571         "paddd 32(%1), %%xmm5                   \n\t"
00572         "paddd 48(%1), %%xmm0                   \n\t"
00573         "movdqa %%xmm4, (%1)                    \n\t"
00574         "movdqa %%xmm6, 16(%1)                  \n\t"
00575         "movdqa %%xmm5, 32(%1)                  \n\t"
00576         "movdqa %%xmm0, 48(%1)                  \n\t"
00577         "add $32, %0                            \n\t"
00578         "add $64, %1                            \n\t"
00579         "add $32, %2                            \n\t"
00580         "cmp %3, %0                             \n\t"
00581             " jb 1b                             \n\t"
00582         : "+r" (block), "+r" (sum), "+r" (offset)
00583         : "r"(block+64)
00584           XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
00585                             "%xmm4", "%xmm5", "%xmm6", "%xmm7")
00586     );
00587 }
00588 
/* Remember whether the build configuration enabled SSSE3, then force
 * HAVE_SSSE3 to 0 for the first three instantiations of the template. */
00589 #if HAVE_SSSE3
00590 #define HAVE_SSSE3_BAK
00591 #endif
00592 #undef HAVE_SSSE3
00593 #define HAVE_SSSE3 0
00594 
/* Instantiation 1: HAVE_SSE2 = 0 and HAVE_MMX2 = 0; RENAME() appends _MMX and
 * RENAMEl() appends _mmx to the names passed to them. */
00595 #undef HAVE_SSE2
00596 #undef HAVE_MMX2
00597 #define HAVE_SSE2 0
00598 #define HAVE_MMX2 0
00599 #define RENAME(a) a ## _MMX
00600 #define RENAMEl(a) a ## _mmx
00601 #include "mpegvideo_mmx_template.c"
00602 
/* Instantiation 2: HAVE_MMX2 = 1 (HAVE_SSE2 still 0); suffixes _MMX2 / _mmx2. */
00603 #undef HAVE_MMX2
00604 #define HAVE_MMX2 1
00605 #undef RENAME
00606 #undef RENAMEl
00607 #define RENAME(a) a ## _MMX2
00608 #define RENAMEl(a) a ## _mmx2
00609 #include "mpegvideo_mmx_template.c"
00610 
/* Instantiation 3: HAVE_SSE2 = 1 (HAVE_MMX2 stays 1); suffixes _SSE2 / _sse2. */
00611 #undef HAVE_SSE2
00612 #define HAVE_SSE2 1
00613 #undef RENAME
00614 #undef RENAMEl
00615 #define RENAME(a) a ## _SSE2
00616 #define RENAMEl(a) a ## _sse2
00617 #include "mpegvideo_mmx_template.c"
00618 
/* Instantiation 4, only when SSSE3 was enabled originally: HAVE_SSSE3 is set
 * back to 1 and RENAME() appends _SSSE3, while RENAMEl() keeps the _sse2
 * suffix. If SSSE3 was not enabled, HAVE_SSSE3 stays 0 from here on, which is
 * what the "#if HAVE_SSSE3" in ff_MPV_common_init_mmx() tests. */
00619 #ifdef HAVE_SSSE3_BAK
00620 #undef HAVE_SSSE3
00621 #define HAVE_SSSE3 1
00622 #undef RENAME
00623 #undef RENAMEl
00624 #define RENAME(a) a ## _SSSE3
00625 #define RENAMEl(a) a ## _sse2
00626 #include "mpegvideo_mmx_template.c"
00627 #endif
00628 
00629 void ff_MPV_common_init_mmx(MpegEncContext *s)
00630 {
00631     int mm_flags = av_get_cpu_flags();
00632 
00633     if (mm_flags & AV_CPU_FLAG_MMX) {
00634         const int dct_algo = s->avctx->dct_algo;
00635 
00636         s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00637         s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00638         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00639         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
00640         if(!(s->flags & CODEC_FLAG_BITEXACT))
00641             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00642         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00643 
00644         if (mm_flags & AV_CPU_FLAG_SSE2) {
00645             s->denoise_dct= denoise_dct_sse2;
00646         } else {
00647                 s->denoise_dct= denoise_dct_mmx;
00648         }
00649 
00650         if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00651 #if HAVE_SSSE3
00652             if(mm_flags & AV_CPU_FLAG_SSSE3){
00653                 s->dct_quantize= dct_quantize_SSSE3;
00654             } else
00655 #endif
00656             if(mm_flags & AV_CPU_FLAG_SSE2){
00657                 s->dct_quantize= dct_quantize_SSE2;
00658             } else if(mm_flags & AV_CPU_FLAG_MMX2){
00659                 s->dct_quantize= dct_quantize_MMX2;
00660             } else {
00661                 s->dct_quantize= dct_quantize_MMX;
00662             }
00663         }
00664     }
00665 }