00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "libavutil/common.h"
00024 #include "libavcodec/dsputil.h"
00025
00026 #include "mmx.h"
00027
/* Wraps the GCC-style aligned attribute so data can be declared with a
 * given byte alignment. */
00028 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
00029
/* Right-shift counts: ROW_SHIFT is applied to the 32-bit results of the row
 * pass, COL_SHIFT to the 16-bit results of the column pass. */
00030 #define ROW_SHIFT 11
00031 #define COL_SHIFT 6
00032
/* round(bias) yields (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
 * rounder(bias) expands to a two-element initializer holding that value twice.
 * NOTE(review): the macro name round would collide with round() from <math.h>
 * if that header were ever included before this point. */
00033 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
00034 #define rounder(bias) {round (bias), round (bias)}
00035
00036
/*
 * Disabled (#if 0) scalar C version of the 8-point row transform.
 * It is never compiled; presumably it is kept as a readable reference for
 * the MMX row code below - confirm before relying on it.
 *
 * row     - coefficient block; the eight values at row[offset..offset+7]
 *           are transformed in place
 * offset  - element offset of the row inside the block
 * table   - coefficients; entries 1..7 are read as C1..C7 (entry 0 unused)
 * rounder - pointer to the rounding bias added to each a-term
 */
00037 #if 0
00038
00039 static inline void idct_row (int16_t * row, int offset,
00040 int16_t * table, int32_t * rounder)
00041 {
00042 int C1, C2, C3, C4, C5, C6, C7;
00043 int a0, a1, a2, a3, b0, b1, b2, b3;
00044
00045 row += offset;
00046
00047 C1 = table[1];
00048 C2 = table[2];
00049 C3 = table[3];
00050 C4 = table[4];
00051 C5 = table[5];
00052 C6 = table[6];
00053 C7 = table[7];
00054 /* a0..a3: sums over the even-indexed inputs, each including the bias */
00055 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
00056 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
00057 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
00058 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
00059 /* b0..b3: sums over the odd-indexed inputs (no bias) */
00060 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
00061 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
00062 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
00063 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
00064 /* outputs 0..3 are a+b, outputs 7..4 are a-b, all scaled down by ROW_SHIFT */
00065 row[0] = (a0 + b0) >> ROW_SHIFT;
00066 row[1] = (a1 + b1) >> ROW_SHIFT;
00067 row[2] = (a2 + b2) >> ROW_SHIFT;
00068 row[3] = (a3 + b3) >> ROW_SHIFT;
00069 row[4] = (a3 - b3) >> ROW_SHIFT;
00070 row[5] = (a2 - b2) >> ROW_SHIFT;
00071 row[6] = (a1 - b1) >> ROW_SHIFT;
00072 row[7] = (a0 - b0) >> ROW_SHIFT;
00073 }
00074 #endif
00075
00076
00077
00078
/*
 * Expands to a 32-entry initializer built from the coefficients c1..c7.
 * The entries form eight groups of four; the mmxext row code reads them as
 * quadwords at table+0, +4, +8, +12, +16, +20, +24 and +28, so the order of
 * the groups and of the values inside each group must not be changed.
 */
00079 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
00080 c4, c6, c4, c6, \
00081 c1, c3, -c1, -c5, \
00082 c5, c7, c3, -c7, \
00083 c4, -c6, c4, -c6, \
00084 -c4, c2, c4, -c2, \
00085 c5, -c1, c3, -c1, \
00086 c7, c3, c7, -c5 }
00087
/*
 * Starts the mmxext row transform for the row at row+offset: loads the row
 * and the first two table groups into MMX registers and issues the first
 * multiply-add. The remaining work is done by mmxext_row(), which expects
 * the register state left here (mm0, mm2..mm6).
 *
 * NOTE(review): the per-line comments assume the mmx.h macros wrap the
 * same-named MMX/SSE instructions and take (source, destination) operands -
 * confirm against mmx.h.
 *
 * row    - base pointer of the coefficient block
 * offset - element offset of the row to load
 * table  - coefficient table laid out by mmxext_table()
 */
00088 static inline void mmxext_row_head (int16_t * const row, const int offset,
00089 const int16_t * const table)
00090 {
00091 movq_m2r (*(row+offset), mm2); /* mm2 = first four words of the row */
00092
00093 movq_m2r (*(row+offset+4), mm5); /* mm5 = last four words of the row */
00094 movq_r2r (mm2, mm0); /* mm0 = copy of mm2 */
00095
00096 movq_m2r (*table, mm3); /* mm3 = table[0..3] */
00097 movq_r2r (mm5, mm6); /* mm6 = copy of mm5 */
00098
00099 movq_m2r (*(table+4), mm4); /* mm4 = table[4..7] */
00100 pmaddwd_r2r (mm0, mm3); /* mm3 = pairwise multiply-add of mm0 with table[0..3] */
00101
00102 pshufw_r2r (mm2, mm2, 0x4e); /* swap the two 32-bit halves of mm2 */
00103 }
00104
/*
 * Core of the mmxext row transform. Continues from the register state left
 * by mmxext_row_head() or mmxext_row_mid().
 *
 * The sums built from the first input quadword (mm0/mm2/mm3/mm4) play the
 * role of the a-terms and those built from the second quadword (mm1/mm5/
 * mm6/mm7) the role of the b-terms of the disabled scalar idct_row above
 * (presumably; the input word order is not visible here).
 *
 * On exit: mm1 = (a+b) and mm3 = (a-b) for one output pair, already shifted
 * by ROW_SHIFT; mm0 = (a+b) and mm4 = (a-b) for the other pair, not yet
 * shifted (the following tail/mid step shifts them).
 *
 * table   - coefficient table laid out by mmxext_table()
 * rounder - pointer to a pair of 32-bit rounding biases, added once to each
 *           a-term register
 */
00105 static inline void mmxext_row (const int16_t * const table,
00106 const int32_t * const rounder)
00107 {
00108 movq_m2r (*(table+8), mm1); /* mm1 = table[8..11] */
00109 pmaddwd_r2r (mm2, mm4); /* mm4 = madd(half-swapped first quad, table[4..7]) */
00110
00111 pmaddwd_m2r (*(table+16), mm0); /* mm0 = madd(first quad, table[16..19]) */
00112 pshufw_r2r (mm6, mm6, 0x4e); /* swap the two 32-bit halves of mm6 */
00113
00114 movq_m2r (*(table+12), mm7); /* mm7 = table[12..15] */
00115 pmaddwd_r2r (mm5, mm1); /* mm1 = madd(second quad, table[8..11]) */
00116
00117 paddd_m2r (*rounder, mm3); /* add the rounding bias to the first a-term pair */
00118 pmaddwd_r2r (mm6, mm7); /* mm7 = madd(half-swapped second quad, table[12..15]) */
00119
00120 pmaddwd_m2r (*(table+20), mm2); /* mm2 = madd(half-swapped first quad, table[20..23]) */
00121 paddd_r2r (mm4, mm3); /* mm3 = first a-term pair + bias */
00122
00123 pmaddwd_m2r (*(table+24), mm5); /* mm5 = madd(second quad, table[24..27]) */
00124 movq_r2r (mm3, mm4); /* mm4 = copy of the first a-term pair */
00125
00126 pmaddwd_m2r (*(table+28), mm6); /* mm6 = madd(half-swapped second quad, table[28..31]) */
00127 paddd_r2r (mm7, mm1); /* mm1 = first b-term pair */
00128
00129 paddd_m2r (*rounder, mm0); /* add the rounding bias to the second a-term pair */
00130 psubd_r2r (mm1, mm3); /* mm3 = a - b (first pair) */
00131
00132 psrad_i2r (ROW_SHIFT, mm3); /* scale a - b down (arithmetic shift) */
00133 paddd_r2r (mm4, mm1); /* mm1 = a + b (first pair) */
00134
00135 paddd_r2r (mm2, mm0); /* mm0 = second a-term pair + bias */
00136 psrad_i2r (ROW_SHIFT, mm1); /* scale a + b down */
00137
00138 paddd_r2r (mm6, mm5); /* mm5 = second b-term pair */
00139 movq_r2r (mm0, mm4); /* mm4 = copy of the second a-term pair */
00140
00141 paddd_r2r (mm5, mm0); /* mm0 = a + b (second pair), shifted later */
00142 psubd_r2r (mm5, mm4); /* mm4 = a - b (second pair), shifted later */
00143 }
00144
/*
 * Finishes the mmxext row transform after mmxext_row(): applies the
 * outstanding ROW_SHIFT to mm0 and mm4, packs the four 32-bit result
 * registers to 16-bit words with signed saturation and stores the eight
 * outputs at row[store..store+7].
 *
 * row   - base pointer of the coefficient block
 * store - element offset of the row to write
 */
00145 static inline void mmxext_row_tail (int16_t * const row, const int store)
00146 {
00147 psrad_i2r (ROW_SHIFT, mm0); /* scale the second a + b pair */
00148
00149 psrad_i2r (ROW_SHIFT, mm4); /* scale the second a - b pair */
00150
00151 packssdw_r2r (mm0, mm1); /* mm1 = both a + b pairs as four 16-bit words */
00152
00153 packssdw_r2r (mm3, mm4); /* mm4 = both a - b pairs as four 16-bit words */
00154
00155 movq_r2m (mm1, *(row+store)); /* write the first four outputs */
00156 pshufw_r2r (mm4, mm4, 0xb1); /* swap the two words inside each 32-bit half */
00157
00158
00159
00160 movq_r2m (mm4, *(row+store+4)); /* write the last four outputs */
00161 }
00162
/*
 * Pipelined step between two rows: performs the same instructions as
 * mmxext_row_tail() for the row just computed (written to row+store) and
 * as mmxext_row_head() for the next row (read from row+offset), with the
 * two instruction streams interleaved.
 * The next row is loaded before the previous one is stored, so store and
 * offset must refer to different rows.
 *
 * row    - base pointer of the coefficient block
 * store  - element offset of the finished row to write
 * offset - element offset of the next row to load
 * table  - mmxext_table() coefficients for the next row
 */
00163 static inline void mmxext_row_mid (int16_t * const row, const int store,
00164 const int offset,
00165 const int16_t * const table)
00166 {
00167 movq_m2r (*(row+offset), mm2); /* next: mm2 = first four words */
00168 psrad_i2r (ROW_SHIFT, mm0); /* prev: scale the second a + b pair */
00169
00170 movq_m2r (*(row+offset+4), mm5); /* next: mm5 = last four words */
00171 psrad_i2r (ROW_SHIFT, mm4); /* prev: scale the second a - b pair */
00172
00173 packssdw_r2r (mm0, mm1); /* prev: pack the a + b results to 16 bits */
00174 movq_r2r (mm5, mm6); /* next: mm6 = copy of mm5 */
00175
00176 packssdw_r2r (mm3, mm4); /* prev: pack the a - b results to 16 bits */
00177 movq_r2r (mm2, mm0); /* next: mm0 = copy of mm2 */
00178
00179 movq_r2m (mm1, *(row+store)); /* prev: write the first four outputs */
00180 pshufw_r2r (mm4, mm4, 0xb1); /* prev: swap the words inside each 32-bit half */
00181
00182 movq_m2r (*table, mm3); /* next: mm3 = table[0..3] */
00183 movq_r2m (mm4, *(row+store+4)); /* prev: write the last four outputs */
00184
00185 pmaddwd_r2r (mm0, mm3); /* next: first multiply-add */
00186
00187 movq_m2r (*(table+4), mm4); /* next: mm4 = table[4..7] */
00188 pshufw_r2r (mm2, mm2, 0x4e); /* next: swap the two 32-bit halves of mm2 */
00189 }
00190
00191
00192
00193
/*
 * Expands to a 32-entry initializer built from the coefficients c1..c7 for
 * the plain-MMX row code. Same idea as mmxext_table() but with a different
 * ordering, matching the punpck-based operand layout used by mmx_row_head(),
 * mmx_row() and mmx_row_mid(); the order must not be changed.
 */
00194 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \
00195 c4, c6, -c4, -c2, \
00196 c1, c3, c3, -c7, \
00197 c5, c7, -c1, -c5, \
00198 c4, -c6, c4, -c2, \
00199 -c4, c2, c4, -c6, \
00200 c5, -c1, c7, -c5, \
00201 c7, c3, c3, -c1 }
00202
/*
 * Starts the plain-MMX row transform for the row at row+offset. Unlike the
 * mmxext variant it duplicates 32-bit halves with punpckldq/punpckhdq
 * instead of using pshufw, and it already loads table[8..11] into mm1.
 * mmx_row() continues from the register state left here.
 *
 * NOTE(review): the per-line comments assume the mmx.h macros wrap the
 * same-named MMX instructions and take (source, destination) operands -
 * confirm against mmx.h.
 *
 * row    - base pointer of the coefficient block
 * offset - element offset of the row to load
 * table  - coefficient table laid out by mmx_table()
 */
00203 static inline void mmx_row_head (int16_t * const row, const int offset,
00204 const int16_t * const table)
00205 {
00206 movq_m2r (*(row+offset), mm2); /* mm2 = first four words of the row */
00207
00208 movq_m2r (*(row+offset+4), mm5); /* mm5 = last four words of the row */
00209 movq_r2r (mm2, mm0); /* mm0 = copy of mm2 */
00210
00211 movq_m2r (*table, mm3); /* mm3 = table[0..3] */
00212 movq_r2r (mm5, mm6); /* mm6 = copy of mm5 */
00213
00214 punpckldq_r2r (mm0, mm0); /* mm0 = its low 32 bits in both halves */
00215
00216 movq_m2r (*(table+4), mm4); /* mm4 = table[4..7] */
00217 pmaddwd_r2r (mm0, mm3); /* mm3 = pairwise multiply-add of mm0 with table[0..3] */
00218
00219 movq_m2r (*(table+8), mm1); /* mm1 = table[8..11] */
00220 punpckhdq_r2r (mm2, mm2); /* mm2 = its high 32 bits in both halves */
00221 }
00222
/*
 * Core of the plain-MMX row transform. Continues from the register state
 * left by mmx_row_head() or mmx_row_mid() (mm1 already holds table[8..11]).
 *
 * Same arithmetic structure as mmxext_row(): sums from the first input
 * quadword act as a-terms, sums from the second quadword as b-terms.
 *
 * On exit: mm1 = (a+b) and mm3 = (a-b) for one pair, already shifted by
 * ROW_SHIFT; mm0 = (a+b) and mm7 = (a-b) for the other pair, not yet
 * shifted. Note the second difference lives in mm7 here, not mm4 as in the
 * mmxext variant.
 *
 * table   - coefficient table laid out by mmx_table()
 * rounder - pointer to a pair of 32-bit rounding biases
 */
00223 static inline void mmx_row (const int16_t * const table,
00224 const int32_t * const rounder)
00225 {
00226 pmaddwd_r2r (mm2, mm4); /* mm4 = madd(high half of first quad, table[4..7]) */
00227 punpckldq_r2r (mm5, mm5); /* mm5 = its low 32 bits in both halves */
00228
00229 pmaddwd_m2r (*(table+16), mm0); /* mm0 = madd(low half of first quad, table[16..19]) */
00230 punpckhdq_r2r (mm6, mm6); /* mm6 = its high 32 bits in both halves */
00231
00232 movq_m2r (*(table+12), mm7); /* mm7 = table[12..15] */
00233 pmaddwd_r2r (mm5, mm1); /* mm1 = madd(low half of second quad, table[8..11]) */
00234
00235 paddd_m2r (*rounder, mm3); /* add the rounding bias to the first a-term pair */
00236 pmaddwd_r2r (mm6, mm7); /* mm7 = madd(high half of second quad, table[12..15]) */
00237
00238 pmaddwd_m2r (*(table+20), mm2); /* mm2 = madd(high half of first quad, table[20..23]) */
00239 paddd_r2r (mm4, mm3); /* mm3 = first a-term pair + bias */
00240
00241 pmaddwd_m2r (*(table+24), mm5); /* mm5 = madd(low half of second quad, table[24..27]) */
00242 movq_r2r (mm3, mm4); /* mm4 = copy of the first a-term pair */
00243
00244 pmaddwd_m2r (*(table+28), mm6); /* mm6 = madd(high half of second quad, table[28..31]) */
00245 paddd_r2r (mm7, mm1); /* mm1 = first b-term pair */
00246
00247 paddd_m2r (*rounder, mm0); /* add the rounding bias to the second a-term pair */
00248 psubd_r2r (mm1, mm3); /* mm3 = a - b (first pair) */
00249
00250 psrad_i2r (ROW_SHIFT, mm3); /* scale a - b down (arithmetic shift) */
00251 paddd_r2r (mm4, mm1); /* mm1 = a + b (first pair) */
00252
00253 paddd_r2r (mm2, mm0); /* mm0 = second a-term pair + bias */
00254 psrad_i2r (ROW_SHIFT, mm1); /* scale a + b down */
00255
00256 paddd_r2r (mm6, mm5); /* mm5 = second b-term pair */
00257 movq_r2r (mm0, mm7); /* mm7 = copy of the second a-term pair */
00258
00259 paddd_r2r (mm5, mm0); /* mm0 = a + b (second pair), shifted later */
00260 psubd_r2r (mm5, mm7); /* mm7 = a - b (second pair), shifted later */
00261 }
00262
/*
 * Finishes the plain-MMX row transform after mmx_row(): applies the
 * outstanding ROW_SHIFT to mm0 and mm7, packs the results to 16-bit words
 * with signed saturation and stores eight outputs at row[store..store+7].
 * The word swap that the mmxext variant does with pshufw is done here with
 * a shift-left / shift-right / or sequence.
 *
 * row   - base pointer of the coefficient block
 * store - element offset of the row to write
 */
00263 static inline void mmx_row_tail (int16_t * const row, const int store)
00264 {
00265 psrad_i2r (ROW_SHIFT, mm0); /* scale the second a + b pair */
00266
00267 psrad_i2r (ROW_SHIFT, mm7); /* scale the second a - b pair */
00268
00269 packssdw_r2r (mm0, mm1); /* mm1 = both a + b pairs as four 16-bit words */
00270
00271 packssdw_r2r (mm3, mm7); /* mm7 = both a - b pairs as four 16-bit words */
00272
00273 movq_r2m (mm1, *(row+store)); /* write the first four outputs */
00274 movq_r2r (mm7, mm4); /* mm4 = copy for the word swap */
00275
00276 pslld_i2r (16, mm7); /* low word of each 32-bit half moves to the high word */
00277
00278 psrld_i2r (16, mm4); /* high word of each 32-bit half moves to the low word */
00279
00280 por_r2r (mm4, mm7); /* mm7 = words swapped inside each 32-bit half */
00281
00282
00283
00284 movq_r2m (mm7, *(row+store+4)); /* write the last four outputs */
00285 }
00286
/*
 * Pipelined step between two rows for the plain-MMX path: performs the
 * work of mmx_row_tail() for the row just computed (written to row+store)
 * interleaved with the work of mmx_row_head() for the next row (read from
 * row+offset). mm1 is reused as the temporary for the word swap once its
 * packed result has been stored, then reloaded with table[8..11].
 * The next row is loaded before the previous one is stored, so store and
 * offset must refer to different rows.
 *
 * row    - base pointer of the coefficient block
 * store  - element offset of the finished row to write
 * offset - element offset of the next row to load
 * table  - mmx_table() coefficients for the next row
 */
00287 static inline void mmx_row_mid (int16_t * const row, const int store,
00288 const int offset, const int16_t * const table)
00289 {
00290 movq_m2r (*(row+offset), mm2); /* next: mm2 = first four words */
00291 psrad_i2r (ROW_SHIFT, mm0); /* prev: scale the second a + b pair */
00292
00293 movq_m2r (*(row+offset+4), mm5); /* next: mm5 = last four words */
00294 psrad_i2r (ROW_SHIFT, mm7); /* prev: scale the second a - b pair */
00295
00296 packssdw_r2r (mm0, mm1); /* prev: pack the a + b results to 16 bits */
00297 movq_r2r (mm5, mm6); /* next: mm6 = copy of mm5 */
00298
00299 packssdw_r2r (mm3, mm7); /* prev: pack the a - b results to 16 bits */
00300 movq_r2r (mm2, mm0); /* next: mm0 = copy of mm2 */
00301
00302 movq_r2m (mm1, *(row+store)); /* prev: write the first four outputs */
00303 movq_r2r (mm7, mm1); /* prev: mm1 = copy for the word swap */
00304
00305 punpckldq_r2r (mm0, mm0); /* next: mm0 = its low 32 bits in both halves */
00306 psrld_i2r (16, mm7); /* prev: high words move to the low position */
00307
00308 movq_m2r (*table, mm3); /* next: mm3 = table[0..3] */
00309 pslld_i2r (16, mm1); /* prev: low words move to the high position */
00310
00311 movq_m2r (*(table+4), mm4); /* next: mm4 = table[4..7] */
00312 por_r2r (mm1, mm7); /* prev: mm7 = words swapped inside each 32-bit half */
00313
00314 movq_m2r (*(table+8), mm1); /* next: mm1 = table[8..11] */
00315 punpckhdq_r2r (mm2, mm2); /* next: mm2 = its high 32 bits in both halves */
00316
00317 movq_r2m (mm7, *(row+store+4)); /* prev: write the last four outputs */
00318 pmaddwd_r2r (mm0, mm3); /* next: first multiply-add */
00319 }
00320
00321
/*
 * Disabled (#if 0) scalar C version of the 8-point column transform,
 * working on one column with a stride of 8 elements, in place.
 * It is never compiled; presumably it documents the arithmetic that the MMX
 * idct_col below performs on four columns at once - confirm.
 *
 * NOTE(review): this block cannot simply be enabled. It uses T1, T2, T3 and
 * C4, which are only #defined inside the MMX idct_col further down, and it
 * has the same name as that function.
 *
 * col    - base pointer of the coefficient block
 * offset - element offset of the column to transform
 */
00322 #if 0
00323
00324 static inline void idct_col (int16_t * col, int offset)
00325 {
00326 /* F: multiply by a constant and keep the product shifted right by 16 */
00327 #define F(c,x) (((c) * (x)) >> 16)
00328
00329 /* S: clamp to the signed 16-bit range */
00330 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
00331
00332 int16_t x0, x1, x2, x3, x4, x5, x6, x7;
00333 int16_t y0, y1, y2, y3, y4, y5, y6, y7;
00334 int16_t a0, a1, a2, a3, b0, b1, b2, b3;
00335 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
00336
00337 col += offset;
00338 /* read the eight rows of this column */
00339 x0 = col[0*8];
00340 x1 = col[1*8];
00341 x2 = col[2*8];
00342 x3 = col[3*8];
00343 x4 = col[4*8];
00344 x5 = col[5*8];
00345 x6 = col[6*8];
00346 x7 = col[7*8];
00347 /* even part: combine rows 0/4 and 2/6 */
00348 u04 = S (x0 + x4);
00349 v04 = S (x0 - x4);
00350 u26 = S (F (T2, x6) + x2);
00351 v26 = S (F (T2, x2) - x6);
00352
00353 a0 = S (u04 + u26);
00354 a1 = S (v04 + v26);
00355 a2 = S (v04 - v26);
00356 a3 = S (u04 - u26);
00357 /* odd part: combine rows 1/7 and 3/5 */
00358 u17 = S (F (T1, x7) + x1);
00359 v17 = S (F (T1, x1) - x7);
00360 u35 = S (F (T3, x5) + x3);
00361 v35 = S (F (T3, x3) - x5);
00362
00363 b0 = S (u17 + u35);
00364 b3 = S (v17 - v35);
00365 u12 = S (u17 - u35);
00366 v12 = S (v17 + v35);
00367 u12 = S (2 * F (C4, u12));
00368 v12 = S (2 * F (C4, v12));
00369 b1 = S (u12 + v12);
00370 b2 = S (u12 - v12);
00371 /* outputs 0..3 are a+b, outputs 7..4 are a-b, scaled down by COL_SHIFT */
00372 y0 = S (a0 + b0) >> COL_SHIFT;
00373 y1 = S (a1 + b1) >> COL_SHIFT;
00374 y2 = S (a2 + b2) >> COL_SHIFT;
00375 y3 = S (a3 + b3) >> COL_SHIFT;
00376
00377 y4 = S (a3 - b3) >> COL_SHIFT;
00378 y5 = S (a2 - b2) >> COL_SHIFT;
00379 y6 = S (a1 - b1) >> COL_SHIFT;
00380 y7 = S (a0 - b0) >> COL_SHIFT;
00381
00382 col[0*8] = y0;
00383 col[1*8] = y1;
00384 col[2*8] = y2;
00385 col[3*8] = y3;
00386 col[4*8] = y4;
00387 col[5*8] = y5;
00388 col[6*8] = y6;
00389 col[7*8] = y7;
00390 }
00391 #endif
00392
00393
00394
/*
 * MMX column transform, in place, on four adjacent columns at once: every
 * movq moves four 16-bit values of one row, rows are 8 elements apart.
 * declare_idct calls it with offset 0 and offset 4 to cover all 8 columns.
 *
 * The names in the per-line comments (x0..x7, u/v terms, a0..a3, b0..b3,
 * y0..y7, F()) follow the disabled scalar idct_col above. All adds and
 * subtracts use the saturating forms (paddsw/psubsw).
 * Differences from the scalar version visible here: b1 and b2 are formed by
 * scaling (u12+v12) and (u12-v12) by C4 rather than scaling u12 and v12
 * separately, and the row 3 and row 5 slots of the block are used as
 * scratch storage for b3 and b0 before being overwritten with outputs.
 *
 * NOTE(review): comments assume the mmx.h macros wrap the same-named MMX
 * instructions with (source, destination) operands - confirm against mmx.h.
 *
 * col    - base pointer of the coefficient block
 * offset - element offset of the first of the four columns
 */
00395 static inline void idct_col (int16_t * const col, const int offset)
00396 {
00397 #define T1 13036
00398 #define T2 27146
00399 #define T3 43790
00400 #define C4 23170
00401
/* Each constant is replicated four times so one pmulhw scales four columns.
 * T3 (43790) does not fit in a signed 16-bit short; on the usual
 * two's-complement targets it is stored as T3 - 65536, and the code below
 * compensates by adding the multiplicand back after the multiply. */
00402 static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
00403 static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
00404 static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
00405 static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
00406
00407
00408
00409
00410 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */
00411
00412 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */
00413 movq_r2r (mm0, mm2); /* mm2 = T1 */
00414
00415 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */
00416 pmulhw_r2r (mm1, mm0); /* mm0 = F(T1,x1) */
00417
00418 movq_m2r (*t3_vector, mm5); /* mm5 = T3 as stored (wrapped) */
00419 pmulhw_r2r (mm4, mm2); /* mm2 = F(T1,x7) */
00420
00421 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */
00422 movq_r2r (mm5, mm7); /* mm7 = copy of the T3 vector */
00423
00424 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */
00425 psubsw_r2r (mm4, mm0); /* mm0 = v17 */
00426
00427 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */
00428 pmulhw_r2r (mm3, mm5); /* mm5 = high product of wrapped T3 and x3 */
00429
00430 paddsw_r2r (mm2, mm1); /* mm1 = u17 */
00431 pmulhw_r2r (mm6, mm7); /* mm7 = high product of wrapped T3 and x5 */
00432
00433
00434
00435 movq_r2r (mm4, mm2); /* mm2 = T2 */
00436 paddsw_r2r (mm3, mm5); /* mm5 = F(T3,x3): add x3 back for the wrapped constant */
00437
00438 pmulhw_m2r (*(col+offset+2*8), mm4); /* mm4 = F(T2,x2) */
00439 paddsw_r2r (mm6, mm7); /* mm7 = F(T3,x5): add x5 back for the wrapped constant */
00440
00441 psubsw_r2r (mm6, mm5); /* mm5 = v35 */
00442 paddsw_r2r (mm3, mm7); /* mm7 = u35 */
00443
00444 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */
00445 movq_r2r (mm0, mm6); /* mm6 = v17 */
00446
00447 pmulhw_r2r (mm3, mm2); /* mm2 = F(T2,x6) */
00448 psubsw_r2r (mm5, mm0); /* mm0 = b3 */
00449
00450 psubsw_r2r (mm3, mm4); /* mm4 = v26 */
00451 paddsw_r2r (mm6, mm5); /* mm5 = v17 + v35 */
00452
00453 movq_r2m (mm0, *(col+offset+3*8)); /* spill b3 into the row 3 slot (x3 already consumed) */
00454 movq_r2r (mm1, mm6); /* mm6 = u17 */
00455
00456 paddsw_m2r (*(col+offset+2*8), mm2); /* mm2 = u26 */
00457 paddsw_r2r (mm7, mm6); /* mm6 = b0 */
00458
00459 psubsw_r2r (mm7, mm1); /* mm1 = u17 - u35 */
00460 movq_r2r (mm1, mm7); /* mm7 = u17 - u35 */
00461
00462 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */
00463 paddsw_r2r (mm5, mm1); /* mm1 = (u17-u35) + (v17+v35) */
00464
00465 movq_m2r (*c4_vector, mm0); /* mm0 = C4 */
00466 psubsw_r2r (mm5, mm7); /* mm7 = (u17-u35) - (v17+v35) */
00467
00468 movq_r2m (mm6, *(col+offset+5*8)); /* spill b0 into the row 5 slot (x5 already consumed) */
00469 pmulhw_r2r (mm0, mm1); /* mm1 = b1 / 2 */
00470
00471 movq_r2r (mm4, mm6); /* mm6 = v26 */
00472 pmulhw_r2r (mm0, mm7); /* mm7 = b2 / 2 */
00473
00474 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */
00475 movq_r2r (mm3, mm0); /* mm0 = x0 */
00476
00477 psubsw_r2r (mm5, mm3); /* mm3 = v04 */
00478 paddsw_r2r (mm5, mm0); /* mm0 = u04 */
00479
00480 paddsw_r2r (mm3, mm4); /* mm4 = a1 */
00481 movq_r2r (mm0, mm5); /* mm5 = u04 */
00482
00483 psubsw_r2r (mm6, mm3); /* mm3 = a2 */
00484 paddsw_r2r (mm2, mm5); /* mm5 = a0 */
00485
00486 paddsw_r2r (mm1, mm1); /* mm1 = b1 (doubling completes the C4 scaling) */
00487 psubsw_r2r (mm2, mm0); /* mm0 = a3 */
00488
00489 paddsw_r2r (mm7, mm7); /* mm7 = b2 (doubling completes the C4 scaling) */
00490 movq_r2r (mm3, mm2); /* mm2 = a2 */
00491
00492 movq_r2r (mm4, mm6); /* mm6 = a1 */
00493 paddsw_r2r (mm7, mm3); /* mm3 = a2 + b2 */
00494
00495 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */
00496 paddsw_r2r (mm1, mm4); /* mm4 = a1 + b1 */
00497
00498 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */
00499 psubsw_r2r (mm1, mm6); /* mm6 = a1 - b1 */
00500
00501 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0, reloaded from the row 5 slot */
00502 psubsw_r2r (mm7, mm2); /* mm2 = a2 - b2 */
00503
00504 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */
00505 movq_r2r (mm5, mm7); /* mm7 = a0 */
00506
00507 movq_r2m (mm4, *(col+offset+1*8)); /* store y1 */
00508 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */
00509
00510 movq_r2m (mm3, *(col+offset+2*8)); /* store y2 */
00511 paddsw_r2r (mm1, mm5); /* mm5 = a0 + b0 */
00512
00513 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3, reloaded from the row 3 slot */
00514 psubsw_r2r (mm1, mm7); /* mm7 = a0 - b0 */
00515
00516 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */
00517 movq_r2r (mm0, mm3); /* mm3 = a3 */
00518
00519 movq_r2m (mm2, *(col+offset+5*8)); /* store y5 (b0 no longer needed in memory) */
00520 psubsw_r2r (mm4, mm3); /* mm3 = a3 - b3 */
00521
00522 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */
00523 paddsw_r2r (mm0, mm4); /* mm4 = a3 + b3 */
00524
00525 movq_r2m (mm5, *(col+offset+0*8)); /* store y0 */
00526 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */
00527
00528 movq_r2m (mm6, *(col+offset+6*8)); /* store y6 */
00529 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */
00530
00531 movq_r2m (mm7, *(col+offset+7*8)); /* store y7 */
00532
00533 movq_r2m (mm3, *(col+offset+4*8)); /* store y4 */
00534
00535 movq_r2m (mm4, *(col+offset+3*8)); /* store y3 */
00536
00537 #undef T1
00538 #undef T2
00539 #undef T3
00540 #undef C4
00541 }
00542
00543
/*
 * Rounding biases for the row pass, one table per input row; declare_idct
 * passes rounderN to the row routine when it processes row N. Each table
 * holds the same 32-bit value twice (see the rounder macro), matching the
 * two 32-bit lanes it is added to.
 * rounder0 additionally contains the term 1 << (COL_SHIFT - 1), presumably
 * so that the column pass needs no separate rounding add - confirm.
 * NOTE(review): the derivation of the fractional constants is not visible
 * in this file.
 */
00544 static const int32_t rounder0[] ATTR_ALIGN(8) =
00545 rounder ((1 << (COL_SHIFT - 1)) - 0.5);
00546 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
00547 static const int32_t rounder1[] ATTR_ALIGN(8) =
00548 rounder (1.25683487303);
00549 static const int32_t rounder7[] ATTR_ALIGN(8) =
00550 rounder (-0.25);
00551 static const int32_t rounder2[] ATTR_ALIGN(8) =
00552 rounder (0.60355339059);
00553 static const int32_t rounder6[] ATTR_ALIGN(8) =
00554 rounder (-0.25);
00555 static const int32_t rounder3[] ATTR_ALIGN(8) =
00556 rounder (0.087788325588);
00557 static const int32_t rounder5[] ATTR_ALIGN(8) =
00558 rounder (-0.441341716183);
00559
/* The shift macros are not needed past this point: every use above has
 * already been expanded, and declare_idct below does not reference them. */
00560 #undef COL_SHIFT
00561 #undef ROW_SHIFT
00562
/*
 * Generates one complete 8x8 IDCT entry point named idct that transforms
 * block (64 int16_t values, rows of 8) in place.
 *
 * idct          - name of the function to define
 * table         - table-building macro (mmxext_table or mmx_table)
 * idct_row_head - routine that starts the first row
 * idct_row      - routine that does the row arithmetic
 * idct_row_tail - routine that stores the last row
 * idct_row_mid  - routine that stores one row while starting the next
 *
 * Row pass: rows are processed in the order 0, 4, 1, 7, 2, 6, 3, 5 as a
 * software pipeline (head, then row/mid repeated, then row/tail). Rows are
 * paired per coefficient table: 0 and 4 use table04, 1 and 7 table17,
 * 2 and 6 table26, 3 and 5 table35; each row gets its own rounder.
 * Column pass: idct_col is run twice, for columns 0..3 and 4..7.
 *
 * NOTE(review): no emms is issued here; presumably the caller is
 * responsible for restoring the FPU state after MMX use - confirm.
 */
00563 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
00564 void idct (int16_t * const block) \
00565 { \
00566 static const int16_t table04[] ATTR_ALIGN(16) = \
00567 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
00568 static const int16_t table17[] ATTR_ALIGN(16) = \
00569 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
00570 static const int16_t table26[] ATTR_ALIGN(16) = \
00571 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
00572 static const int16_t table35[] ATTR_ALIGN(16) = \
00573 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
00574 \
00575 idct_row_head (block, 0*8, table04); \
00576 idct_row (table04, rounder0); \
00577 idct_row_mid (block, 0*8, 4*8, table04); \
00578 idct_row (table04, rounder4); \
00579 idct_row_mid (block, 4*8, 1*8, table17); \
00580 idct_row (table17, rounder1); \
00581 idct_row_mid (block, 1*8, 7*8, table17); \
00582 idct_row (table17, rounder7); \
00583 idct_row_mid (block, 7*8, 2*8, table26); \
00584 idct_row (table26, rounder2); \
00585 idct_row_mid (block, 2*8, 6*8, table26); \
00586 idct_row (table26, rounder6); \
00587 idct_row_mid (block, 6*8, 3*8, table35); \
00588 idct_row (table35, rounder3); \
00589 idct_row_mid (block, 3*8, 5*8, table35); \
00590 idct_row (table35, rounder5); \
00591 idct_row_tail (block, 5*8); \
00592 \
00593 idct_col (block, 0); \
00594 idct_col (block, 4); \
00595 }
00596
/* Prototypes for the two public entry points defined just below.
 * NOTE(review): DCTELEM is declared elsewhere; the definitions generated by
 * declare_idct take int16_t *, so DCTELEM must be int16_t for these to
 * agree - confirm. */
00597 void ff_mmx_idct(DCTELEM *block);
00598 void ff_mmxext_idct(DCTELEM *block);
00599
/* MMXEXT variant: built from the pshufw-based row routines and table layout. */
00600 declare_idct (ff_mmxext_idct, mmxext_table,
00601 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
00602
/* Plain MMX variant: built from the punpck/shift-based row routines and
 * table layout. */
00603 declare_idct (ff_mmx_idct, mmx_table,
00604 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
00605