00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026 
/*
 * 36-point IMDCT kernels defined outside this file (they are only referenced
 * when HAVE_YASM is set, so presumably they live in external assembly).
 * One variant per instruction-set level; all share the same signature.
 */
void ff_imdct36_float_sse  (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse2 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse3 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_avx  (float *out, float *buf, float *in, float *win);

/*
 * Variants handling four blocks per call; they additionally take a scratch
 * buffer supplied by the caller.
 */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in,
                               float *win, float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in,
                               float *win, float *tmpbuf);
00036 
00037 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00038 
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op (MACS or MLSS) to sum for the eight products
 * w[k * 64] * p[k * 64], k = 0..7.
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement and
 * can be used safely as the body of an if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)
00053 
00054 static void apply_window(const float *buf, const float *win1,
00055                          const float *win2, float *sum1, float *sum2, int len)
00056 {
00057     x86_reg count = - 4*len;
00058     const float *win1a = win1+len;
00059     const float *win2a = win2+len;
00060     const float *bufa  = buf+len;
00061     float *sum1a = sum1+len;
00062     float *sum2a = sum2+len;
00063 
00064 
00065 #define MULT(a, b)                                 \
00066     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
00067     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
00068     "mulps         %%xmm2, %%xmm1           \n\t"  \
00069     "subps         %%xmm1, %%xmm0           \n\t"  \
00070     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
00071     "subps         %%xmm2, %%xmm4           \n\t"  \
00072 
00073     __asm__ volatile(
00074             "1:                                   \n\t"
00075             "xorps       %%xmm0, %%xmm0           \n\t"
00076             "xorps       %%xmm4, %%xmm4           \n\t"
00077 
00078             MULT(   0,   0)
00079             MULT( 256,  64)
00080             MULT( 512, 128)
00081             MULT( 768, 192)
00082             MULT(1024, 256)
00083             MULT(1280, 320)
00084             MULT(1536, 384)
00085             MULT(1792, 448)
00086 
00087             "movaps      %%xmm0, (%4,%0)          \n\t"
00088             "movaps      %%xmm4, (%5,%0)          \n\t"
00089             "add            $16,  %0              \n\t"
00090             "jl              1b                   \n\t"
00091             :"+&r"(count)
00092             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00093             );
00094 
00095 #undef MULT
00096 }
00097 
00098 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00099                              int incr)
00100 {
00101     LOCAL_ALIGNED_16(float, suma, [17]);
00102     LOCAL_ALIGNED_16(float, sumb, [17]);
00103     LOCAL_ALIGNED_16(float, sumc, [17]);
00104     LOCAL_ALIGNED_16(float, sumd, [17]);
00105 
00106     float sum;
00107 
00108     
00109     memcpy(in + 512, in, 32 * sizeof(*in));
00110 
00111     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
00112     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00113 
00114     SUM8(MACS, suma[0], win + 32, in + 48);
00115 
00116     sumc[ 0] = 0;
00117     sumb[16] = 0;
00118     sumd[16] = 0;
00119 
00120 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
00121             "movups " #sumd "(%4),       %%xmm0          \n\t" \
00122             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00123             "subps  " #suma "(%1),       %%xmm0          \n\t" \
00124             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
00125 \
00126             "movups " #sumc "(%3),       %%xmm0          \n\t" \
00127             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00128             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
00129             "movaps        %%xmm0," #out2 "(%0)          \n\t"
00130 
00131     if (incr == 1) {
00132         __asm__ volatile(
00133             SUMS( 0, 48,  4, 52,  0, 112)
00134             SUMS(16, 32, 20, 36, 16,  96)
00135             SUMS(32, 16, 36, 20, 32,  80)
00136             SUMS(48,  0, 52,  4, 48,  64)
00137 
00138             :"+&r"(out)
00139             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00140             :"memory"
00141             );
00142         out += 16*incr;
00143     } else {
00144         int j;
00145         float *out2 = out + 32 * incr;
00146         out[0  ]  = -suma[   0];
00147         out += incr;
00148         out2 -= incr;
00149         for(j=1;j<16;j++) {
00150             *out  = -suma[   j] + sumd[16-j];
00151             *out2 =  sumb[16-j] + sumc[   j];
00152             out  += incr;
00153             out2 -= incr;
00154         }
00155     }
00156 
00157     sum = 0;
00158     SUM8(MLSS, sum, win + 16 + 32, in + 32);
00159     *out = sum;
00160 }
00161 
00162 
/*
 * DECL_IMDCT_BLOCKS(CPU1, CPU2) defines imdct36_blocks_<CPU1>(), which runs
 * the 36-point IMDCT over count consecutive blocks:
 *   - whole groups of four blocks go through ff_four_imdct36_float_<CPU2>()
 *     using the interleaved windows in mdct_win_sse;
 *   - the remaining (at most three) blocks go through
 *     ff_imdct36_float_<CPU1>() one at a time.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    const int quad_end = count - (count & 3);                           \
    int blk = 0;                                                        \
                                                                        \
    /* groups of four blocks */                                         \
    while (blk < quad_end) {                                            \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
        /* the first group uses the mixed table when switch_point set */ \
        float *win = mdct_win_sse[switch_point && blk < 4][block_type]; \
                                                                        \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
                                                                        \
        in  += 4 * 18;                                                  \
        buf += 4 * 18;                                                  \
        out += 4;                                                       \
        blk += 4;                                                       \
    }                                                                   \
                                                                        \
    /* leftover blocks, one per call */                                 \
    for (; blk < count; blk++) {                                        \
        /* window 0 for the first two blocks when switch_point is set */ \
        int win_idx = (switch_point && blk < 2) ? 0 : block_type;       \
                                                                        \
        /* odd-numbered blocks use the second half of the window set */ \
        if (blk & 1)                                                    \
            win_idx += 4;                                               \
                                                                        \
        ff_imdct36_float_ ## CPU1(out, buf, in,                         \
                                  ff_mdct_win_float[win_idx]);          \
                                                                        \
        in += 18;                                                       \
        buf++;                                                          \
        out++;                                                          \
    }                                                                   \
}

#if HAVE_YASM
#if HAVE_SSE
/* every SSE-family variant shares the SSE four-block routine */
DECL_IMDCT_BLOCKS(sse,   sse)
DECL_IMDCT_BLOCKS(sse2,  sse)
DECL_IMDCT_BLOCKS(sse3,  sse)
DECL_IMDCT_BLOCKS(ssse3, sse)
#endif /* HAVE_SSE */
#if HAVE_AVX
DECL_IMDCT_BLOCKS(avx, avx)
#endif /* HAVE_AVX */
#endif /* HAVE_YASM */
00206 
00207 void ff_mpadsp_init_mmx(MPADSPContext *s)
00208 {
00209     int mm_flags = av_get_cpu_flags();
00210 
00211     int i, j;
00212     for (j = 0; j < 4; j++) {
00213         for (i = 0; i < 40; i ++) {
00214             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
00215             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00216             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00217             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00218             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
00219             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
00220             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00221             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00222         }
00223     }
00224 
00225     if (mm_flags & AV_CPU_FLAG_SSE2) {
00226         s->apply_window_float = apply_window_mp3;
00227     }
00228 #if HAVE_YASM
00229     if (0) {
00230 #if HAVE_AVX
00231     } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
00232         s->imdct36_blocks_float = imdct36_blocks_avx;
00233 #endif
00234 #if HAVE_SSE
00235     } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
00236         s->imdct36_blocks_float = imdct36_blocks_ssse3;
00237     } else if (mm_flags & AV_CPU_FLAG_SSE3) {
00238         s->imdct36_blocks_float = imdct36_blocks_sse3;
00239     } else if (mm_flags & AV_CPU_FLAG_SSE2) {
00240         s->imdct36_blocks_float = imdct36_blocks_sse2;
00241     } else if (mm_flags & AV_CPU_FLAG_SSE) {
00242         s->imdct36_blocks_float = imdct36_blocks_sse;
00243 #endif 
00244     }
00245 #endif 
00246 }