00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024
00025 DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 };
00026
00027 #ifdef EMULATE_3DNOWEXT
00028 #define PSWAPD(s,d)\
00029 "movq "#s","#d"\n"\
00030 "psrlq $32,"#d"\n"\
00031 "punpckldq "#s","#d"\n"
00032 #define ff_fft_calc_3dn2 ff_fft_calc_3dn
00033 #define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
00034 #define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
00035 #define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
00036 #define ff_imdct_half_3dn2 ff_imdct_half_3dn
00037 #else
00038 #define PSWAPD(s,d) "pswapd "#s","#d"\n"
00039 #endif
00040
00041 void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
00042 void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
00043
00044 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
00045 {
00046 int n = 1<<s->nbits;
00047 int i;
00048 ff_fft_dispatch_interleave_3dn2(z, s->nbits);
00049 __asm__ volatile("femms");
00050 if(n <= 8)
00051 for(i=0; i<n; i+=2)
00052 FFSWAP(FFTSample, z[i].im, z[i+1].re);
00053 }
00054
00055 void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
00056 {
00057 x86_reg j, k;
00058 long n = 1 << s->nbits;
00059 long n2 = n >> 1;
00060 long n4 = n >> 2;
00061 long n8 = n >> 3;
00062 const uint16_t *revtab = s->fft.revtab;
00063 const FFTSample *tcos = s->tcos;
00064 const FFTSample *tsin = s->tsin;
00065 const FFTSample *in1, *in2;
00066 FFTComplex *z = (FFTComplex *)output;
00067
00068
00069 in1 = input;
00070 in2 = input + n2 - 1;
00071 #ifdef EMULATE_3DNOWEXT
00072 __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31));
00073 #endif
00074 for(k = 0; k < n4; k++) {
00075
00076 __asm__ volatile(
00077 "movd %0, %%mm0 \n"
00078 "movd %2, %%mm1 \n"
00079 "punpckldq %1, %%mm0 \n"
00080 "punpckldq %3, %%mm1 \n"
00081 "movq %%mm0, %%mm2 \n"
00082 PSWAPD( %%mm1, %%mm3 )
00083 "pfmul %%mm1, %%mm0 \n"
00084 "pfmul %%mm3, %%mm2 \n"
00085 #ifdef EMULATE_3DNOWEXT
00086 "movq %%mm0, %%mm1 \n"
00087 "punpckhdq %%mm2, %%mm0 \n"
00088 "punpckldq %%mm2, %%mm1 \n"
00089 "pxor %%mm7, %%mm0 \n"
00090 "pfadd %%mm1, %%mm0 \n"
00091 #else
00092 "pfpnacc %%mm2, %%mm0 \n"
00093 #endif
00094 ::"m"(in2[-2*k]), "m"(in1[2*k]),
00095 "m"(tcos[k]), "m"(tsin[k])
00096 );
00097 __asm__ volatile(
00098 "movq %%mm0, %0 \n\t"
00099 :"=m"(z[revtab[k]])
00100 );
00101 }
00102
00103 ff_fft_dispatch_3dn2(z, s->fft.nbits);
00104
00105 #define CMUL(j,mm0,mm1)\
00106 "movq (%2,"#j",2), %%mm6 \n"\
00107 "movq 8(%2,"#j",2), "#mm0"\n"\
00108 "movq %%mm6, "#mm1"\n"\
00109 "movq "#mm0",%%mm7 \n"\
00110 "pfmul (%3,"#j"), %%mm6 \n"\
00111 "pfmul (%4,"#j"), "#mm0"\n"\
00112 "pfmul (%4,"#j"), "#mm1"\n"\
00113 "pfmul (%3,"#j"), %%mm7 \n"\
00114 "pfsub %%mm6, "#mm0"\n"\
00115 "pfadd %%mm7, "#mm1"\n"
00116
00117
00118 j = -n2;
00119 k = n2-8;
00120 __asm__ volatile(
00121 "1: \n"
00122 CMUL(%0, %%mm0, %%mm1)
00123 CMUL(%1, %%mm2, %%mm3)
00124 "movd %%mm0, (%2,%0,2) \n"
00125 "movd %%mm1,12(%2,%1,2) \n"
00126 "movd %%mm2, (%2,%1,2) \n"
00127 "movd %%mm3,12(%2,%0,2) \n"
00128 "psrlq $32, %%mm0 \n"
00129 "psrlq $32, %%mm1 \n"
00130 "psrlq $32, %%mm2 \n"
00131 "psrlq $32, %%mm3 \n"
00132 "movd %%mm0, 8(%2,%0,2) \n"
00133 "movd %%mm1, 4(%2,%1,2) \n"
00134 "movd %%mm2, 8(%2,%1,2) \n"
00135 "movd %%mm3, 4(%2,%0,2) \n"
00136 "sub $8, %1 \n"
00137 "add $8, %0 \n"
00138 "jl 1b \n"
00139 :"+r"(j), "+r"(k)
00140 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
00141 :"memory"
00142 );
00143 __asm__ volatile("femms");
00144 }
00145
00146 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
00147 {
00148 x86_reg j, k;
00149 long n = 1 << s->nbits;
00150 long n4 = n >> 2;
00151
00152 ff_imdct_half_3dn2(s, output+n4, input);
00153
00154 j = -n;
00155 k = n-8;
00156 __asm__ volatile(
00157 "movq %4, %%mm7 \n"
00158 "1: \n"
00159 PSWAPD((%2,%1), %%mm0)
00160 PSWAPD((%3,%0), %%mm1)
00161 "pxor %%mm7, %%mm0 \n"
00162 "movq %%mm1, (%3,%1) \n"
00163 "movq %%mm0, (%2,%0) \n"
00164 "sub $8, %1 \n"
00165 "add $8, %0 \n"
00166 "jl 1b \n"
00167 :"+r"(j), "+r"(k)
00168 :"r"(output+n4), "r"(output+n4*3),
00169 "m"(*m1m1)
00170 );
00171 __asm__ volatile("femms");
00172 }
00173