00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024
00025 DECLARE_ASM_CONST(16, int, m1m1m1m1)[4] =
00026 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
00027
00028 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
00029 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
00030
/**
 * Compute an in-place FFT of z using the SSE split-radix assembly kernel.
 * z must be 16-byte aligned (all accesses below use movaps); the input
 * permutation is expected to have been applied beforehand (see
 * ff_fft_permute_sse).
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if(n <= 16) {
        /* NOTE(review): for small transforms the interleaving dispatcher
         * presumably leaves the data in split (re/im separated) order
         * because its 4-wide interleaving passes do not run below this
         * size — confirm against the asm dispatcher if it changes.
         * Fix up here: each iteration loads two adjacent 16-byte vectors
         * and merges their lanes pairwise with unpcklps/unpckhps. */
        x86_reg i = -8*n;   /* byte offset: -8*n from the end pointer (z+n) is z[0] */
        __asm__ volatile(
            "1: \n"
            "movaps     (%0,%1), %%xmm0 \n"
            "movaps      %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"            /* 32 bytes = 4 FFTComplex per iteration */
            "jl 1b \n"                  /* loop while the offset is still negative */
            :"+r"(i)
            :"r"(z+n)
            :"memory"
        );
    }
}
00055
/**
 * Apply the FFT input permutation (s->revtab) to z using SSE.
 * Two complex values (16 bytes) are loaded per iteration, so z must be
 * 16-byte aligned (movaps) and n must be even.  Because revtab scatters
 * destinations non-contiguously, the permuted data is first built in
 * s->tmp_buf and then copied back over z.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
    for(i=0; i<n; i+=2) {
        /* Load z[i..i+1]; store the low 8 bytes to tmp_buf[revtab[i]]
         * and the high 8 bytes to tmp_buf[revtab[i+1]]. */
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            :"m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
00072
/**
 * Compute the half inverse MDCT using SSE.
 * Writes n/2 = 2^(s->nbits - 1) output samples, treated below as n/4
 * FFTComplex values; reads n/2 coefficients from input.
 *
 * Classic IMDCT-via-FFT structure visible in the code:
 *   1. pre-rotation: complex multiply of input pairs by the twiddle
 *      factors (tcos/tsin), stored at bit-reversed positions (revtab);
 *   2. an n/4-point complex FFT (ff_fft_dispatch_sse);
 *   3. post-rotation by the same twiddles with mirrored reordering.
 *
 * output must be 16-byte aligned (movaps stores); input, tcos and tsin
 * are read with aligned 16-byte loads as well.
 */
void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
{
    av_unused x86_reg i, j, k, l;
    long n = 1 << s->nbits;
    long n2 = n >> 1;
    long n4 = n >> 2;
    long n8 = n >> 3;
    /* All table pointers are biased by n8 so a single signed index k
     * (and its negation) can address both halves symmetrically. */
    const uint16_t *revtab = s->fft.revtab + n8;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *z = (FFTComplex *)output;

    /* Pre-rotation: per iteration, gather two sample pairs — one read
     * forward from input[n4 + 2k] and one read backward from
     * input[n4 - 2k - 4] — deinterleave them with the shufps pair,
     * complex-multiply by tcos/tsin (the mulps/subps/addps group), and
     * re-pair the results with unpcklps/unpckhps for the scattered
     * stores below. */
    for(k=n8-2; k>=0; k-=2) {
        __asm__ volatile(
            "movaps      (%2,%1,2), %%xmm0 \n"
            "movaps   -16(%2,%0,2), %%xmm1 \n"
            "movaps        %%xmm0,  %%xmm2 \n"
            "shufps $0x88, %%xmm1,  %%xmm0 \n"
            "shufps $0x77, %%xmm2,  %%xmm1 \n"
            "movlps      (%3,%1),   %%xmm4 \n"
            "movlps      (%4,%1),   %%xmm5 \n"
            "movhps    -8(%3,%0),   %%xmm4 \n"
            "movhps    -8(%4,%0),   %%xmm5 \n"
            "movaps        %%xmm0,  %%xmm2 \n"
            "movaps        %%xmm1,  %%xmm3 \n"
            "mulps         %%xmm5,  %%xmm0 \n"
            "mulps         %%xmm4,  %%xmm1 \n"
            "mulps         %%xmm4,  %%xmm2 \n"
            "mulps         %%xmm5,  %%xmm3 \n"
            "subps         %%xmm0,  %%xmm1 \n"
            "addps         %%xmm3,  %%xmm2 \n"
            "movaps        %%xmm1,  %%xmm0 \n"
            "unpcklps      %%xmm2,  %%xmm1 \n"
            "unpckhps      %%xmm2,  %%xmm0 \n"
            ::"r"(-4*k), "r"(4*k),          /* byte indices: -4k and 4k */
              "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
        );
#if ARCH_X86_64
        /* All four bit-reversed stores in one asm; the registers from
         * the statement above are still live between the two asms —
         * this relies on nothing clobbering xmm0/xmm1 in between. */
        __asm__("movlps %%xmm0, %0 \n"
                "movhps %%xmm0, %1 \n"
                "movlps %%xmm1, %2 \n"
                "movhps %%xmm1, %3 \n"
                :"=m"(z[revtab[-k-2]]),
                 "=m"(z[revtab[-k-1]]),
                 "=m"(z[revtab[ k  ]]),
                 "=m"(z[revtab[ k+1]])
        );
#else
        /* x86-32: one store per asm — presumably there are not enough
         * free registers to address four "m" outputs at once. */
        __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));
        __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));
        __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k  ]]));
        __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));
#endif
    }

    /* n/4-point FFT over the pre-rotated, bit-reversed data. */
    ff_fft_dispatch_sse(z, s->fft.nbits);

    /* Complex multiply of 4 packed values at byte offset j (scaled x2)
     * into z, by the twiddles at offset j into tcos (%3) and tsin (%4);
     * results land in xmm0/xmm1, with xmm6/xmm7 as scratch. */
#define CMUL(j,xmm0,xmm1)\
        "movaps   (%2,"#j",2), %%xmm6 \n"\
        "movaps 16(%2,"#j",2), "#xmm0"\n"\
        "movaps        %%xmm6, "#xmm1"\n"\
        "movaps       "#xmm0", %%xmm7 \n"\
        "mulps      (%3,"#j"), %%xmm6 \n"\
        "mulps      (%4,"#j"), "#xmm0"\n"\
        "mulps      (%4,"#j"), "#xmm1"\n"\
        "mulps      (%3,"#j"), %%xmm7 \n"\
        "subps         %%xmm6, "#xmm0"\n"\
        "addps         %%xmm7, "#xmm1"\n"

    /* Post-rotation: walk j up from the start and k down from the end
     * of the half buffer simultaneously, rotate both 4-value groups,
     * reverse them (shufps $0x1b) and cross-interleave before storing,
     * so the two ends of the output are produced per iteration. */
    j = -n2;
    k = n2-16;
    __asm__ volatile(
        "1: \n"
        CMUL(%0, %%xmm0, %%xmm1)
        CMUL(%1, %%xmm4, %%xmm5)
        "shufps    $0x1b, %%xmm1, %%xmm1 \n"
        "shufps    $0x1b, %%xmm5, %%xmm5 \n"
        "movaps   %%xmm4, %%xmm6 \n"
        "unpckhps %%xmm1, %%xmm4 \n"
        "unpcklps %%xmm1, %%xmm6 \n"
        "movaps   %%xmm0, %%xmm2 \n"
        "unpcklps %%xmm5, %%xmm0 \n"
        "unpckhps %%xmm5, %%xmm2 \n"
        "movaps   %%xmm6,   (%2,%1,2) \n"
        "movaps   %%xmm4, 16(%2,%1,2) \n"
        "movaps   %%xmm0,   (%2,%0,2) \n"
        "movaps   %%xmm2, 16(%2,%0,2) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"              /* loop until j reaches 0 */
        :"+&r"(j), "+&r"(k)
        :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
        :"memory"
    );
}
00173
/**
 * Compute a full inverse MDCT (n = 2^nbits output samples) using SSE.
 *
 * ff_imdct_half_sse() produces the n/2 middle samples into
 * output[n4..3*n4-1]; the loop below expands them to the full window
 * using the IMDCT output symmetries:
 *   - first quarter  output[0..n4-1]    = negated, reversed copy of the
 *     second quarter (xorps with the m1m1m1m1 sign mask negates, and
 *     shufps $0x1b reverses the 4 floats in a vector);
 *   - fourth quarter output[3*n4..n-1]  = reversed copy of the third
 *     quarter (no negation).
 * output must be 16-byte aligned.
 */
void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = 1 << s->nbits;
    long n4 = n >> 2;

    ff_imdct_half_sse(s, output+n4, input);

    /* j and k are byte offsets: j counts up from -n (addressing the
     * third quarter via output+3*n4, and the first quarter via
     * output+n4), k counts down from n-16 (addressing the second and
     * fourth quarters).  16 bytes = 4 floats per iteration. */
    j = -n;
    k = n-16;
    __asm__ volatile(
        "movaps "MANGLE(m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"   /* second-quarter vector   */
        "movaps       (%3,%0), %%xmm1 \n"   /* third-quarter vector    */
        "shufps $0x1b, %%xmm0, %%xmm0 \n"   /* reverse 4 floats        */
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n"   /* negate (flip sign bits) */
        "movaps        %%xmm1, (%3,%1) \n"  /* -> fourth quarter       */
        "movaps        %%xmm0, (%2,%0) \n"  /* -> first quarter        */
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3)
    );
}
00201