Libav 0.7.1
|
00001 /* 00002 * FFT/MDCT transform with SSE optimizations 00003 * Copyright (c) 2008 Loren Merritt 00004 * 00005 * This file is part of Libav. 00006 * 00007 * Libav is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * Libav is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with Libav; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 */ 00021 00022 #include "libavutil/x86_cpu.h" 00023 #include "libavcodec/dsputil.h" 00024 #include "fft.h" 00025 #include "config.h" 00026 00027 DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] = 00028 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; 00029 00030 void ff_fft_dispatch_sse(FFTComplex *z, int nbits); 00031 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); 00032 void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits); 00033 00034 #if HAVE_AVX 00035 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z) 00036 { 00037 ff_fft_dispatch_interleave_avx(z, s->nbits); 00038 } 00039 #endif 00040 00041 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) 00042 { 00043 int n = 1 << s->nbits; 00044 00045 ff_fft_dispatch_interleave_sse(z, s->nbits); 00046 00047 if(n <= 16) { 00048 x86_reg i = -8*n; 00049 __asm__ volatile( 00050 "1: \n" 00051 "movaps (%0,%1), %%xmm0 \n" 00052 "movaps %%xmm0, %%xmm1 \n" 00053 "unpcklps 16(%0,%1), %%xmm0 \n" 00054 "unpckhps 16(%0,%1), %%xmm1 \n" 00055 "movaps %%xmm0, (%0,%1) \n" 00056 "movaps %%xmm1, 16(%0,%1) \n" 00057 "add $32, %0 \n" 00058 "jl 1b \n" 00059 :"+r"(i) 00060 :"r"(z+n) 00061 :"memory" 00062 ); 00063 } 00064 } 00065 00066 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z) 00067 { 00068 int n = 1 << s->nbits; 00069 int i; 00070 for(i=0; i<n; i+=2) { 00071 __asm__ volatile( 00072 "movaps %2, %%xmm0 \n" 00073 "movlps %%xmm0, %0 \n" 00074 "movhps %%xmm0, %1 \n" 00075 :"=m"(s->tmp_buf[s->revtab[i]]), 00076 "=m"(s->tmp_buf[s->revtab[i+1]]) 00077 :"m"(z[i]) 00078 ); 00079 } 00080 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); 00081 } 00082 00083 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) 00084 { 00085 x86_reg j, k; 00086 long n = s->mdct_size; 00087 long n4 = n >> 2; 00088 00089 s->imdct_half(s, output + n4, input); 00090 00091 j = -n; 00092 k = n-16; 00093 __asm__ volatile( 00094 "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n" 00095 "1: \n" 00096 "movaps (%2,%1), %%xmm0 \n" 00097 "movaps (%3,%0), %%xmm1 \n" 00098 "shufps $0x1b, %%xmm0, %%xmm0 \n" 00099 "shufps $0x1b, %%xmm1, %%xmm1 \n" 00100 "xorps %%xmm7, %%xmm0 \n" 00101 "movaps %%xmm1, (%3,%1) \n" 00102 "movaps %%xmm0, (%2,%0) \n" 00103 "sub $16, %1 \n" 00104 "add $16, %0 \n" 00105 "jl 1b \n" 00106 :"+r"(j), "+r"(k) 00107 :"r"(output+n4), "r"(output+n4*3) 00108 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7") 00109 ); 00110 } 00111