/* Libav 0.7.1 */
00001 /* 00002 * idct for sh4 00003 * 00004 * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp> 00005 * 00006 * This file is part of Libav. 00007 * 00008 * Libav is free software; you can redistribute it and/or 00009 * modify it under the terms of the GNU Lesser General Public 00010 * License as published by the Free Software Foundation; either 00011 * version 2.1 of the License, or (at your option) any later version. 00012 * 00013 * Libav is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 * Lesser General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU Lesser General Public 00019 * License along with Libav; if not, write to the Free Software 00020 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00021 */ 00022 00023 #include "libavcodec/dsputil.h" 00024 #include "dsputil_sh4.h" 00025 #include "sh4.h" 00026 00027 #define c1 1.38703984532214752434 /* sqrt(2)*cos(1*pi/16) */ 00028 #define c2 1.30656296487637657577 /* sqrt(2)*cos(2*pi/16) */ 00029 #define c3 1.17587560241935884520 /* sqrt(2)*cos(3*pi/16) */ 00030 #define c4 1.00000000000000000000 /* sqrt(2)*cos(4*pi/16) */ 00031 #define c5 0.78569495838710234903 /* sqrt(2)*cos(5*pi/16) */ 00032 #define c6 0.54119610014619712324 /* sqrt(2)*cos(6*pi/16) */ 00033 #define c7 0.27589937928294311353 /* sqrt(2)*cos(7*pi/16) */ 00034 00035 static const float even_table[] __attribute__ ((aligned(8))) = { 00036 c4, c4, c4, c4, 00037 c2, c6,-c6,-c2, 00038 c4,-c4,-c4, c4, 00039 c6,-c2, c2,-c6 00040 }; 00041 00042 static const float odd_table[] __attribute__ ((aligned(8))) = { 00043 c1, c3, c5, c7, 00044 c3,-c7,-c1,-c5, 00045 c5,-c1, c7, c3, 00046 c7,-c5, c3,-c1 00047 }; 00048 00049 #undef c1 00050 #undef c2 00051 #undef c3 00052 #undef c4 00053 #undef c5 00054 #undef c6 00055 #undef c7 00056 00057 #define 
load_matrix(table) \ 00058 do { \ 00059 const float *t = table; \ 00060 __asm__ volatile( \ 00061 " fschg\n" \ 00062 " fmov @%0+,xd0\n" \ 00063 " fmov @%0+,xd2\n" \ 00064 " fmov @%0+,xd4\n" \ 00065 " fmov @%0+,xd6\n" \ 00066 " fmov @%0+,xd8\n" \ 00067 " fmov @%0+,xd10\n" \ 00068 " fmov @%0+,xd12\n" \ 00069 " fmov @%0+,xd14\n" \ 00070 " fschg\n" \ 00071 : "+r"(t) \ 00072 ); \ 00073 } while (0) 00074 00075 #define ftrv() \ 00076 __asm__ volatile("ftrv xmtrx,fv0" \ 00077 : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3)); 00078 00079 #define DEFREG \ 00080 register float fr0 __asm__("fr0"); \ 00081 register float fr1 __asm__("fr1"); \ 00082 register float fr2 __asm__("fr2"); \ 00083 register float fr3 __asm__("fr3") 00084 00085 #define DESCALE(x,n) (x)*(1.0f/(1<<(n))) 00086 00087 /* this code work worse on gcc cvs. 3.2.3 work fine */ 00088 00089 00090 //optimized 00091 00092 void idct_sh4(DCTELEM *block) 00093 { 00094 DEFREG; 00095 00096 int i; 00097 float tblock[8*8],*fblock; 00098 int ofs1,ofs2,ofs3; 00099 int fpscr; 00100 00101 fp_single_enter(fpscr); 00102 00103 /* row */ 00104 00105 /* even part */ 00106 load_matrix(even_table); 00107 00108 fblock = tblock+4; 00109 i = 8; 00110 do { 00111 fr0 = block[0]; 00112 fr1 = block[2]; 00113 fr2 = block[4]; 00114 fr3 = block[6]; 00115 block+=8; 00116 ftrv(); 00117 *--fblock = fr3; 00118 *--fblock = fr2; 00119 *--fblock = fr1; 00120 *--fblock = fr0; 00121 fblock+=8+4; 00122 } while(--i); 00123 block-=8*8; 00124 fblock-=8*8+4; 00125 00126 load_matrix(odd_table); 00127 00128 i = 8; 00129 00130 do { 00131 float t0,t1,t2,t3; 00132 fr0 = block[1]; 00133 fr1 = block[3]; 00134 fr2 = block[5]; 00135 fr3 = block[7]; 00136 block+=8; 00137 ftrv(); 00138 t0 = *fblock++; 00139 t1 = *fblock++; 00140 t2 = *fblock++; 00141 t3 = *fblock++; 00142 fblock+=4; 00143 *--fblock = t0 - fr0; 00144 *--fblock = t1 - fr1; 00145 *--fblock = t2 - fr2; 00146 *--fblock = t3 - fr3; 00147 *--fblock = t3 + fr3; 00148 *--fblock = t2 + fr2; 00149 *--fblock = t1 + fr1; 
00150 *--fblock = t0 + fr0; 00151 fblock+=8; 00152 } while(--i); 00153 block-=8*8; 00154 fblock-=8*8; 00155 00156 /* col */ 00157 00158 /* even part */ 00159 load_matrix(even_table); 00160 00161 ofs1 = sizeof(float)*2*8; 00162 ofs2 = sizeof(float)*4*8; 00163 ofs3 = sizeof(float)*6*8; 00164 00165 i = 8; 00166 00167 #define OA(fblock,ofs) *(float*)((char*)fblock + ofs) 00168 00169 do { 00170 fr0 = OA(fblock, 0); 00171 fr1 = OA(fblock,ofs1); 00172 fr2 = OA(fblock,ofs2); 00173 fr3 = OA(fblock,ofs3); 00174 ftrv(); 00175 OA(fblock,0 ) = fr0; 00176 OA(fblock,ofs1) = fr1; 00177 OA(fblock,ofs2) = fr2; 00178 OA(fblock,ofs3) = fr3; 00179 fblock++; 00180 } while(--i); 00181 fblock-=8; 00182 00183 load_matrix(odd_table); 00184 00185 i=8; 00186 do { 00187 float t0,t1,t2,t3; 00188 t0 = OA(fblock, 0); /* [8*0] */ 00189 t1 = OA(fblock,ofs1); /* [8*2] */ 00190 t2 = OA(fblock,ofs2); /* [8*4] */ 00191 t3 = OA(fblock,ofs3); /* [8*6] */ 00192 fblock+=8; 00193 fr0 = OA(fblock, 0); /* [8*1] */ 00194 fr1 = OA(fblock,ofs1); /* [8*3] */ 00195 fr2 = OA(fblock,ofs2); /* [8*5] */ 00196 fr3 = OA(fblock,ofs3); /* [8*7] */ 00197 fblock+=-8+1; 00198 ftrv(); 00199 block[8*0] = DESCALE(t0 + fr0,3); 00200 block[8*7] = DESCALE(t0 - fr0,3); 00201 block[8*1] = DESCALE(t1 + fr1,3); 00202 block[8*6] = DESCALE(t1 - fr1,3); 00203 block[8*2] = DESCALE(t2 + fr2,3); 00204 block[8*5] = DESCALE(t2 - fr2,3); 00205 block[8*3] = DESCALE(t3 + fr3,3); 00206 block[8*4] = DESCALE(t3 - fr3,3); 00207 block++; 00208 } while(--i); 00209 00210 fp_single_leave(fpscr); 00211 }