Libav 0.7.1
libavcodec/dct-test.c
Go to the documentation of this file.
00001 /*
00002  * (c) 2001 Fabrice Bellard
00003  *     2007 Marc Hoffman <marc.hoffman@analog.com>
00004  *
00005  * This file is part of Libav.
00006  *
00007  * Libav is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * Libav is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with Libav; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034 
00035 #include "libavutil/cpu.h"
00036 #include "libavutil/common.h"
00037 #include "libavutil/lfg.h"
00038 
00039 #include "simple_idct.h"
00040 #include "aandcttab.h"
00041 #include "faandct.h"
00042 #include "faanidct.h"
00043 #include "x86/idct_xvid.h"
00044 #include "dctref.h"
00045 
00046 #undef printf
00047 
00048 void ff_mmx_idct(DCTELEM *data);
00049 void ff_mmxext_idct(DCTELEM *data);
00050 
00051 void odivx_idct_c(short *block);
00052 
00053 // BFIN
00054 void ff_bfin_idct(DCTELEM *block);
00055 void ff_bfin_fdct(DCTELEM *block);
00056 
00057 // ALTIVEC
00058 void fdct_altivec(DCTELEM *block);
00059 //void idct_altivec(DCTELEM *block);?? no routine
00060 
00061 // ARM
00062 void ff_j_rev_dct_arm(DCTELEM *data);
00063 void ff_simple_idct_arm(DCTELEM *data);
00064 void ff_simple_idct_armv5te(DCTELEM *data);
00065 void ff_simple_idct_armv6(DCTELEM *data);
00066 void ff_simple_idct_neon(DCTELEM *data);
00067 
00068 void ff_simple_idct_axp(DCTELEM *data);
00069 
00070 struct algo {
00071   const char *name;
00072   enum { FDCT, IDCT } is_idct;
00073   void (* func) (DCTELEM *block);
00074   void (* ref)  (DCTELEM *block);
00075   enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
00076   int  mm_support;
00077 };
00078 
00079 #ifndef FAAN_POSTSCALE
00080 #define FAAN_SCALE SCALE_PERM
00081 #else
00082 #define FAAN_SCALE NO_PERM
00083 #endif
00084 
00085 static int cpu_flags;
00086 
00087 struct algo algos[] = {
00088   {"REF-DBL",         0, ff_ref_fdct,        ff_ref_fdct, NO_PERM},
00089   {"FAAN",            0, ff_faandct,         ff_ref_fdct, FAAN_SCALE},
00090   {"FAANI",           1, ff_faanidct,        ff_ref_idct, NO_PERM},
00091   {"IJG-AAN-INT",     0, fdct_ifast,         ff_ref_fdct, SCALE_PERM},
00092   {"IJG-LLM-INT",     0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
00093   {"REF-DBL",         1, ff_ref_idct,        ff_ref_idct, NO_PERM},
00094   {"INT",             1, j_rev_dct,          ff_ref_idct, MMX_PERM},
00095   {"SIMPLE-C",        1, ff_simple_idct,     ff_ref_idct, NO_PERM},
00096 
00097 #if HAVE_MMX
00098   {"MMX",             0, ff_fdct_mmx,        ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
00099 #if HAVE_MMX2
00100   {"MMX2",            0, ff_fdct_mmx2,       ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
00101   {"SSE2",            0, ff_fdct_sse2,       ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
00102 #endif
00103 
00104 #if CONFIG_GPL
00105   {"LIBMPEG2-MMX",    1, ff_mmx_idct,        ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
00106   {"LIBMPEG2-MMX2",   1, ff_mmxext_idct,     ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
00107 #endif
00108   {"SIMPLE-MMX",      1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
00109   {"XVID-MMX",        1, ff_idct_xvid_mmx,   ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
00110   {"XVID-MMX2",       1, ff_idct_xvid_mmx2,  ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
00111   {"XVID-SSE2",       1, ff_idct_xvid_sse2,  ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
00112 #endif
00113 
00114 #if HAVE_ALTIVEC
00115   {"altivecfdct",     0, fdct_altivec,       ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
00116 #endif
00117 
00118 #if ARCH_BFIN
00119   {"BFINfdct",        0, ff_bfin_fdct,       ff_ref_fdct, NO_PERM},
00120   {"BFINidct",        1, ff_bfin_idct,       ff_ref_idct, NO_PERM},
00121 #endif
00122 
00123 #if ARCH_ARM
00124   {"SIMPLE-ARM",      1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
00125   {"INT-ARM",         1, ff_j_rev_dct_arm,   ff_ref_idct, MMX_PERM },
00126 #if HAVE_ARMV5TE
00127   {"SIMPLE-ARMV5TE",  1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
00128 #endif
00129 #if HAVE_ARMV6
00130   {"SIMPLE-ARMV6",    1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
00131 #endif
00132 #if HAVE_NEON
00133   {"SIMPLE-NEON",     1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
00134 #endif
00135 #endif /* ARCH_ARM */
00136 
00137 #if ARCH_ALPHA
00138   {"SIMPLE-ALPHA",    1, ff_simple_idct_axp,  ff_ref_idct, NO_PERM },
00139 #endif
00140 
00141   { 0 }
00142 };
00143 
00144 #define AANSCALE_BITS 12
00145 
00146 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
00147 
00148 static int64_t gettime(void)
00149 {
00150     struct timeval tv;
00151     gettimeofday(&tv,NULL);
00152     return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
00153 }
00154 
00155 #define NB_ITS 20000
00156 #define NB_ITS_SPEED 50000
00157 
00158 static short idct_mmx_perm[64];
00159 
00160 static short idct_simple_mmx_perm[64]={
00161         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
00162         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
00163         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
00164         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
00165         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
00166         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
00167         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
00168         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
00169 };
00170 
00171 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
00172 
00173 static void idct_mmx_init(void)
00174 {
00175     int i;
00176 
00177     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
00178     for (i = 0; i < 64; i++) {
00179         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
00180 //        idct_simple_mmx_perm[i] = simple_block_permute_op(i);
00181     }
00182 }
00183 
00184 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
00185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
00187 
00188 static inline void mmx_emms(void)
00189 {
00190 #if HAVE_MMX
00191     if (cpu_flags & AV_CPU_FLAG_MMX)
00192         __asm__ volatile ("emms\n\t");
00193 #endif
00194 }
00195 
00196 static void dct_error(const char *name, int is_idct,
00197                void (*fdct_func)(DCTELEM *block),
00198                void (*fdct_ref)(DCTELEM *block), int form, int test)
00199 {
00200     int it, i, scale;
00201     int err_inf, v;
00202     int64_t err2, ti, ti1, it1;
00203     int64_t sysErr[64], sysErrMax=0;
00204     int maxout=0;
00205     int blockSumErrMax=0, blockSumErr;
00206     AVLFG prng;
00207 
00208     av_lfg_init(&prng, 1);
00209 
00210     err_inf = 0;
00211     err2 = 0;
00212     for(i=0; i<64; i++) sysErr[i]=0;
00213     for(it=0;it<NB_ITS;it++) {
00214         for(i=0;i<64;i++)
00215             block1[i] = 0;
00216         switch(test){
00217         case 0:
00218             for(i=0;i<64;i++)
00219                 block1[i] = (av_lfg_get(&prng) % 512) -256;
00220             if (is_idct){
00221                 ff_ref_fdct(block1);
00222 
00223                 for(i=0;i<64;i++)
00224                     block1[i]>>=3;
00225             }
00226         break;
00227         case 1:{
00228             int num = av_lfg_get(&prng) % 10 + 1;
00229             for(i=0;i<num;i++)
00230                 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
00231         }break;
00232         case 2:
00233             block1[0] = av_lfg_get(&prng) % 4096 - 2048;
00234             block1[63]= (block1[0]&1)^1;
00235         break;
00236         }
00237 
00238 #if 0 // simulate mismatch control
00239 { int sum=0;
00240         for(i=0;i<64;i++)
00241            sum+=block1[i];
00242 
00243         if((sum&1)==0) block1[63]^=1;
00244 }
00245 #endif
00246 
00247         for(i=0; i<64; i++)
00248             block_org[i]= block1[i];
00249 
00250         if (form == MMX_PERM) {
00251             for(i=0;i<64;i++)
00252                 block[idct_mmx_perm[i]] = block1[i];
00253             } else if (form == MMX_SIMPLE_PERM) {
00254             for(i=0;i<64;i++)
00255                 block[idct_simple_mmx_perm[i]] = block1[i];
00256 
00257         } else if (form == SSE2_PERM) {
00258             for(i=0; i<64; i++)
00259                 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
00260         } else if (form == PARTTRANS_PERM) {
00261             for(i=0; i<64; i++)
00262                 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
00263         } else {
00264             for(i=0; i<64; i++)
00265                 block[i]= block1[i];
00266         }
00267 #if 0 // simulate mismatch control for tested IDCT but not the ref
00268 { int sum=0;
00269         for(i=0;i<64;i++)
00270            sum+=block[i];
00271 
00272         if((sum&1)==0) block[63]^=1;
00273 }
00274 #endif
00275 
00276         fdct_func(block);
00277         mmx_emms();
00278 
00279         if (form == SCALE_PERM) {
00280             for(i=0; i<64; i++) {
00281                 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00282                 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
00283             }
00284         }
00285 
00286         fdct_ref(block1);
00287 
00288         blockSumErr=0;
00289         for(i=0;i<64;i++) {
00290             v = abs(block[i] - block1[i]);
00291             if (v > err_inf)
00292                 err_inf = v;
00293             err2 += v * v;
00294             sysErr[i] += block[i] - block1[i];
00295             blockSumErr += v;
00296             if( abs(block[i])>maxout) maxout=abs(block[i]);
00297         }
00298         if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
00299 #if 0 // print different matrix pairs
00300         if(blockSumErr){
00301             printf("\n");
00302             for(i=0; i<64; i++){
00303                 if((i&7)==0) printf("\n");
00304                 printf("%4d ", block_org[i]);
00305             }
00306             for(i=0; i<64; i++){
00307                 if((i&7)==0) printf("\n");
00308                 printf("%4d ", block[i] - block1[i]);
00309             }
00310         }
00311 #endif
00312     }
00313     for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
00314 
00315     for(i=0; i<64; i++){
00316         if(i%8==0) printf("\n");
00317         printf("%7d ", (int)sysErr[i]);
00318     }
00319     printf("\n");
00320 
00321     printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00322            is_idct ? "IDCT" : "DCT",
00323            name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
00324 
00325     /* speed test */
00326     for(i=0;i<64;i++)
00327         block1[i] = 0;
00328     switch(test){
00329     case 0:
00330         for(i=0;i<64;i++)
00331             block1[i] = av_lfg_get(&prng) % 512 -256;
00332         if (is_idct){
00333             ff_ref_fdct(block1);
00334 
00335             for(i=0;i<64;i++)
00336                 block1[i]>>=3;
00337         }
00338     break;
00339     case 1:{
00340     case 2:
00341         block1[0] = av_lfg_get(&prng) % 512 -256;
00342         block1[1] = av_lfg_get(&prng) % 512 -256;
00343         block1[2] = av_lfg_get(&prng) % 512 -256;
00344         block1[3] = av_lfg_get(&prng) % 512 -256;
00345     }break;
00346     }
00347 
00348     if (form == MMX_PERM) {
00349         for(i=0;i<64;i++)
00350             block[idct_mmx_perm[i]] = block1[i];
00351     } else if(form == MMX_SIMPLE_PERM) {
00352         for(i=0;i<64;i++)
00353             block[idct_simple_mmx_perm[i]] = block1[i];
00354     } else {
00355         for(i=0; i<64; i++)
00356             block[i]= block1[i];
00357     }
00358 
00359     ti = gettime();
00360     it1 = 0;
00361     do {
00362         for(it=0;it<NB_ITS_SPEED;it++) {
00363             for(i=0; i<64; i++)
00364                 block[i]= block1[i];
00365 //            memcpy(block, block1, sizeof(DCTELEM) * 64);
00366 // do not memcpy especially not fastmemcpy because it does movntq !!!
00367             fdct_func(block);
00368         }
00369         it1 += NB_ITS_SPEED;
00370         ti1 = gettime() - ti;
00371     } while (ti1 < 1000000);
00372     mmx_emms();
00373 
00374     printf("%s %s: %0.1f kdct/s\n",
00375            is_idct ? "IDCT" : "DCT",
00376            name, (double)it1 * 1000.0 / (double)ti1);
00377 }
00378 
00379 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
00380 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00381 
00382 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00383 {
00384     static int init;
00385     static double c8[8][8];
00386     static double c4[4][4];
00387     double block1[64], block2[64], block3[64];
00388     double s, sum, v;
00389     int i, j, k;
00390 
00391     if (!init) {
00392         init = 1;
00393 
00394         for(i=0;i<8;i++) {
00395             sum = 0;
00396             for(j=0;j<8;j++) {
00397                 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
00398                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00399                 sum += c8[i][j] * c8[i][j];
00400             }
00401         }
00402 
00403         for(i=0;i<4;i++) {
00404             sum = 0;
00405             for(j=0;j<4;j++) {
00406                 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
00407                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00408                 sum += c4[i][j] * c4[i][j];
00409             }
00410         }
00411     }
00412 
00413     /* butterfly */
00414     s = 0.5 * sqrt(2.0);
00415     for(i=0;i<4;i++) {
00416         for(j=0;j<8;j++) {
00417             block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
00418             block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
00419         }
00420     }
00421 
00422     /* idct8 on lines */
00423     for(i=0;i<8;i++) {
00424         for(j=0;j<8;j++) {
00425             sum = 0;
00426             for(k=0;k<8;k++)
00427                 sum += c8[k][j] * block1[8*i+k];
00428             block2[8*i+j] = sum;
00429         }
00430     }
00431 
00432     /* idct4 */
00433     for(i=0;i<8;i++) {
00434         for(j=0;j<4;j++) {
00435             /* top */
00436             sum = 0;
00437             for(k=0;k<4;k++)
00438                 sum += c4[k][j] * block2[8*(2*k)+i];
00439             block3[8*(2*j)+i] = sum;
00440 
00441             /* bottom */
00442             sum = 0;
00443             for(k=0;k<4;k++)
00444                 sum += c4[k][j] * block2[8*(2*k+1)+i];
00445             block3[8*(2*j+1)+i] = sum;
00446         }
00447     }
00448 
00449     /* clamp and store the result */
00450     for(i=0;i<8;i++) {
00451         for(j=0;j<8;j++) {
00452             v = block3[8*i+j];
00453             if (v < 0)
00454                 v = 0;
00455             else if (v > 255)
00456                 v = 255;
00457             dest[i * linesize + j] = (int)rint(v);
00458         }
00459     }
00460 }
00461 
00462 static void idct248_error(const char *name,
00463                     void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
00464 {
00465     int it, i, it1, ti, ti1, err_max, v;
00466 
00467     AVLFG prng;
00468 
00469     av_lfg_init(&prng, 1);
00470 
00471     /* just one test to see if code is correct (precision is less
00472        important here) */
00473     err_max = 0;
00474     for(it=0;it<NB_ITS;it++) {
00475 
00476         /* XXX: use forward transform to generate values */
00477         for(i=0;i<64;i++)
00478             block1[i] = av_lfg_get(&prng) % 256 - 128;
00479         block1[0] += 1024;
00480 
00481         for(i=0; i<64; i++)
00482             block[i]= block1[i];
00483         idct248_ref(img_dest1, 8, block);
00484 
00485         for(i=0; i<64; i++)
00486             block[i]= block1[i];
00487         idct248_put(img_dest, 8, block);
00488 
00489         for(i=0;i<64;i++) {
00490             v = abs((int)img_dest[i] - (int)img_dest1[i]);
00491             if (v == 255)
00492                 printf("%d %d\n", img_dest[i], img_dest1[i]);
00493             if (v > err_max)
00494                 err_max = v;
00495         }
00496     }
00497     printf("%s %s: err_inf=%d\n",
00498            1 ? "IDCT248" : "DCT248",
00499            name, err_max);
00500 
00501     ti = gettime();
00502     it1 = 0;
00503     do {
00504         for(it=0;it<NB_ITS_SPEED;it++) {
00505             for(i=0; i<64; i++)
00506                 block[i]= block1[i];
00507 //            memcpy(block, block1, sizeof(DCTELEM) * 64);
00508 // do not memcpy especially not fastmemcpy because it does movntq !!!
00509             idct248_put(img_dest, 8, block);
00510         }
00511         it1 += NB_ITS_SPEED;
00512         ti1 = gettime() - ti;
00513     } while (ti1 < 1000000);
00514     mmx_emms();
00515 
00516     printf("%s %s: %0.1f kdct/s\n",
00517            1 ? "IDCT248" : "DCT248",
00518            name, (double)it1 * 1000.0 / (double)ti1);
00519 }
00520 
00521 static void help(void)
00522 {
00523     printf("dct-test [-i] [<test-number>]\n"
00524            "test-number 0 -> test with random matrixes\n"
00525            "            1 -> test with random sparse matrixes\n"
00526            "            2 -> do 3. test from mpeg4 std\n"
00527            "-i          test IDCT implementations\n"
00528            "-4          test IDCT248 implementations\n");
00529 }
00530 
00531 int main(int argc, char **argv)
00532 {
00533     int test_idct = 0, test_248_dct = 0;
00534     int c,i;
00535     int test=1;
00536     cpu_flags = av_get_cpu_flags();
00537 
00538     ff_ref_dct_init();
00539     idct_mmx_init();
00540 
00541     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
00542     for(i=0;i<MAX_NEG_CROP;i++) {
00543         cropTbl[i] = 0;
00544         cropTbl[i + MAX_NEG_CROP + 256] = 255;
00545     }
00546 
00547     for(;;) {
00548         c = getopt(argc, argv, "ih4");
00549         if (c == -1)
00550             break;
00551         switch(c) {
00552         case 'i':
00553             test_idct = 1;
00554             break;
00555         case '4':
00556             test_248_dct = 1;
00557             break;
00558         default :
00559         case 'h':
00560             help();
00561             return 0;
00562         }
00563     }
00564 
00565     if(optind <argc) test= atoi(argv[optind]);
00566 
00567     printf("ffmpeg DCT/IDCT test\n");
00568 
00569     if (test_248_dct) {
00570         idct248_error("SIMPLE-C", ff_simple_idct248_put);
00571     } else {
00572       for (i=0;algos[i].name;i++)
00573         if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
00574           dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
00575         }
00576     }
00577     return 0;
00578 }