Libav 0.7.1
libswscale/x86/rgb2rgb_template.c
Go to the documentation of this file.
00001 /*
00002  * software RGB to RGB converter
00003  *              software PAL8 to RGB converter
00004  *              software YUV to YUV converter
00005  *              software YUV to RGB converter
00006  * Written by Nick Kurshev.
00007  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
00008  * lot of big-endian byte order fixes by Alex Beregszaszi
00009  *
00010  * This file is part of Libav.
00011  *
00012  * Libav is free software; you can redistribute it and/or
00013  * modify it under the terms of the GNU Lesser General Public
00014  * License as published by the Free Software Foundation; either
00015  * version 2.1 of the License, or (at your option) any later version.
00016  *
00017  * Libav is distributed in the hope that it will be useful,
00018  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00020  * Lesser General Public License for more details.
00021  *
00022  * You should have received a copy of the GNU Lesser General Public
00023  * License along with Libav; if not, write to the Free Software
00024  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00025  */
00026 
00027 #include <stddef.h>
00028 
/*
 * Instruction-name macros that are spliced into the inline-asm strings of
 * the converters below.  They are undefined first and then redefined from
 * the COMPILE_TEMPLATE_* settings (presumably because this template is
 * compiled more than once with different settings -- confirm against the
 * including file).
 */
00029 #undef PREFETCH
00030 #undef MOVNTQ
00031 #undef EMMS
00032 #undef SFENCE
00033 #undef PAVGB
00034 
/*
 * Prefetch and packed-byte-average mnemonics.  When neither 3DNow! nor MMX2
 * is selected, PREFETCH expands to an assembler comment (" # nop") and
 * PAVGB stays undefined.
 */
00035 #if COMPILE_TEMPLATE_AMD3DNOW
00036 #define PREFETCH  "prefetch"
00037 #define PAVGB     "pavgusb"
00038 #elif COMPILE_TEMPLATE_MMX2
00039 #define PREFETCH "prefetchnta"
00040 #define PAVGB     "pavgb"
00041 #else
00042 #define PREFETCH  " # nop"
00043 #endif
00044 
/* Instruction used to leave MMX state at the end of each converter. */
00045 #if COMPILE_TEMPLATE_AMD3DNOW
00046 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
00047 #define EMMS     "femms"
00048 #else
00049 #define EMMS     "emms"
00050 #endif
00051 
/*
 * Store instruction and the fence issued after the store loops.  With MMX2
 * the stores are "movntq" followed by "sfence"; otherwise a plain "movq" is
 * used and SFENCE expands to an assembler comment.
 */
00052 #if COMPILE_TEMPLATE_MMX2
00053 #define MOVNTQ "movntq"
00054 #define SFENCE "sfence"
00055 #else
00056 #define MOVNTQ "movq"
00057 #define SFENCE " # nop"
00058 #endif
00059 
00060 #if !COMPILE_TEMPLATE_SSE2
00061 
00062 #if !COMPILE_TEMPLATE_AMD3DNOW
00063 
00064 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
00065 {
00066     uint8_t *dest = dst;
00067     const uint8_t *s = src;
00068     const uint8_t *end;
00069     const uint8_t *mm_end;
00070     end = s + src_size;
00071     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
00072     mm_end = end - 23;
00073     __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
00074     while (s < mm_end) {
00075         __asm__ volatile(
00076             PREFETCH"    32%1           \n\t"
00077             "movd          %1, %%mm0    \n\t"
00078             "punpckldq    3%1, %%mm0    \n\t"
00079             "movd         6%1, %%mm1    \n\t"
00080             "punpckldq    9%1, %%mm1    \n\t"
00081             "movd        12%1, %%mm2    \n\t"
00082             "punpckldq   15%1, %%mm2    \n\t"
00083             "movd        18%1, %%mm3    \n\t"
00084             "punpckldq   21%1, %%mm3    \n\t"
00085             "por        %%mm7, %%mm0    \n\t"
00086             "por        %%mm7, %%mm1    \n\t"
00087             "por        %%mm7, %%mm2    \n\t"
00088             "por        %%mm7, %%mm3    \n\t"
00089             MOVNTQ"     %%mm0,   %0     \n\t"
00090             MOVNTQ"     %%mm1,  8%0     \n\t"
00091             MOVNTQ"     %%mm2, 16%0     \n\t"
00092             MOVNTQ"     %%mm3, 24%0"
00093             :"=m"(*dest)
00094             :"m"(*s)
00095             :"memory");
00096         dest += 32;
00097         s += 24;
00098     }
00099     __asm__ volatile(SFENCE:::"memory");
00100     __asm__ volatile(EMMS:::"memory");
00101     while (s < end) {
00102         *dest++ = *s++;
00103         *dest++ = *s++;
00104         *dest++ = *s++;
00105         *dest++ = 255;
00106     }
00107 }
00108 
/*
 * Inline-asm fragment that packs four MMX quadwords into three and stores
 * them (24 bytes) at operand %0, 8%0 and 16%0 using MOVNTQ.
 *
 * Register contract, as set up by rgb32tobgr24 before it expands this macro:
 *   mm0, mm1, mm4, mm5 - the four source quadwords
 *   mm2, mm3, mm6, mm7 - copies of mm0, mm1, mm4, mm5 respectively
 *
 * Step 1: each copy is shifted right by 8 bits, the originals are masked
 *         with mask24l and the shifted copies with mask24h, and the pairs
 *         are OR-ed together (presumably squeezing out the 4th byte of each
 *         32-bit pixel -- the mask constants are defined outside this
 *         section, TODO confirm).
 * Step 2: the four partially filled quadwords are shifted, masked with
 *         mask24hh / mask24hhh / mask24hhhh and merged into mm0, mm1, mm4.
 *
 * All of mm0-mm7 are modified.  The expansion ends without a trailing
 * "\n\t", so it must be the last instruction text of the asm statement.
 */
00109 #define STORE_BGR24_MMX \
00110             "psrlq         $8, %%mm2    \n\t" \
00111             "psrlq         $8, %%mm3    \n\t" \
00112             "psrlq         $8, %%mm6    \n\t" \
00113             "psrlq         $8, %%mm7    \n\t" \
00114             "pand "MANGLE(mask24l)", %%mm0\n\t" \
00115             "pand "MANGLE(mask24l)", %%mm1\n\t" \
00116             "pand "MANGLE(mask24l)", %%mm4\n\t" \
00117             "pand "MANGLE(mask24l)", %%mm5\n\t" \
00118             "pand "MANGLE(mask24h)", %%mm2\n\t" \
00119             "pand "MANGLE(mask24h)", %%mm3\n\t" \
00120             "pand "MANGLE(mask24h)", %%mm6\n\t" \
00121             "pand "MANGLE(mask24h)", %%mm7\n\t" \
00122             "por        %%mm2, %%mm0    \n\t" \
00123             "por        %%mm3, %%mm1    \n\t" \
00124             "por        %%mm6, %%mm4    \n\t" \
00125             "por        %%mm7, %%mm5    \n\t" \
00126  \
00127             "movq       %%mm1, %%mm2    \n\t" \
00128             "movq       %%mm4, %%mm3    \n\t" \
00129             "psllq        $48, %%mm2    \n\t" \
00130             "psllq        $32, %%mm3    \n\t" \
00131             "pand "MANGLE(mask24hh)", %%mm2\n\t" \
00132             "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
00133             "por        %%mm2, %%mm0    \n\t" \
00134             "psrlq        $16, %%mm1    \n\t" \
00135             "psrlq        $32, %%mm4    \n\t" \
00136             "psllq        $16, %%mm5    \n\t" \
00137             "por        %%mm3, %%mm1    \n\t" \
00138             "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
00139             "por        %%mm5, %%mm4    \n\t" \
00140  \
00141             MOVNTQ"     %%mm0,   %0     \n\t" \
00142             MOVNTQ"     %%mm1,  8%0     \n\t" \
00143             MOVNTQ"     %%mm4, 16%0"
00144 
00145 
00146 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00147 {
00148     uint8_t *dest = dst;
00149     const uint8_t *s = src;
00150     const uint8_t *end;
00151     const uint8_t *mm_end;
00152     end = s + src_size;
00153     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
00154     mm_end = end - 31;
00155     while (s < mm_end) {
00156         __asm__ volatile(
00157             PREFETCH"    32%1           \n\t"
00158             "movq          %1, %%mm0    \n\t"
00159             "movq         8%1, %%mm1    \n\t"
00160             "movq        16%1, %%mm4    \n\t"
00161             "movq        24%1, %%mm5    \n\t"
00162             "movq       %%mm0, %%mm2    \n\t"
00163             "movq       %%mm1, %%mm3    \n\t"
00164             "movq       %%mm4, %%mm6    \n\t"
00165             "movq       %%mm5, %%mm7    \n\t"
00166             STORE_BGR24_MMX
00167             :"=m"(*dest)
00168             :"m"(*s)
00169             :"memory");
00170         dest += 24;
00171         s += 32;
00172     }
00173     __asm__ volatile(SFENCE:::"memory");
00174     __asm__ volatile(EMMS:::"memory");
00175     while (s < end) {
00176         *dest++ = *s++;
00177         *dest++ = *s++;
00178         *dest++ = *s++;
00179         s++;
00180     }
00181 }
00182 
00183 /*
00184  original by Strepto/Astral
00185  ported to gcc & bugfixed: A'rpi
00186  MMX2, 3DNOW optimization by Nick Kurshev
00187  32-bit C version, and and&add trick by Michael Niedermayer
00188 */
00189 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
00190 {
00191     register const uint8_t* s=src;
00192     register uint8_t* d=dst;
00193     register const uint8_t *end;
00194     const uint8_t *mm_end;
00195     end = s + src_size;
00196     __asm__ volatile(PREFETCH"    %0"::"m"(*s));
00197     __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
00198     mm_end = end - 15;
00199     while (s<mm_end) {
00200         __asm__ volatile(
00201             PREFETCH"  32%1         \n\t"
00202             "movq        %1, %%mm0  \n\t"
00203             "movq       8%1, %%mm2  \n\t"
00204             "movq     %%mm0, %%mm1  \n\t"
00205             "movq     %%mm2, %%mm3  \n\t"
00206             "pand     %%mm4, %%mm0  \n\t"
00207             "pand     %%mm4, %%mm2  \n\t"
00208             "paddw    %%mm1, %%mm0  \n\t"
00209             "paddw    %%mm3, %%mm2  \n\t"
00210             MOVNTQ"   %%mm0,  %0    \n\t"
00211             MOVNTQ"   %%mm2, 8%0"
00212             :"=m"(*d)
00213             :"m"(*s)
00214         );
00215         d+=16;
00216         s+=16;
00217     }
00218     __asm__ volatile(SFENCE:::"memory");
00219     __asm__ volatile(EMMS:::"memory");
00220     mm_end = end - 3;
00221     while (s < mm_end) {
00222         register unsigned x= *((const uint32_t *)s);
00223         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
00224         d+=4;
00225         s+=4;
00226     }
00227     if (s < end) {
00228         register unsigned short x= *((const uint16_t *)s);
00229         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
00230     }
00231 }
00232 
00233 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
00234 {
00235     register const uint8_t* s=src;
00236     register uint8_t* d=dst;
00237     register const uint8_t *end;
00238     const uint8_t *mm_end;
00239     end = s + src_size;
00240     __asm__ volatile(PREFETCH"    %0"::"m"(*s));
00241     __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
00242     __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
00243     mm_end = end - 15;
00244     while (s<mm_end) {
00245         __asm__ volatile(
00246             PREFETCH"  32%1         \n\t"
00247             "movq        %1, %%mm0  \n\t"
00248             "movq       8%1, %%mm2  \n\t"
00249             "movq     %%mm0, %%mm1  \n\t"
00250             "movq     %%mm2, %%mm3  \n\t"
00251             "psrlq       $1, %%mm0  \n\t"
00252             "psrlq       $1, %%mm2  \n\t"
00253             "pand     %%mm7, %%mm0  \n\t"
00254             "pand     %%mm7, %%mm2  \n\t"
00255             "pand     %%mm6, %%mm1  \n\t"
00256             "pand     %%mm6, %%mm3  \n\t"
00257             "por      %%mm1, %%mm0  \n\t"
00258             "por      %%mm3, %%mm2  \n\t"
00259             MOVNTQ"   %%mm0,  %0    \n\t"
00260             MOVNTQ"   %%mm2, 8%0"
00261             :"=m"(*d)
00262             :"m"(*s)
00263         );
00264         d+=16;
00265         s+=16;
00266     }
00267     __asm__ volatile(SFENCE:::"memory");
00268     __asm__ volatile(EMMS:::"memory");
00269     mm_end = end - 3;
00270     while (s < mm_end) {
00271         register uint32_t x= *((const uint32_t*)s);
00272         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
00273         s+=4;
00274         d+=4;
00275     }
00276     if (s < end) {
00277         register uint16_t x= *((const uint16_t*)s);
00278         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
00279     }
00280 }
00281 
00282 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
00283 {
00284     const uint8_t *s = src;
00285     const uint8_t *end;
00286     const uint8_t *mm_end;
00287     uint16_t *d = (uint16_t *)dst;
00288     end = s + src_size;
00289     mm_end = end - 15;
00290 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
00291     __asm__ volatile(
00292         "movq           %3, %%mm5   \n\t"
00293         "movq           %4, %%mm6   \n\t"
00294         "movq           %5, %%mm7   \n\t"
00295         "jmp 2f                     \n\t"
00296         ".p2align        4          \n\t"
00297         "1:                         \n\t"
00298         PREFETCH"   32(%1)          \n\t"
00299         "movd         (%1), %%mm0   \n\t"
00300         "movd        4(%1), %%mm3   \n\t"
00301         "punpckldq   8(%1), %%mm0   \n\t"
00302         "punpckldq  12(%1), %%mm3   \n\t"
00303         "movq        %%mm0, %%mm1   \n\t"
00304         "movq        %%mm3, %%mm4   \n\t"
00305         "pand        %%mm6, %%mm0   \n\t"
00306         "pand        %%mm6, %%mm3   \n\t"
00307         "pmaddwd     %%mm7, %%mm0   \n\t"
00308         "pmaddwd     %%mm7, %%mm3   \n\t"
00309         "pand        %%mm5, %%mm1   \n\t"
00310         "pand        %%mm5, %%mm4   \n\t"
00311         "por         %%mm1, %%mm0   \n\t"
00312         "por         %%mm4, %%mm3   \n\t"
00313         "psrld          $5, %%mm0   \n\t"
00314         "pslld         $11, %%mm3   \n\t"
00315         "por         %%mm3, %%mm0   \n\t"
00316         MOVNTQ"      %%mm0, (%0)    \n\t"
00317         "add           $16,  %1     \n\t"
00318         "add            $8,  %0     \n\t"
00319         "2:                         \n\t"
00320         "cmp            %2,  %1     \n\t"
00321         " jb            1b          \n\t"
00322         : "+r" (d), "+r"(s)
00323         : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
00324     );
00325 #else
00326     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00327     __asm__ volatile(
00328         "movq    %0, %%mm7    \n\t"
00329         "movq    %1, %%mm6    \n\t"
00330         ::"m"(red_16mask),"m"(green_16mask));
00331     while (s < mm_end) {
00332         __asm__ volatile(
00333             PREFETCH"    32%1           \n\t"
00334             "movd          %1, %%mm0    \n\t"
00335             "movd         4%1, %%mm3    \n\t"
00336             "punpckldq    8%1, %%mm0    \n\t"
00337             "punpckldq   12%1, %%mm3    \n\t"
00338             "movq       %%mm0, %%mm1    \n\t"
00339             "movq       %%mm0, %%mm2    \n\t"
00340             "movq       %%mm3, %%mm4    \n\t"
00341             "movq       %%mm3, %%mm5    \n\t"
00342             "psrlq         $3, %%mm0    \n\t"
00343             "psrlq         $3, %%mm3    \n\t"
00344             "pand          %2, %%mm0    \n\t"
00345             "pand          %2, %%mm3    \n\t"
00346             "psrlq         $5, %%mm1    \n\t"
00347             "psrlq         $5, %%mm4    \n\t"
00348             "pand       %%mm6, %%mm1    \n\t"
00349             "pand       %%mm6, %%mm4    \n\t"
00350             "psrlq         $8, %%mm2    \n\t"
00351             "psrlq         $8, %%mm5    \n\t"
00352             "pand       %%mm7, %%mm2    \n\t"
00353             "pand       %%mm7, %%mm5    \n\t"
00354             "por        %%mm1, %%mm0    \n\t"
00355             "por        %%mm4, %%mm3    \n\t"
00356             "por        %%mm2, %%mm0    \n\t"
00357             "por        %%mm5, %%mm3    \n\t"
00358             "psllq        $16, %%mm3    \n\t"
00359             "por        %%mm3, %%mm0    \n\t"
00360             MOVNTQ"     %%mm0, %0       \n\t"
00361             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00362         d += 4;
00363         s += 16;
00364     }
00365 #endif
00366     __asm__ volatile(SFENCE:::"memory");
00367     __asm__ volatile(EMMS:::"memory");
00368     while (s < end) {
00369         register int rgb = *(const uint32_t*)s; s += 4;
00370         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
00371     }
00372 }
00373 
00374 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
00375 {
00376     const uint8_t *s = src;
00377     const uint8_t *end;
00378     const uint8_t *mm_end;
00379     uint16_t *d = (uint16_t *)dst;
00380     end = s + src_size;
00381     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00382     __asm__ volatile(
00383         "movq          %0, %%mm7    \n\t"
00384         "movq          %1, %%mm6    \n\t"
00385         ::"m"(red_16mask),"m"(green_16mask));
00386     mm_end = end - 15;
00387     while (s < mm_end) {
00388         __asm__ volatile(
00389             PREFETCH"    32%1           \n\t"
00390             "movd          %1, %%mm0    \n\t"
00391             "movd         4%1, %%mm3    \n\t"
00392             "punpckldq    8%1, %%mm0    \n\t"
00393             "punpckldq   12%1, %%mm3    \n\t"
00394             "movq       %%mm0, %%mm1    \n\t"
00395             "movq       %%mm0, %%mm2    \n\t"
00396             "movq       %%mm3, %%mm4    \n\t"
00397             "movq       %%mm3, %%mm5    \n\t"
00398             "psllq         $8, %%mm0    \n\t"
00399             "psllq         $8, %%mm3    \n\t"
00400             "pand       %%mm7, %%mm0    \n\t"
00401             "pand       %%mm7, %%mm3    \n\t"
00402             "psrlq         $5, %%mm1    \n\t"
00403             "psrlq         $5, %%mm4    \n\t"
00404             "pand       %%mm6, %%mm1    \n\t"
00405             "pand       %%mm6, %%mm4    \n\t"
00406             "psrlq        $19, %%mm2    \n\t"
00407             "psrlq        $19, %%mm5    \n\t"
00408             "pand          %2, %%mm2    \n\t"
00409             "pand          %2, %%mm5    \n\t"
00410             "por        %%mm1, %%mm0    \n\t"
00411             "por        %%mm4, %%mm3    \n\t"
00412             "por        %%mm2, %%mm0    \n\t"
00413             "por        %%mm5, %%mm3    \n\t"
00414             "psllq        $16, %%mm3    \n\t"
00415             "por        %%mm3, %%mm0    \n\t"
00416             MOVNTQ"     %%mm0, %0       \n\t"
00417             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00418         d += 4;
00419         s += 16;
00420     }
00421     __asm__ volatile(SFENCE:::"memory");
00422     __asm__ volatile(EMMS:::"memory");
00423     while (s < end) {
00424         register int rgb = *(const uint32_t*)s; s += 4;
00425         *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
00426     }
00427 }
00428 
00429 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
00430 {
00431     const uint8_t *s = src;
00432     const uint8_t *end;
00433     const uint8_t *mm_end;
00434     uint16_t *d = (uint16_t *)dst;
00435     end = s + src_size;
00436     mm_end = end - 15;
00437 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
00438     __asm__ volatile(
00439         "movq           %3, %%mm5   \n\t"
00440         "movq           %4, %%mm6   \n\t"
00441         "movq           %5, %%mm7   \n\t"
00442         "jmp            2f          \n\t"
00443         ".p2align        4          \n\t"
00444         "1:                         \n\t"
00445         PREFETCH"   32(%1)          \n\t"
00446         "movd         (%1), %%mm0   \n\t"
00447         "movd        4(%1), %%mm3   \n\t"
00448         "punpckldq   8(%1), %%mm0   \n\t"
00449         "punpckldq  12(%1), %%mm3   \n\t"
00450         "movq        %%mm0, %%mm1   \n\t"
00451         "movq        %%mm3, %%mm4   \n\t"
00452         "pand        %%mm6, %%mm0   \n\t"
00453         "pand        %%mm6, %%mm3   \n\t"
00454         "pmaddwd     %%mm7, %%mm0   \n\t"
00455         "pmaddwd     %%mm7, %%mm3   \n\t"
00456         "pand        %%mm5, %%mm1   \n\t"
00457         "pand        %%mm5, %%mm4   \n\t"
00458         "por         %%mm1, %%mm0   \n\t"
00459         "por         %%mm4, %%mm3   \n\t"
00460         "psrld          $6, %%mm0   \n\t"
00461         "pslld         $10, %%mm3   \n\t"
00462         "por         %%mm3, %%mm0   \n\t"
00463         MOVNTQ"      %%mm0, (%0)    \n\t"
00464         "add           $16,  %1     \n\t"
00465         "add            $8,  %0     \n\t"
00466         "2:                         \n\t"
00467         "cmp            %2,  %1     \n\t"
00468         " jb            1b          \n\t"
00469         : "+r" (d), "+r"(s)
00470         : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
00471     );
00472 #else
00473     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00474     __asm__ volatile(
00475         "movq          %0, %%mm7    \n\t"
00476         "movq          %1, %%mm6    \n\t"
00477         ::"m"(red_15mask),"m"(green_15mask));
00478     while (s < mm_end) {
00479         __asm__ volatile(
00480             PREFETCH"    32%1           \n\t"
00481             "movd          %1, %%mm0    \n\t"
00482             "movd         4%1, %%mm3    \n\t"
00483             "punpckldq    8%1, %%mm0    \n\t"
00484             "punpckldq   12%1, %%mm3    \n\t"
00485             "movq       %%mm0, %%mm1    \n\t"
00486             "movq       %%mm0, %%mm2    \n\t"
00487             "movq       %%mm3, %%mm4    \n\t"
00488             "movq       %%mm3, %%mm5    \n\t"
00489             "psrlq         $3, %%mm0    \n\t"
00490             "psrlq         $3, %%mm3    \n\t"
00491             "pand          %2, %%mm0    \n\t"
00492             "pand          %2, %%mm3    \n\t"
00493             "psrlq         $6, %%mm1    \n\t"
00494             "psrlq         $6, %%mm4    \n\t"
00495             "pand       %%mm6, %%mm1    \n\t"
00496             "pand       %%mm6, %%mm4    \n\t"
00497             "psrlq         $9, %%mm2    \n\t"
00498             "psrlq         $9, %%mm5    \n\t"
00499             "pand       %%mm7, %%mm2    \n\t"
00500             "pand       %%mm7, %%mm5    \n\t"
00501             "por        %%mm1, %%mm0    \n\t"
00502             "por        %%mm4, %%mm3    \n\t"
00503             "por        %%mm2, %%mm0    \n\t"
00504             "por        %%mm5, %%mm3    \n\t"
00505             "psllq        $16, %%mm3    \n\t"
00506             "por        %%mm3, %%mm0    \n\t"
00507             MOVNTQ"     %%mm0, %0       \n\t"
00508             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00509         d += 4;
00510         s += 16;
00511     }
00512 #endif
00513     __asm__ volatile(SFENCE:::"memory");
00514     __asm__ volatile(EMMS:::"memory");
00515     while (s < end) {
00516         register int rgb = *(const uint32_t*)s; s += 4;
00517         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
00518     }
00519 }
00520 
00521 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
00522 {
00523     const uint8_t *s = src;
00524     const uint8_t *end;
00525     const uint8_t *mm_end;
00526     uint16_t *d = (uint16_t *)dst;
00527     end = s + src_size;
00528     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00529     __asm__ volatile(
00530         "movq          %0, %%mm7    \n\t"
00531         "movq          %1, %%mm6    \n\t"
00532         ::"m"(red_15mask),"m"(green_15mask));
00533     mm_end = end - 15;
00534     while (s < mm_end) {
00535         __asm__ volatile(
00536             PREFETCH"    32%1           \n\t"
00537             "movd          %1, %%mm0    \n\t"
00538             "movd         4%1, %%mm3    \n\t"
00539             "punpckldq    8%1, %%mm0    \n\t"
00540             "punpckldq   12%1, %%mm3    \n\t"
00541             "movq       %%mm0, %%mm1    \n\t"
00542             "movq       %%mm0, %%mm2    \n\t"
00543             "movq       %%mm3, %%mm4    \n\t"
00544             "movq       %%mm3, %%mm5    \n\t"
00545             "psllq         $7, %%mm0    \n\t"
00546             "psllq         $7, %%mm3    \n\t"
00547             "pand       %%mm7, %%mm0    \n\t"
00548             "pand       %%mm7, %%mm3    \n\t"
00549             "psrlq         $6, %%mm1    \n\t"
00550             "psrlq         $6, %%mm4    \n\t"
00551             "pand       %%mm6, %%mm1    \n\t"
00552             "pand       %%mm6, %%mm4    \n\t"
00553             "psrlq        $19, %%mm2    \n\t"
00554             "psrlq        $19, %%mm5    \n\t"
00555             "pand          %2, %%mm2    \n\t"
00556             "pand          %2, %%mm5    \n\t"
00557             "por        %%mm1, %%mm0    \n\t"
00558             "por        %%mm4, %%mm3    \n\t"
00559             "por        %%mm2, %%mm0    \n\t"
00560             "por        %%mm5, %%mm3    \n\t"
00561             "psllq        $16, %%mm3    \n\t"
00562             "por        %%mm3, %%mm0    \n\t"
00563             MOVNTQ"     %%mm0, %0       \n\t"
00564             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00565         d += 4;
00566         s += 16;
00567     }
00568     __asm__ volatile(SFENCE:::"memory");
00569     __asm__ volatile(EMMS:::"memory");
00570     while (s < end) {
00571         register int rgb = *(const uint32_t*)s; s += 4;
00572         *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
00573     }
00574 }
00575 
00576 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
00577 {
00578     const uint8_t *s = src;
00579     const uint8_t *end;
00580     const uint8_t *mm_end;
00581     uint16_t *d = (uint16_t *)dst;
00582     end = s + src_size;
00583     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00584     __asm__ volatile(
00585         "movq         %0, %%mm7     \n\t"
00586         "movq         %1, %%mm6     \n\t"
00587         ::"m"(red_16mask),"m"(green_16mask));
00588     mm_end = end - 11;
00589     while (s < mm_end) {
00590         __asm__ volatile(
00591             PREFETCH"    32%1           \n\t"
00592             "movd          %1, %%mm0    \n\t"
00593             "movd         3%1, %%mm3    \n\t"
00594             "punpckldq    6%1, %%mm0    \n\t"
00595             "punpckldq    9%1, %%mm3    \n\t"
00596             "movq       %%mm0, %%mm1    \n\t"
00597             "movq       %%mm0, %%mm2    \n\t"
00598             "movq       %%mm3, %%mm4    \n\t"
00599             "movq       %%mm3, %%mm5    \n\t"
00600             "psrlq         $3, %%mm0    \n\t"
00601             "psrlq         $3, %%mm3    \n\t"
00602             "pand          %2, %%mm0    \n\t"
00603             "pand          %2, %%mm3    \n\t"
00604             "psrlq         $5, %%mm1    \n\t"
00605             "psrlq         $5, %%mm4    \n\t"
00606             "pand       %%mm6, %%mm1    \n\t"
00607             "pand       %%mm6, %%mm4    \n\t"
00608             "psrlq         $8, %%mm2    \n\t"
00609             "psrlq         $8, %%mm5    \n\t"
00610             "pand       %%mm7, %%mm2    \n\t"
00611             "pand       %%mm7, %%mm5    \n\t"
00612             "por        %%mm1, %%mm0    \n\t"
00613             "por        %%mm4, %%mm3    \n\t"
00614             "por        %%mm2, %%mm0    \n\t"
00615             "por        %%mm5, %%mm3    \n\t"
00616             "psllq        $16, %%mm3    \n\t"
00617             "por        %%mm3, %%mm0    \n\t"
00618             MOVNTQ"     %%mm0, %0       \n\t"
00619             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00620         d += 4;
00621         s += 12;
00622     }
00623     __asm__ volatile(SFENCE:::"memory");
00624     __asm__ volatile(EMMS:::"memory");
00625     while (s < end) {
00626         const int b = *s++;
00627         const int g = *s++;
00628         const int r = *s++;
00629         *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00630     }
00631 }
00632 
00633 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
00634 {
00635     const uint8_t *s = src;
00636     const uint8_t *end;
00637     const uint8_t *mm_end;
00638     uint16_t *d = (uint16_t *)dst;
00639     end = s + src_size;
00640     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00641     __asm__ volatile(
00642         "movq         %0, %%mm7     \n\t"
00643         "movq         %1, %%mm6     \n\t"
00644         ::"m"(red_16mask),"m"(green_16mask));
00645     mm_end = end - 15;
00646     while (s < mm_end) {
00647         __asm__ volatile(
00648             PREFETCH"    32%1           \n\t"
00649             "movd          %1, %%mm0    \n\t"
00650             "movd         3%1, %%mm3    \n\t"
00651             "punpckldq    6%1, %%mm0    \n\t"
00652             "punpckldq    9%1, %%mm3    \n\t"
00653             "movq       %%mm0, %%mm1    \n\t"
00654             "movq       %%mm0, %%mm2    \n\t"
00655             "movq       %%mm3, %%mm4    \n\t"
00656             "movq       %%mm3, %%mm5    \n\t"
00657             "psllq         $8, %%mm0    \n\t"
00658             "psllq         $8, %%mm3    \n\t"
00659             "pand       %%mm7, %%mm0    \n\t"
00660             "pand       %%mm7, %%mm3    \n\t"
00661             "psrlq         $5, %%mm1    \n\t"
00662             "psrlq         $5, %%mm4    \n\t"
00663             "pand       %%mm6, %%mm1    \n\t"
00664             "pand       %%mm6, %%mm4    \n\t"
00665             "psrlq        $19, %%mm2    \n\t"
00666             "psrlq        $19, %%mm5    \n\t"
00667             "pand          %2, %%mm2    \n\t"
00668             "pand          %2, %%mm5    \n\t"
00669             "por        %%mm1, %%mm0    \n\t"
00670             "por        %%mm4, %%mm3    \n\t"
00671             "por        %%mm2, %%mm0    \n\t"
00672             "por        %%mm5, %%mm3    \n\t"
00673             "psllq        $16, %%mm3    \n\t"
00674             "por        %%mm3, %%mm0    \n\t"
00675             MOVNTQ"     %%mm0, %0       \n\t"
00676             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00677         d += 4;
00678         s += 12;
00679     }
00680     __asm__ volatile(SFENCE:::"memory");
00681     __asm__ volatile(EMMS:::"memory");
00682     while (s < end) {
00683         const int r = *s++;
00684         const int g = *s++;
00685         const int b = *s++;
00686         *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00687     }
00688 }
00689 
00690 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
00691 {
00692     const uint8_t *s = src;
00693     const uint8_t *end;
00694     const uint8_t *mm_end;
00695     uint16_t *d = (uint16_t *)dst;
00696     end = s + src_size;
00697     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00698     __asm__ volatile(
00699         "movq          %0, %%mm7    \n\t"
00700         "movq          %1, %%mm6    \n\t"
00701         ::"m"(red_15mask),"m"(green_15mask));
00702     mm_end = end - 11;
00703     while (s < mm_end) {
00704         __asm__ volatile(
00705             PREFETCH"    32%1           \n\t"
00706             "movd          %1, %%mm0    \n\t"
00707             "movd         3%1, %%mm3    \n\t"
00708             "punpckldq    6%1, %%mm0    \n\t"
00709             "punpckldq    9%1, %%mm3    \n\t"
00710             "movq       %%mm0, %%mm1    \n\t"
00711             "movq       %%mm0, %%mm2    \n\t"
00712             "movq       %%mm3, %%mm4    \n\t"
00713             "movq       %%mm3, %%mm5    \n\t"
00714             "psrlq         $3, %%mm0    \n\t"
00715             "psrlq         $3, %%mm3    \n\t"
00716             "pand          %2, %%mm0    \n\t"
00717             "pand          %2, %%mm3    \n\t"
00718             "psrlq         $6, %%mm1    \n\t"
00719             "psrlq         $6, %%mm4    \n\t"
00720             "pand       %%mm6, %%mm1    \n\t"
00721             "pand       %%mm6, %%mm4    \n\t"
00722             "psrlq         $9, %%mm2    \n\t"
00723             "psrlq         $9, %%mm5    \n\t"
00724             "pand       %%mm7, %%mm2    \n\t"
00725             "pand       %%mm7, %%mm5    \n\t"
00726             "por        %%mm1, %%mm0    \n\t"
00727             "por        %%mm4, %%mm3    \n\t"
00728             "por        %%mm2, %%mm0    \n\t"
00729             "por        %%mm5, %%mm3    \n\t"
00730             "psllq        $16, %%mm3    \n\t"
00731             "por        %%mm3, %%mm0    \n\t"
00732             MOVNTQ"     %%mm0, %0       \n\t"
00733             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00734         d += 4;
00735         s += 12;
00736     }
00737     __asm__ volatile(SFENCE:::"memory");
00738     __asm__ volatile(EMMS:::"memory");
00739     while (s < end) {
00740         const int b = *s++;
00741         const int g = *s++;
00742         const int r = *s++;
00743         *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00744     }
00745 }
00746 
/*
 * Convert packed 24-bit pixels (three bytes each) into 15-bit pixels stored
 * as 16-bit words, keeping the top five bits of every component.
 *
 * src      - source bytes, three per pixel
 * dst      - destination, written as uint16_t (one word per pixel)
 * src_size - number of source bytes
 *
 * The MMX loop converts four pixels (12 source bytes -> 8 destination bytes)
 * per iteration; the scalar loop afterwards converts whatever is left, one
 * pixel at a time. In the scalar path the first byte of each pixel ends up in
 * bits 10-14 of the word, the second in bits 5-9 and the third in bits 0-4.
 */
00747 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
00748 {
00749     const uint8_t *s = src;
00750     const uint8_t *end;
00751     const uint8_t *mm_end;
00752     uint16_t *d = (uint16_t *)dst;
00753     end = s + src_size;
00754     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
          /* mm7 and mm6 hold red_15mask and green_15mask for the whole MMX loop */
00755     __asm__ volatile(
00756         "movq         %0, %%mm7     \n\t"
00757         "movq         %1, %%mm6     \n\t"
00758         ::"m"(red_15mask),"m"(green_15mask));
          /* the MMX loop only runs while at least 16 source bytes remain; each
           * iteration loads bytes 0..12 (last load: movd at offset 9) and
           * consumes 12 */
00759     mm_end = end - 15;
00760     while (s < mm_end) {
00761         __asm__ volatile(
00762             PREFETCH"   32%1            \n\t"
                  /* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 (one per dword) */
00763             "movd         %1, %%mm0     \n\t"
00764             "movd        3%1, %%mm3     \n\t"
00765             "punpckldq   6%1, %%mm0     \n\t"
00766             "punpckldq   9%1, %%mm3     \n\t"
00767             "movq      %%mm0, %%mm1     \n\t"
00768             "movq      %%mm0, %%mm2     \n\t"
00769             "movq      %%mm3, %%mm4     \n\t"
00770             "movq      %%mm3, %%mm5     \n\t"
                  /* shift each component into its 15-bit position and mask it */
00771             "psllq        $7, %%mm0     \n\t"
00772             "psllq        $7, %%mm3     \n\t"
00773             "pand      %%mm7, %%mm0     \n\t"
00774             "pand      %%mm7, %%mm3     \n\t"
00775             "psrlq        $6, %%mm1     \n\t"
00776             "psrlq        $6, %%mm4     \n\t"
00777             "pand      %%mm6, %%mm1     \n\t"
00778             "pand      %%mm6, %%mm4     \n\t"
00779             "psrlq       $19, %%mm2     \n\t"
00780             "psrlq       $19, %%mm5     \n\t"
00781             "pand         %2, %%mm2     \n\t"
00782             "pand         %2, %%mm5     \n\t"
00783             "por       %%mm1, %%mm0     \n\t"
00784             "por       %%mm4, %%mm3     \n\t"
00785             "por       %%mm2, %%mm0     \n\t"
00786             "por       %%mm5, %%mm3     \n\t"
                  /* move pixels 1 and 3 into the upper word of each dword and merge */
00787             "psllq       $16, %%mm3     \n\t"
00788             "por       %%mm3, %%mm0     \n\t"
00789             MOVNTQ"    %%mm0, %0        \n\t"
00790             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00791         d += 4;
00792         s += 12;
00793     }
00794     __asm__ volatile(SFENCE:::"memory");
00795     __asm__ volatile(EMMS:::"memory");
          /* scalar tail: one pixel (three source bytes) per iteration */
00796     while (s < end) {
00797         const int r = *s++;
00798         const int g = *s++;
00799         const int b = *s++;
00800         *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00801     }
00802 }
00803 
00804 /*
00805   A less accurate approximation is used here: the input value is simply
00806   left-shifted and the low-order bits are filled with zeroes. This method improves PNG
00807   compression, but it cannot reproduce white exactly, since it does
00808   not generate an all-ones maximum value; the net effect is to darken the
00809   image slightly.
00810 
00811   A better method would be "left bit replication":
00812 
00813    4 3 2 1 0
00814    ---------
00815    1 1 0 1 1
00816 
00817    7 6 5 4 3  2 1 0
00818    ----------------
00819    1 1 0 1 1  1 1 0
00820    |=======|  |===|
00821        |      leftmost bits repeated to fill open bits
00822        |
00823    original bits
00824 */
/*
 * Expand 15-bit pixels (one 16-bit word each) into packed 24-bit pixels.
 *
 * src      - source, read as uint16_t words
 * dst      - destination bytes, three per pixel
 * src_size - number of source bytes (src_size/2 pixels)
 *
 * For every pixel the three output bytes are, in order, bits 0-4, bits 5-9
 * and bits 10-14 of the source word, each moved to the top five bits of the
 * byte with the low three bits left at zero.
 *
 * The MMX loop handles eight pixels (16 source bytes -> 24 destination
 * bytes) per iteration using two asm statements: the first expands the
 * pixels to one 32-bit value each, the second rearranges the registers and
 * invokes STORE_BGR24_MMX (defined elsewhere; presumably it packs the eight
 * 32-bit values to 24 bits and stores them). MMX register contents are
 * carried from the first statement into the second.
 */
00825 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00826 {
00827     const uint16_t *end;
00828     const uint16_t *mm_end;
00829     uint8_t *d = dst;
00830     const uint16_t *s = (const uint16_t*)src;
00831     end = s + src_size/2;
00832     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
          /* the MMX loop needs eight whole pixels: it runs while s + 7 < end */
00833     mm_end = end - 7;
00834     while (s < mm_end) {
00835         __asm__ volatile(
00836             PREFETCH"    32%1           \n\t"
                  /* pixels 0-3: isolate each component as an 8-bit value per word */
00837             "movq          %1, %%mm0    \n\t"
00838             "movq          %1, %%mm1    \n\t"
00839             "movq          %1, %%mm2    \n\t"
00840             "pand          %2, %%mm0    \n\t"
00841             "pand          %3, %%mm1    \n\t"
00842             "pand          %4, %%mm2    \n\t"
00843             "psllq         $3, %%mm0    \n\t"
00844             "psrlq         $2, %%mm1    \n\t"
00845             "psrlq         $7, %%mm2    \n\t"
00846             "movq       %%mm0, %%mm3    \n\t"
00847             "movq       %%mm1, %%mm4    \n\t"
00848             "movq       %%mm2, %%mm5    \n\t"
                  /* widen words to dwords by interleaving with %5 (mmx_null,
                   * presumably all zero), then merge the three components so
                   * that mm0 = pixels 0-1 and mm3 = pixels 2-3 */
00849             "punpcklwd     %5, %%mm0    \n\t"
00850             "punpcklwd     %5, %%mm1    \n\t"
00851             "punpcklwd     %5, %%mm2    \n\t"
00852             "punpckhwd     %5, %%mm3    \n\t"
00853             "punpckhwd     %5, %%mm4    \n\t"
00854             "punpckhwd     %5, %%mm5    \n\t"
00855             "psllq         $8, %%mm1    \n\t"
00856             "psllq        $16, %%mm2    \n\t"
00857             "por        %%mm1, %%mm0    \n\t"
00858             "por        %%mm2, %%mm0    \n\t"
00859             "psllq         $8, %%mm4    \n\t"
00860             "psllq        $16, %%mm5    \n\t"
00861             "por        %%mm4, %%mm3    \n\t"
00862             "por        %%mm5, %%mm3    \n\t"
00863 
                  /* park pixels 0-3 in mm6/mm7 while pixels 4-7 are expanded */
00864             "movq       %%mm0, %%mm6    \n\t"
00865             "movq       %%mm3, %%mm7    \n\t"
00866 
                  /* pixels 4-7: same sequence on the second quadword */
00867             "movq         8%1, %%mm0    \n\t"
00868             "movq         8%1, %%mm1    \n\t"
00869             "movq         8%1, %%mm2    \n\t"
00870             "pand          %2, %%mm0    \n\t"
00871             "pand          %3, %%mm1    \n\t"
00872             "pand          %4, %%mm2    \n\t"
00873             "psllq         $3, %%mm0    \n\t"
00874             "psrlq         $2, %%mm1    \n\t"
00875             "psrlq         $7, %%mm2    \n\t"
00876             "movq       %%mm0, %%mm3    \n\t"
00877             "movq       %%mm1, %%mm4    \n\t"
00878             "movq       %%mm2, %%mm5    \n\t"
00879             "punpcklwd     %5, %%mm0    \n\t"
00880             "punpcklwd     %5, %%mm1    \n\t"
00881             "punpcklwd     %5, %%mm2    \n\t"
00882             "punpckhwd     %5, %%mm3    \n\t"
00883             "punpckhwd     %5, %%mm4    \n\t"
00884             "punpckhwd     %5, %%mm5    \n\t"
00885             "psllq         $8, %%mm1    \n\t"
00886             "psllq        $16, %%mm2    \n\t"
00887             "por        %%mm1, %%mm0    \n\t"
00888             "por        %%mm2, %%mm0    \n\t"
00889             "psllq         $8, %%mm4    \n\t"
00890             "psllq        $16, %%mm5    \n\t"
00891             "por        %%mm4, %%mm3    \n\t"
00892             "por        %%mm5, %%mm3    \n\t"
00893 
00894             :"=m"(*d)
00895             :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
00896             :"memory");
00897         /* borrowed 32 to 24 */
              /* After the moves below: mm0 = mm2 = pixels 0-1, mm1 = mm3 =
               * pixels 2-3, mm4 = mm6 = pixels 4-5, mm5 = mm7 = pixels 6-7. */
00898         __asm__ volatile(
00899             "movq       %%mm0, %%mm4    \n\t"
00900             "movq       %%mm3, %%mm5    \n\t"
00901             "movq       %%mm6, %%mm0    \n\t"
00902             "movq       %%mm7, %%mm1    \n\t"
00903 
00904             "movq       %%mm4, %%mm6    \n\t"
00905             "movq       %%mm5, %%mm7    \n\t"
00906             "movq       %%mm0, %%mm2    \n\t"
00907             "movq       %%mm1, %%mm3    \n\t"
00908 
00909             STORE_BGR24_MMX
00910 
00911             :"=m"(*d)
00912             :"m"(*s)
00913             :"memory");
00914         d += 24;
00915         s += 8;
00916     }
00917     __asm__ volatile(SFENCE:::"memory");
00918     __asm__ volatile(EMMS:::"memory");
          /* scalar tail: one pixel per iteration, three bytes written */
00919     while (s < end) {
00920         register uint16_t bgr;
00921         bgr = *s++;
00922         *d++ = (bgr&0x1F)<<3;
00923         *d++ = (bgr&0x3E0)>>2;
00924         *d++ = (bgr&0x7C00)>>7;
00925     }
00926 }
00927 
/*
 * Expand 16-bit pixels (one 16-bit word each) into packed 24-bit pixels.
 *
 * src      - source, read as uint16_t words
 * dst      - destination bytes, three per pixel
 * src_size - number of source bytes (src_size/2 pixels)
 *
 * For every pixel the three output bytes are, in order, bits 0-4 (top five
 * bits of the byte), bits 5-10 (top six bits) and bits 11-15 (top five bits)
 * of the source word; the remaining low bits of each byte are zero.
 *
 * The MMX loop handles eight pixels (16 source bytes -> 24 destination
 * bytes) per iteration using two asm statements: the first expands the
 * pixels to one 32-bit value each, the second rearranges the registers and
 * invokes STORE_BGR24_MMX (defined elsewhere; presumably it packs the eight
 * 32-bit values to 24 bits and stores them). MMX register contents are
 * carried from the first statement into the second.
 */
00928 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00929 {
00930     const uint16_t *end;
00931     const uint16_t *mm_end;
00932     uint8_t *d = (uint8_t *)dst;
00933     const uint16_t *s = (const uint16_t *)src;
00934     end = s + src_size/2;
00935     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
          /* the MMX loop needs eight whole pixels: it runs while s + 7 < end */
00936     mm_end = end - 7;
00937     while (s < mm_end) {
00938         __asm__ volatile(
00939             PREFETCH"    32%1           \n\t"
                  /* pixels 0-3: isolate each component as an 8-bit value per word */
00940             "movq          %1, %%mm0    \n\t"
00941             "movq          %1, %%mm1    \n\t"
00942             "movq          %1, %%mm2    \n\t"
00943             "pand          %2, %%mm0    \n\t"
00944             "pand          %3, %%mm1    \n\t"
00945             "pand          %4, %%mm2    \n\t"
00946             "psllq         $3, %%mm0    \n\t"
00947             "psrlq         $3, %%mm1    \n\t"
00948             "psrlq         $8, %%mm2    \n\t"
00949             "movq       %%mm0, %%mm3    \n\t"
00950             "movq       %%mm1, %%mm4    \n\t"
00951             "movq       %%mm2, %%mm5    \n\t"
                  /* widen words to dwords by interleaving with %5 (mmx_null,
                   * presumably all zero), then merge the three components so
                   * that mm0 = pixels 0-1 and mm3 = pixels 2-3 */
00952             "punpcklwd     %5, %%mm0    \n\t"
00953             "punpcklwd     %5, %%mm1    \n\t"
00954             "punpcklwd     %5, %%mm2    \n\t"
00955             "punpckhwd     %5, %%mm3    \n\t"
00956             "punpckhwd     %5, %%mm4    \n\t"
00957             "punpckhwd     %5, %%mm5    \n\t"
00958             "psllq         $8, %%mm1    \n\t"
00959             "psllq        $16, %%mm2    \n\t"
00960             "por        %%mm1, %%mm0    \n\t"
00961             "por        %%mm2, %%mm0    \n\t"
00962             "psllq         $8, %%mm4    \n\t"
00963             "psllq        $16, %%mm5    \n\t"
00964             "por        %%mm4, %%mm3    \n\t"
00965             "por        %%mm5, %%mm3    \n\t"
00966 
                  /* park pixels 0-3 in mm6/mm7 while pixels 4-7 are expanded */
00967             "movq       %%mm0, %%mm6    \n\t"
00968             "movq       %%mm3, %%mm7    \n\t"
00969 
                  /* pixels 4-7: same sequence on the second quadword */
00970             "movq         8%1, %%mm0    \n\t"
00971             "movq         8%1, %%mm1    \n\t"
00972             "movq         8%1, %%mm2    \n\t"
00973             "pand          %2, %%mm0    \n\t"
00974             "pand          %3, %%mm1    \n\t"
00975             "pand          %4, %%mm2    \n\t"
00976             "psllq         $3, %%mm0    \n\t"
00977             "psrlq         $3, %%mm1    \n\t"
00978             "psrlq         $8, %%mm2    \n\t"
00979             "movq       %%mm0, %%mm3    \n\t"
00980             "movq       %%mm1, %%mm4    \n\t"
00981             "movq       %%mm2, %%mm5    \n\t"
00982             "punpcklwd     %5, %%mm0    \n\t"
00983             "punpcklwd     %5, %%mm1    \n\t"
00984             "punpcklwd     %5, %%mm2    \n\t"
00985             "punpckhwd     %5, %%mm3    \n\t"
00986             "punpckhwd     %5, %%mm4    \n\t"
00987             "punpckhwd     %5, %%mm5    \n\t"
00988             "psllq         $8, %%mm1    \n\t"
00989             "psllq        $16, %%mm2    \n\t"
00990             "por        %%mm1, %%mm0    \n\t"
00991             "por        %%mm2, %%mm0    \n\t"
00992             "psllq         $8, %%mm4    \n\t"
00993             "psllq        $16, %%mm5    \n\t"
00994             "por        %%mm4, %%mm3    \n\t"
00995             "por        %%mm5, %%mm3    \n\t"
00996             :"=m"(*d)
00997             :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
00998             :"memory");
00999         /* borrowed 32 to 24 */
              /* After the moves below: mm0 = mm2 = pixels 0-1, mm1 = mm3 =
               * pixels 2-3, mm4 = mm6 = pixels 4-5, mm5 = mm7 = pixels 6-7. */
01000         __asm__ volatile(
01001             "movq       %%mm0, %%mm4    \n\t"
01002             "movq       %%mm3, %%mm5    \n\t"
01003             "movq       %%mm6, %%mm0    \n\t"
01004             "movq       %%mm7, %%mm1    \n\t"
01005 
01006             "movq       %%mm4, %%mm6    \n\t"
01007             "movq       %%mm5, %%mm7    \n\t"
01008             "movq       %%mm0, %%mm2    \n\t"
01009             "movq       %%mm1, %%mm3    \n\t"
01010 
01011             STORE_BGR24_MMX
01012 
01013             :"=m"(*d)
01014             :"m"(*s)
01015             :"memory");
01016         d += 24;
01017         s += 8;
01018     }
01019     __asm__ volatile(SFENCE:::"memory");
01020     __asm__ volatile(EMMS:::"memory");
          /* scalar tail: one pixel per iteration, three bytes written */
01021     while (s < end) {
01022         register uint16_t bgr;
01023         bgr = *s++;
01024         *d++ = (bgr&0x1F)<<3;
01025         *d++ = (bgr&0x7E0)>>3;
01026         *d++ = (bgr&0xF800)>>8;
01027     }
01028 }
01029 
/*
 * PACK_RGB32: asm fragment that interleaves four pixels, held as three
 * per-component registers, into four 32-bit pixels and stores the resulting
 * 16 bytes to operand %0 (two MOVNTQ stores, at %0 and 8%0). Besides mm0-mm2
 * it overwrites mm3. The registers are expected to hold the following on
 * entry:
 */
01030 /*
01031  * mm0 = 00 B3 00 B2 00 B1 00 B0
01032  * mm1 = 00 G3 00 G2 00 G1 00 G0
01033  * mm2 = 00 R3 00 R2 00 R1 00 R0
01034  * mm6 = FF FF FF FF FF FF FF FF
01035  * mm7 = 00 00 00 00 00 00 00 00
01036  */
01037 #define PACK_RGB32 \
01038     "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
01039     "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
01040     "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
01041     "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
01042     "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
01043     "movq       %%mm0, %%mm3    \n\t" /* copy for the upper two pixels */ \
01044     "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
01045     "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
01046     MOVNTQ"     %%mm0,  %0      \n\t"                               \
01047     MOVNTQ"     %%mm3, 8%0      \n\t"                               \
01048 
/*
 * Expand 15-bit pixels (one 16-bit word each) into 32-bit pixels.
 *
 * src      - source, read as uint16_t words
 * dst      - destination bytes, four per pixel
 * src_size - number of source bytes (src_size/2 pixels)
 *
 * For every pixel the four output bytes are, in order, bits 0-4, bits 5-9
 * and bits 10-14 of the source word (each in the top five bits of its byte,
 * low three bits zero) followed by a constant 255.
 *
 * The MMX loop converts four pixels (8 source bytes -> 16 destination bytes)
 * per iteration through PACK_RGB32; the scalar loop handles the remainder.
 */
01049 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
01050 {
01051     const uint16_t *end;
01052     const uint16_t *mm_end;
01053     uint8_t *d = dst;
01054     const uint16_t *s = (const uint16_t *)src;
01055     end = s + src_size/2;
01056     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
          /* constants PACK_RGB32 relies on: mm7 = all zero, mm6 = all ones */
01057     __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
01058     __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
          /* the MMX loop needs four whole pixels: it runs while s + 3 < end */
01059     mm_end = end - 3;
01060     while (s < mm_end) {
01061         __asm__ volatile(
01062             PREFETCH"    32%1           \n\t"
                  /* isolate each component as an 8-bit value in its own word */
01063             "movq          %1, %%mm0    \n\t"
01064             "movq          %1, %%mm1    \n\t"
01065             "movq          %1, %%mm2    \n\t"
01066             "pand          %2, %%mm0    \n\t"
01067             "pand          %3, %%mm1    \n\t"
01068             "pand          %4, %%mm2    \n\t"
01069             "psllq         $3, %%mm0    \n\t"
01070             "psrlq         $2, %%mm1    \n\t"
01071             "psrlq         $7, %%mm2    \n\t"
01072             PACK_RGB32
01073             :"=m"(*d)
01074             :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
01075             :"memory");
01076         d += 16;
01077         s += 4;
01078     }
01079     __asm__ volatile(SFENCE:::"memory");
01080     __asm__ volatile(EMMS:::"memory");
          /* scalar tail: one pixel per iteration, four bytes written */
01081     while (s < end) {
01082         register uint16_t bgr;
01083         bgr = *s++;
01084         *d++ = (bgr&0x1F)<<3;
01085         *d++ = (bgr&0x3E0)>>2;
01086         *d++ = (bgr&0x7C00)>>7;
01087         *d++ = 255;
01088     }
01089 }
01090 
/*
 * Expand 16-bit pixels (one 16-bit word each) into 32-bit pixels.
 *
 * src      - source, read as uint16_t words
 * dst      - destination bytes, four per pixel
 * src_size - number of source bytes (src_size/2 pixels)
 *
 * For every pixel the four output bytes are, in order, bits 0-4 (top five
 * bits of the byte), bits 5-10 (top six bits) and bits 11-15 (top five bits)
 * of the source word, followed by a constant 255.
 *
 * The MMX loop converts four pixels (8 source bytes -> 16 destination bytes)
 * per iteration through PACK_RGB32; the scalar loop handles the remainder.
 */
01091 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
01092 {
01093     const uint16_t *end;
01094     const uint16_t *mm_end;
01095     uint8_t *d = dst;
01096     const uint16_t *s = (const uint16_t*)src;
01097     end = s + src_size/2;
01098     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
          /* constants PACK_RGB32 relies on: mm7 = all zero, mm6 = all ones */
01099     __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
01100     __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
          /* the MMX loop needs four whole pixels: it runs while s + 3 < end */
01101     mm_end = end - 3;
01102     while (s < mm_end) {
01103         __asm__ volatile(
01104             PREFETCH"    32%1           \n\t"
                  /* isolate each component as an 8-bit value in its own word */
01105             "movq          %1, %%mm0    \n\t"
01106             "movq          %1, %%mm1    \n\t"
01107             "movq          %1, %%mm2    \n\t"
01108             "pand          %2, %%mm0    \n\t"
01109             "pand          %3, %%mm1    \n\t"
01110             "pand          %4, %%mm2    \n\t"
01111             "psllq         $3, %%mm0    \n\t"
01112             "psrlq         $3, %%mm1    \n\t"
01113             "psrlq         $8, %%mm2    \n\t"
01114             PACK_RGB32
01115             :"=m"(*d)
01116             :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
01117             :"memory");
01118         d += 16;
01119         s += 4;
01120     }
01121     __asm__ volatile(SFENCE:::"memory");
01122     __asm__ volatile(EMMS:::"memory");
          /* scalar tail: one pixel per iteration, four bytes written */
01123     while (s < end) {
01124         register uint16_t bgr;
01125         bgr = *s++;
01126         *d++ = (bgr&0x1F)<<3;
01127         *d++ = (bgr&0x7E0)>>3;
01128         *d++ = (bgr&0xF800)>>8;
01129         *d++ = 255;
01130     }
01131 }
01132 
/*
 * Copy src to dst while exchanging bytes 0 and 2 of every 4-byte group;
 * bytes 1 and 3 of each group keep their position.
 *
 * src      - source bytes
 * dst      - destination bytes
 * src_size - number of source bytes
 *
 * idx starts at 15 - src_size and s/d are biased by -idx, so s[idx] and
 * d[idx] address the current position and the sign of idx tells whether at
 * least 16 bytes remain. The asm loop converts 16 bytes per iteration while
 * idx is negative; the C loop then converts the rest four bytes at a time.
 * When fewer than 16 bytes are given the asm is skipped entirely (jump to
 * label 2), including its SFENCE/EMMS.
 */
01133 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
01134 {
01135     x86_reg idx = 15 - src_size;
01136     const uint8_t *s = src-idx;
01137     uint8_t *d = dst-idx;
01138     __asm__ volatile(
01139         "test          %0, %0           \n\t"
01140         "jns           2f               \n\t"
01141         PREFETCH"       (%1, %0)        \n\t"
              /* mm6 = mask32b ^ mask32r: selects the two bytes that trade places;
               * mm7 = mm6 ^ mmx_one: selects the bytes that stay where they are */
01142         "movq          %3, %%mm7        \n\t"
01143         "pxor          %4, %%mm7        \n\t"
01144         "movq       %%mm7, %%mm6        \n\t"
01145         "pxor          %5, %%mm7        \n\t"
01146         ".p2align       4               \n\t"
01147         "1:                             \n\t"
01148         PREFETCH"     32(%1, %0)        \n\t"
01149         "movq           (%1, %0), %%mm0 \n\t"
01150         "movq          8(%1, %0), %%mm1 \n\t"
01151 # if COMPILE_TEMPLATE_MMX2
              /* pshufw $177 swaps the two words of each dword */
01152         "pshufw      $177, %%mm0, %%mm3 \n\t"
01153         "pshufw      $177, %%mm1, %%mm5 \n\t"
01154         "pand       %%mm7, %%mm0        \n\t"
01155         "pand       %%mm6, %%mm3        \n\t"
01156         "pand       %%mm7, %%mm1        \n\t"
01157         "pand       %%mm6, %%mm5        \n\t"
01158         "por        %%mm3, %%mm0        \n\t"
01159         "por        %%mm5, %%mm1        \n\t"
01160 # else
              /* plain MMX: emulate the word swap with two 16-bit dword shifts */
01161         "movq       %%mm0, %%mm2        \n\t"
01162         "movq       %%mm1, %%mm4        \n\t"
01163         "pand       %%mm7, %%mm0        \n\t"
01164         "pand       %%mm6, %%mm2        \n\t"
01165         "pand       %%mm7, %%mm1        \n\t"
01166         "pand       %%mm6, %%mm4        \n\t"
01167         "movq       %%mm2, %%mm3        \n\t"
01168         "movq       %%mm4, %%mm5        \n\t"
01169         "pslld        $16, %%mm2        \n\t"
01170         "psrld        $16, %%mm3        \n\t"
01171         "pslld        $16, %%mm4        \n\t"
01172         "psrld        $16, %%mm5        \n\t"
01173         "por        %%mm2, %%mm0        \n\t"
01174         "por        %%mm4, %%mm1        \n\t"
01175         "por        %%mm3, %%mm0        \n\t"
01176         "por        %%mm5, %%mm1        \n\t"
01177 # endif
01178         MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
01179         MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
01180         "add          $16, %0           \n\t"
01181         "js            1b               \n\t"
01182         SFENCE"                         \n\t"
01183         EMMS"                           \n\t"
01184         "2:                             \n\t"
01185         : "+&r"(idx)
01186         : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
01187         : "memory");
          /* scalar tail: one 4-byte group per iteration */
01188     for (; idx<15; idx+=4) {
              /* unsigned on purpose: v<<16 moves a byte into bits 24-31, which
               * overflows a signed int (undefined behaviour) */
01189         register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
01190         v &= 0xff00ff;
01191         *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
01192     }
01193 }
01194 
/*
 * Copy packed 24-bit pixels from src to dst with the first and third byte of
 * every pixel exchanged.
 *
 * src      - source bytes, three per pixel
 * dst      - destination bytes, three per pixel
 * src_size - number of source bytes
 *
 * mmx_size starts at 23 - src_size and both pointers are biased by
 * -mmx_size, so the asm loop can use the sign of the counter as its end
 * test. It converts 24 bytes (eight pixels) per iteration while the counter
 * is negative; afterwards 23 - mmx_size bytes are left for the scalar loop.
 *
 * NOTE(review): the last load of each iteration (offset 18, 8 bytes) covers
 * bytes 18..25 of the 24-byte group, so it appears to read two bytes beyond
 * the group; when src_size is a multiple of 24 that is past the end of src.
 * Confirm that callers provide readable padding.
 */
01195 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
01196 {
01197     unsigned i;
01198     x86_reg mmx_size= 23 - src_size;
01199     __asm__ volatile (
01200         "test             %%"REG_a", %%"REG_a"          \n\t"
01201         "jns                     2f                     \n\t"
01202         "movq     "MANGLE(mask24r)", %%mm5              \n\t"
01203         "movq     "MANGLE(mask24g)", %%mm6              \n\t"
01204         "movq     "MANGLE(mask24b)", %%mm7              \n\t"
01205         ".p2align                 4                     \n\t"
01206         "1:                                             \n\t"
01207         PREFETCH" 32(%1, %%"REG_a")                     \n\t"
01208         "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
01209         "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
01210         "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
01211         "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
01212         "pand                 %%mm5, %%mm0              \n\t"
01213         "pand                 %%mm6, %%mm1              \n\t"
01214         "pand                 %%mm7, %%mm2              \n\t"
01215         "por                  %%mm0, %%mm1              \n\t"
01216         "por                  %%mm2, %%mm1              \n\t"
01217         "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
01218         MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
01219         "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
01220         "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
01221         "pand                 %%mm7, %%mm0              \n\t"
01222         "pand                 %%mm5, %%mm1              \n\t"
01223         "pand                 %%mm6, %%mm2              \n\t"
01224         "por                  %%mm0, %%mm1              \n\t"
01225         "por                  %%mm2, %%mm1              \n\t"
01226         "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
01227         MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
01228         "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
01229         "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
01230         "pand                 %%mm6, %%mm0              \n\t"
01231         "pand                 %%mm7, %%mm1              \n\t"
01232         "pand                 %%mm5, %%mm2              \n\t"
01233         "por                  %%mm0, %%mm1              \n\t"
01234         "por                  %%mm2, %%mm1              \n\t"
01235         MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
01236         "add                    $24, %%"REG_a"          \n\t"
01237         " js                     1b                     \n\t"
01238         "2:                                             \n\t"
01239         : "+a" (mmx_size)
01240         : "r" (src-mmx_size), "r"(dst-mmx_size)
              /* the asm reads src and writes dst through pointers that are not
               * listed as memory operands, so the compiler has to be told */
01241         : "memory");
01242 
01243     __asm__ volatile(SFENCE:::"memory");
01244     __asm__ volatile(EMMS:::"memory");
01245 
01246     if (mmx_size==23) return; // finished, src_size was a multiple of 24
01247 
          /* point src/dst at the first byte the asm loop did not convert and
           * let src_size count the bytes that remain */
01248     src+= src_size;
01249     dst+= src_size;
01250     src_size= 23-mmx_size;
01251     src-= src_size;
01252     dst-= src_size;
          /* scalar tail: one pixel per iteration; x buffers src[i + 2] so the
           * swap also works when src and dst are the same buffer */
01253     for (i=0; i<src_size; i+=3) {
01254         register uint8_t x;
01255         x          = src[i + 2];
01256         dst[i + 1] = src[i + 1];
01257         dst[i + 2] = src[i + 0];
01258         dst[i + 0] = x;
01259     }
01260 }
01261 
/*
 * Interleave planar Y, U and V data into packed Y U Y V output.
 *
 * ysrc, usrc, vsrc - source planes
 * dst              - packed destination
 * width, height    - picture size in luma samples / lines
 * lumStride        - added to ysrc after every line
 * chromStride      - added to usrc and vsrc after every vertLumPerChroma-th
 *                    line
 * dstStride        - added to dst after every line
 * vertLumPerChroma - number of luma lines that share one chroma line; the
 *                    bit-mask test used below assumes a power of two
 *
 * Each asm iteration consumes 8 U, 8 V and 16 Y bytes and stores 32 output
 * bytes. The loop body runs before the counter is compared with
 * chromWidth (width/2), so at least one iteration is executed per line and
 * a line is always processed in whole 16-pixel units.
 * NOTE(review): this looks like it expects width to be a positive multiple
 * of 16 - confirm against callers.
 */
01262 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01263                                            int width, int height,
01264                                            int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01265 {
01266     int y;
01267     const x86_reg chromWidth= width>>1;
01268     for (y=0; y<height; y++) {
01269         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
01270         __asm__ volatile(
01271             "xor                 %%"REG_a", %%"REG_a"   \n\t"
01272             ".p2align                    4              \n\t"
01273             "1:                                         \n\t"
01274             PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
01275             PREFETCH"    32(%2, %%"REG_a")              \n\t"
01276             PREFETCH"    32(%3, %%"REG_a")              \n\t"
01277             "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
01278             "movq                    %%mm0, %%mm2       \n\t" // U(0)
01279             "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
01280             "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
01281             "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
01282 
01283             "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
01284             "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
01285             "movq                    %%mm3, %%mm4       \n\t" // Y(0)
01286             "movq                    %%mm5, %%mm6       \n\t" // Y(8)
01287             "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
01288             "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
01289             "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
01290             "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
01291 
01292             MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
01293             MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
01294             MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
01295             MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
01296 
01297             "add                        $8, %%"REG_a"   \n\t"
01298             "cmp                        %4, %%"REG_a"   \n\t"
01299             " jb                        1b              \n\t"
01300             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                  /* "memory": the planes are read and dst is written through
                   * plain register operands, which the compiler cannot see */
01301             : "%"REG_a, "memory"
01302         );
              /* advance chroma only after the last luma line that shares it */
01303         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
01304             usrc += chromStride;
01305             vsrc += chromStride;
01306         }
01307         ysrc += lumStride;
01308         dst  += dstStride;
01309     }
01310     __asm__(EMMS"       \n\t"
01311             SFENCE"     \n\t"
01312             :::"memory");
01313 }
01314 
/*
 * Convert planar Y/U/V to packed YUY2 by calling yuvPlanartoyuy2 with
 * vertLumPerChroma = 2, i.e. every chroma line is used for two consecutive
 * luma lines. Chroma is repeated, not interpolated (see the FIXME below).
 * All other arguments are passed through unchanged.
 */
01319 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01320                                       int width, int height,
01321                                       int lumStride, int chromStride, int dstStride)
01322 {
01323     //FIXME interpolate chroma
01324     RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01325 }
01326 
/*
 * Interleave planar Y, U and V data into packed U Y V Y output.
 *
 * ysrc, usrc, vsrc - source planes
 * dst              - packed destination
 * width, height    - picture size in luma samples / lines
 * lumStride        - added to ysrc after every line
 * chromStride      - added to usrc and vsrc after every vertLumPerChroma-th
 *                    line
 * dstStride        - added to dst after every line
 * vertLumPerChroma - number of luma lines that share one chroma line; the
 *                    bit-mask test used below assumes a power of two
 *
 * Each asm iteration consumes 8 U, 8 V and 16 Y bytes and stores 32 output
 * bytes. The loop body runs before the counter is compared with
 * chromWidth (width/2), so at least one iteration is executed per line and
 * a line is always processed in whole 16-pixel units.
 * NOTE(review): this looks like it expects width to be a positive multiple
 * of 16 - confirm against callers.
 */
01327 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01328                                            int width, int height,
01329                                            int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01330 {
01331     int y;
01332     const x86_reg chromWidth= width>>1;
01333     for (y=0; y<height; y++) {
01334         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
01335         __asm__ volatile(
01336             "xor                %%"REG_a", %%"REG_a"    \n\t"
01337             ".p2align                   4               \n\t"
01338             "1:                                         \n\t"
01339             PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
01340             PREFETCH"   32(%2, %%"REG_a")               \n\t"
01341             PREFETCH"   32(%3, %%"REG_a")               \n\t"
01342             "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
01343             "movq                   %%mm0, %%mm2        \n\t" // U(0)
01344             "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
01345             "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
01346             "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
01347 
01348             "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
01349             "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
01350             "movq                   %%mm0, %%mm4        \n\t" // UVUV UVUV(0)
01351             "movq                   %%mm2, %%mm6        \n\t" // UVUV UVUV(8)
01352             "punpcklbw              %%mm3, %%mm0        \n\t" // UYVY UYVY(0)
01353             "punpckhbw              %%mm3, %%mm4        \n\t" // UYVY UYVY(4)
01354             "punpcklbw              %%mm5, %%mm2        \n\t" // UYVY UYVY(8)
01355             "punpckhbw              %%mm5, %%mm6        \n\t" // UYVY UYVY(12)
01356 
01357             MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
01358             MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
01359             MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
01360             MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
01361 
01362             "add                       $8, %%"REG_a"    \n\t"
01363             "cmp                       %4, %%"REG_a"    \n\t"
01364             " jb                       1b               \n\t"
01365             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                  /* "memory": the planes are read and dst is written through
                   * plain register operands, which the compiler cannot see */
01366             : "%"REG_a, "memory"
01367         );
              /* advance chroma only after the last luma line that shares it */
01368         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
01369             usrc += chromStride;
01370             vsrc += chromStride;
01371         }
01372         ysrc += lumStride;
01373         dst += dstStride;
01374     }
01375     __asm__(EMMS"       \n\t"
01376             SFENCE"     \n\t"
01377             :::"memory");
01378 }
01379 
/*
 * Convert planar Y/U/V to packed UYVY by calling yuvPlanartouyvy with
 * vertLumPerChroma = 2, i.e. every chroma line is used for two consecutive
 * luma lines. Chroma is repeated, not interpolated (see the FIXME below).
 * All other arguments are passed through unchanged.
 */
01384 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01385                                       int width, int height,
01386                                       int lumStride, int chromStride, int dstStride)
01387 {
01388     //FIXME interpolate chroma
01389     RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01390 }
01391 
/*
 * Convert planar Y/U/V to packed UYVY by calling yuvPlanartouyvy with
 * vertLumPerChroma = 1, i.e. the chroma pointers advance after every luma
 * line. All other arguments are passed through unchanged.
 */
01395 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01396                                          int width, int height,
01397                                          int lumStride, int chromStride, int dstStride)
01398 {
01399     RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01400 }
01401 
/*
 * Convert planar Y/U/V to packed YUY2 by calling yuvPlanartoyuy2 with
 * vertLumPerChroma = 1, i.e. the chroma pointers advance after every luma
 * line. All other arguments are passed through unchanged.
 */
01405 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01406                                          int width, int height,
01407                                          int lumStride, int chromStride, int dstStride)
01408 {
01409     RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01410 }
01411 
/**
 * Split packed YUYV data (byte order Y0 U Y1 V) into separate Y, U and V
 * planes, handling two source lines per outer iteration.
 *
 * First line of each pair: luma goes to ydst, U to udst, V to vdst.
 * Second line of each pair: only luma is written, so the chroma of that line
 * is discarded (no vertical averaging).
 * After each pair ydst and src have advanced by two strides, udst and vdst
 * by one chromStride.
 *
 * Each pass of either asm loop reads 32 source bytes and stores 16 luma
 * bytes (the first loop additionally stores 8 U and 8 V bytes). REG_a counts
 * chroma samples from 0 up to chromWidth (= width>>1) in steps of 8 and the
 * loop condition is tested at the bottom, so one pass always runs.
 * NOTE(review): this looks like it expects width to be a multiple of 16 (or
 * buffers padded accordingly) and an even height -- confirm with callers.
 */
01416 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01417                                       int width, int height,
01418                                       int lumStride, int chromStride, int srcStride)
01419 {
01420     int y;
01421     const x86_reg chromWidth= width>>1;
01422     for (y=0; y<height; y+=2) {
        /* First line of the pair: extract Y, U and V. */
01423         __asm__ volatile(
01424             "xor                 %%"REG_a", %%"REG_a"   \n\t" // REG_a = chroma sample index, starts at 0
01425             "pcmpeqw                 %%mm7, %%mm7       \n\t" // mm7 = all ones
01426             "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
01427             ".p2align                    4              \n\t"
01428             "1:                \n\t"
01429             PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
01430             "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
01431             "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
01432             "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
01433             "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
01434             "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
01435             "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
01436             "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
01437             "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
01438             "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
01439             "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
01440 
01441             MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t" // store luma 0..7
01442 
01443             "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
01444             "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
01445             "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
01446             "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
01447             "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
01448             "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
01449             "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
01450             "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
01451             "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
01452             "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
01453 
01454             MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t" // store luma 8..15
01455 
01456             "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
01457             "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
01458             "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
01459             "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
01460             "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
01461             "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
01462             "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
01463             "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
01464 
01465             MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t" // store 8 V samples
01466             MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t" // store 8 U samples
01467 
01468             "add                        $8, %%"REG_a"   \n\t"
01469             "cmp                        %4, %%"REG_a"   \n\t"
01470             " jb                        1b              \n\t" // loop while REG_a < chromWidth (unsigned)
01471             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01472             : "memory", "%"REG_a
01473         );
01474 
01475         ydst += lumStride;
01476         src  += srcStride;
01477 
        /* Second line of the pair: luma only. This statement uses mm7 without
         * initialising it, i.e. it relies on the FF,00 mask loaded by the
         * previous asm statement still being in the register. */
01478         __asm__ volatile(
01479             "xor                 %%"REG_a", %%"REG_a"   \n\t"
01480             ".p2align                    4              \n\t"
01481             "1:                                         \n\t"
01482             PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
01483             "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
01484             "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
01485             "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
01486             "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
01487             "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
01488             "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
01489             "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
01490             "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
01491             "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
01492             "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
01493 
01494             MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
01495             MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
01496 
01497             "add                        $8, %%"REG_a"   \n\t"
01498             "cmp                        %4, %%"REG_a"   \n\t"
01499             " jb                        1b              \n\t"
01500 
01501             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01502             : "memory", "%"REG_a
01503         );
01504         udst += chromStride;
01505         vdst += chromStride;
01506         ydst += lumStride;
01507         src  += srcStride;
01508     }
    /* Leave MMX state and issue the SFENCE macro after the MOVNTQ stores. */
01509     __asm__ volatile(EMMS"       \n\t"
01510                      SFENCE"     \n\t"
01511                      :::"memory");
01512 }
01513 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
01514 
01515 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
01516 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
01517 {
01518     int x,y;
01519 
01520     dst[0]= src[0];
01521 
01522     // first line
01523     for (x=0; x<srcWidth-1; x++) {
01524         dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
01525         dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
01526     }
01527     dst[2*srcWidth-1]= src[srcWidth-1];
01528 
01529     dst+= dstStride;
01530 
01531     for (y=1; y<srcHeight; y++) {
01532         const x86_reg mmxSize= srcWidth&~15;
01533         __asm__ volatile(
01534             "mov           %4, %%"REG_a"            \n\t"
01535             "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
01536             "movq         (%0, %%"REG_a"), %%mm4    \n\t"
01537             "movq                   %%mm4, %%mm2    \n\t"
01538             "psllq                     $8, %%mm4    \n\t"
01539             "pand                   %%mm0, %%mm2    \n\t"
01540             "por                    %%mm2, %%mm4    \n\t"
01541             "movq         (%1, %%"REG_a"), %%mm5    \n\t"
01542             "movq                   %%mm5, %%mm3    \n\t"
01543             "psllq                     $8, %%mm5    \n\t"
01544             "pand                   %%mm0, %%mm3    \n\t"
01545             "por                    %%mm3, %%mm5    \n\t"
01546             "1:                                     \n\t"
01547             "movq         (%0, %%"REG_a"), %%mm0    \n\t"
01548             "movq         (%1, %%"REG_a"), %%mm1    \n\t"
01549             "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
01550             "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
01551             PAVGB"                  %%mm0, %%mm5    \n\t"
01552             PAVGB"                  %%mm0, %%mm3    \n\t"
01553             PAVGB"                  %%mm0, %%mm5    \n\t"
01554             PAVGB"                  %%mm0, %%mm3    \n\t"
01555             PAVGB"                  %%mm1, %%mm4    \n\t"
01556             PAVGB"                  %%mm1, %%mm2    \n\t"
01557             PAVGB"                  %%mm1, %%mm4    \n\t"
01558             PAVGB"                  %%mm1, %%mm2    \n\t"
01559             "movq                   %%mm5, %%mm7    \n\t"
01560             "movq                   %%mm4, %%mm6    \n\t"
01561             "punpcklbw              %%mm3, %%mm5    \n\t"
01562             "punpckhbw              %%mm3, %%mm7    \n\t"
01563             "punpcklbw              %%mm2, %%mm4    \n\t"
01564             "punpckhbw              %%mm2, %%mm6    \n\t"
01565             MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
01566             MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
01567             MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
01568             MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
01569             "add                       $8, %%"REG_a"            \n\t"
01570             "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
01571             "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
01572             " js                       1b                       \n\t"
01573             :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
01574                "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
01575                "g" (-mmxSize)
01576             : "%"REG_a
01577         );
01578 
01579         for (x=mmxSize-1; x<srcWidth-1; x++) {
01580             dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
01581             dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
01582             dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
01583             dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
01584         }
01585         dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
01586         dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
01587 
01588         dst+=dstStride*2;
01589         src+=srcStride;
01590     }
01591 
01592     // last line
01593     dst[0]= src[0];
01594 
01595     for (x=0; x<srcWidth-1; x++) {
01596         dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
01597         dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
01598     }
01599     dst[2*srcWidth-1]= src[srcWidth-1];
01600 
01601     __asm__ volatile(EMMS"       \n\t"
01602                      SFENCE"     \n\t"
01603                      :::"memory");
01604 }
01605 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
01606 
01607 #if !COMPILE_TEMPLATE_AMD3DNOW
01608 
/**
 * Split packed UYVY data (byte order U Y0 V Y1) into separate Y, U and V
 * planes, handling two source lines per outer iteration.
 *
 * First line of each pair: luma goes to ydst, U to udst, V to vdst.
 * Second line of each pair: only luma is written, so the chroma of that line
 * is discarded (no vertical averaging).
 * After each pair ydst and src have advanced by two strides, udst and vdst
 * by one chromStride.
 *
 * Each pass of either asm loop reads 32 source bytes and stores 16 luma
 * bytes (the first loop additionally stores 8 U and 8 V bytes). REG_a counts
 * chroma samples from 0 up to chromWidth (= width>>1) in steps of 8 and the
 * loop condition is tested at the bottom, so one pass always runs.
 * NOTE(review): this looks like it expects width to be a multiple of 16 (or
 * buffers padded accordingly) and an even height -- confirm with callers.
 */
01614 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01615                                       int width, int height,
01616                                       int lumStride, int chromStride, int srcStride)
01617 {
01618     int y;
01619     const x86_reg chromWidth= width>>1;
01620     for (y=0; y<height; y+=2) {
        /* First line of the pair: extract Y, U and V. Chroma sits in the low
         * byte of every 16-bit word, luma in the high byte. */
01621         __asm__ volatile(
01622             "xor                 %%"REG_a", %%"REG_a"   \n\t" // REG_a = chroma sample index, starts at 0
01623             "pcmpeqw             %%mm7, %%mm7   \n\t" // mm7 = all ones
01624             "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
01625             ".p2align                4          \n\t"
01626             "1:                                 \n\t"
01627             PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
01628             "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
01629             "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
01630             "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
01631             "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
01632             "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
01633             "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
01634             "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
01635             "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
01636             "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
01637             "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
01638 
01639             MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t" // store luma 0..7
01640 
01641             "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
01642             "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
01643             "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
01644             "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
01645             "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
01646             "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
01647             "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
01648             "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
01649             "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
01650             "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
01651 
01652             MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t" // store luma 8..15
01653 
01654             "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
01655             "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
01656             "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
01657             "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
01658             "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
01659             "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
01660             "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
01661             "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
01662 
01663             MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t" // store 8 V samples
01664             MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t" // store 8 U samples
01665 
01666             "add                    $8, %%"REG_a"   \n\t"
01667             "cmp                    %4, %%"REG_a"   \n\t"
01668             " jb                    1b          \n\t" // loop while REG_a < chromWidth (unsigned)
01669             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01670             : "memory", "%"REG_a
01671         );
01672 
01673         ydst += lumStride;
01674         src  += srcStride;
01675 
        /* Second line of the pair: luma only (high byte of every word). */
01676         __asm__ volatile(
01677             "xor                 %%"REG_a", %%"REG_a"   \n\t"
01678             ".p2align                    4              \n\t"
01679             "1:                                 \n\t"
01680             PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
01681             "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
01682             "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
01683             "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
01684             "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
01685             "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
01686             "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
01687             "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
01688             "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
01689             "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
01690             "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
01691 
01692             MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
01693             MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
01694 
01695             "add                    $8, %%"REG_a"   \n\t"
01696             "cmp                    %4, %%"REG_a"   \n\t"
01697             " jb                    1b          \n\t"
01698 
01699             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01700             : "memory", "%"REG_a
01701         );
01702         udst += chromStride;
01703         vdst += chromStride;
01704         ydst += lumStride;
01705         src  += srcStride;
01706     }
    /* Leave MMX state and issue the SFENCE macro after the MOVNTQ stores. */
01707     __asm__ volatile(EMMS"       \n\t"
01708                      SFENCE"     \n\t"
01709                      :::"memory");
01710 }
01711 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
01712 
/**
 * Convert packed 24-bit RGB data (3 bytes per pixel) to planar Y, U and V
 * with 2x2 subsampled chroma.
 *
 * Lines are handled in pairs. For every pair the luma of both lines is
 * computed first, then one line of chroma is computed from the average of
 * each 2x2 pixel group of the pair. The asm loop stops while y < height-2;
 * whatever lines remain afterwards are passed to rgb24toyv12_c().
 *
 * Luma loop:   8 pixels (24 source bytes) per pass.
 * Chroma loop: 4 U and 4 V samples (24 bytes from each of the two lines)
 *              per pass.
 * Both loops count a negative index up to zero and test at the bottom.
 * NOTE(review): this looks like it expects width to be a multiple of 8 (or
 * padded buffers), and some loads reach a few bytes past the pixels actually
 * used -- confirm the buffer requirements with callers.
 * NOTE(review): the coefficient names (ff_bgr2YCoeff etc.) suggest B,G,R
 * byte order in memory -- confirm.
 * NOTE(review): neither asm statement lists "memory" as clobbered although
 * both store through ydst/udst/vdst -- worth checking.
 */
01720 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01721                                        int width, int height,
01722                                        int lumStride, int chromStride, int srcStride)
01723 {
01724     int y;
01725     const x86_reg chromWidth= width>>1;
01726     for (y=0; y<height-2; y+=2) {
01727         int i;
        /* Luma for the two lines of the pair. */
01728         for (i=0; i<2; i++) {
01729             __asm__ volatile(
01730                 "mov                        %2, %%"REG_a"   \n\t" // REG_a = -width (pixel index)
01731                 "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
01732                 "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
01733                 "pxor                    %%mm7, %%mm7       \n\t" // mm7 = 0, used to widen bytes to words
01734                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = 3*REG_a (byte offset)
01735                 ".p2align                    4              \n\t"
01736                 "1:                                         \n\t"
01737                 PREFETCH"    64(%0, %%"REG_d")              \n\t"
01738                 "movd          (%0, %%"REG_d"), %%mm0       \n\t" // pixel 0
01739                 "movd         3(%0, %%"REG_d"), %%mm1       \n\t" // pixel 1
01740                 "punpcklbw               %%mm7, %%mm0       \n\t"
01741                 "punpcklbw               %%mm7, %%mm1       \n\t"
01742                 "movd         6(%0, %%"REG_d"), %%mm2       \n\t" // pixel 2
01743                 "movd         9(%0, %%"REG_d"), %%mm3       \n\t" // pixel 3
01744                 "punpcklbw               %%mm7, %%mm2       \n\t"
01745                 "punpcklbw               %%mm7, %%mm3       \n\t"
01746                 "pmaddwd                 %%mm6, %%mm0       \n\t"
01747                 "pmaddwd                 %%mm6, %%mm1       \n\t"
01748                 "pmaddwd                 %%mm6, %%mm2       \n\t"
01749                 "pmaddwd                 %%mm6, %%mm3       \n\t"
01750 #ifndef FAST_BGR2YV12
01751                 "psrad                      $8, %%mm0       \n\t"
01752                 "psrad                      $8, %%mm1       \n\t"
01753                 "psrad                      $8, %%mm2       \n\t"
01754                 "psrad                      $8, %%mm3       \n\t"
01755 #endif
01756                 "packssdw                %%mm1, %%mm0       \n\t"
01757                 "packssdw                %%mm3, %%mm2       \n\t"
01758                 "pmaddwd                 %%mm5, %%mm0       \n\t" // multiply-add with ff_w1111 to combine the partial products
01759                 "pmaddwd                 %%mm5, %%mm2       \n\t"
01760                 "packssdw                %%mm2, %%mm0       \n\t"
01761                 "psraw                      $7, %%mm0       \n\t"
01762 
01763                 "movd        12(%0, %%"REG_d"), %%mm4       \n\t" // pixel 4
01764                 "movd        15(%0, %%"REG_d"), %%mm1       \n\t" // pixel 5
01765                 "punpcklbw               %%mm7, %%mm4       \n\t"
01766                 "punpcklbw               %%mm7, %%mm1       \n\t"
01767                 "movd        18(%0, %%"REG_d"), %%mm2       \n\t" // pixel 6
01768                 "movd        21(%0, %%"REG_d"), %%mm3       \n\t" // pixel 7
01769                 "punpcklbw               %%mm7, %%mm2       \n\t"
01770                 "punpcklbw               %%mm7, %%mm3       \n\t"
01771                 "pmaddwd                 %%mm6, %%mm4       \n\t"
01772                 "pmaddwd                 %%mm6, %%mm1       \n\t"
01773                 "pmaddwd                 %%mm6, %%mm2       \n\t"
01774                 "pmaddwd                 %%mm6, %%mm3       \n\t"
01775 #ifndef FAST_BGR2YV12
01776                 "psrad                      $8, %%mm4       \n\t"
01777                 "psrad                      $8, %%mm1       \n\t"
01778                 "psrad                      $8, %%mm2       \n\t"
01779                 "psrad                      $8, %%mm3       \n\t"
01780 #endif
01781                 "packssdw                %%mm1, %%mm4       \n\t"
01782                 "packssdw                %%mm3, %%mm2       \n\t"
01783                 "pmaddwd                 %%mm5, %%mm4       \n\t"
01784                 "pmaddwd                 %%mm5, %%mm2       \n\t"
01785                 "add                       $24, %%"REG_d"   \n\t" // advance 8 pixels in the source
01786                 "packssdw                %%mm2, %%mm4       \n\t"
01787                 "psraw                      $7, %%mm4       \n\t"
01788 
01789                 "packuswb                %%mm4, %%mm0       \n\t" // 8 luma bytes
01790                 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
01791 
01792                 MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
01793                 "add                        $8,      %%"REG_a"  \n\t"
01794                 " js                        1b                  \n\t" // loop while the index is still negative
01795                 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
01796                 : "%"REG_a, "%"REG_d
01797             );
01798             ydst += lumStride;
01799             src  += srcStride;
01800         }
        /* Rewind to the first line of the pair for the chroma pass. */
01801         src -= srcStride*2;
        /* Chroma: %0 = upper line, %1 = lower line, %2 = U out, %3 = V out. */
01802         __asm__ volatile(
01803             "mov                        %4, %%"REG_a"   \n\t" // REG_a = -chromWidth (chroma index)
01804             "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
01805             "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
01806             "pxor                    %%mm7, %%mm7       \n\t"
01807             "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
01808             "add                 %%"REG_d", %%"REG_d"   \n\t" // REG_d = 6*REG_a (two 3-byte pixels per chroma sample)
01809             ".p2align                    4              \n\t"
01810             "1:                                         \n\t"
01811             PREFETCH"    64(%0, %%"REG_d")              \n\t"
01812             PREFETCH"    64(%1, %%"REG_d")              \n\t"
01813 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
            // average the two lines, then average with the copy shifted by
            // one pixel (24 bits) to get the 2x2 mean
01814             "movq          (%0, %%"REG_d"), %%mm0       \n\t"
01815             "movq          (%1, %%"REG_d"), %%mm1       \n\t"
01816             "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
01817             "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
01818             PAVGB"                   %%mm1, %%mm0       \n\t"
01819             PAVGB"                   %%mm3, %%mm2       \n\t"
01820             "movq                    %%mm0, %%mm1       \n\t"
01821             "movq                    %%mm2, %%mm3       \n\t"
01822             "psrlq                     $24, %%mm0       \n\t"
01823             "psrlq                     $24, %%mm2       \n\t"
01824             PAVGB"                   %%mm1, %%mm0       \n\t"
01825             PAVGB"                   %%mm3, %%mm2       \n\t"
01826             "punpcklbw               %%mm7, %%mm0       \n\t"
01827             "punpcklbw               %%mm7, %%mm2       \n\t"
01828 #else
            // plain MMX: widen to words, add the four pixels of each 2x2
            // group and divide by 4
01829             "movd          (%0, %%"REG_d"), %%mm0       \n\t"
01830             "movd          (%1, %%"REG_d"), %%mm1       \n\t"
01831             "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
01832             "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
01833             "punpcklbw               %%mm7, %%mm0       \n\t"
01834             "punpcklbw               %%mm7, %%mm1       \n\t"
01835             "punpcklbw               %%mm7, %%mm2       \n\t"
01836             "punpcklbw               %%mm7, %%mm3       \n\t"
01837             "paddw                   %%mm1, %%mm0       \n\t"
01838             "paddw                   %%mm3, %%mm2       \n\t"
01839             "paddw                   %%mm2, %%mm0       \n\t"
01840             "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
01841             "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
01842             "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
01843             "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
01844             "punpcklbw               %%mm7, %%mm4       \n\t"
01845             "punpcklbw               %%mm7, %%mm1       \n\t"
01846             "punpcklbw               %%mm7, %%mm2       \n\t"
01847             "punpcklbw               %%mm7, %%mm3       \n\t"
01848             "paddw                   %%mm1, %%mm4       \n\t"
01849             "paddw                   %%mm3, %%mm2       \n\t"
01850             "paddw                   %%mm4, %%mm2       \n\t"
01851             "psrlw                      $2, %%mm0       \n\t"
01852             "psrlw                      $2, %%mm2       \n\t"
01853 #endif
01854             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
01855             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
01856 
01857             "pmaddwd                 %%mm0, %%mm1       \n\t" // V partial products
01858             "pmaddwd                 %%mm2, %%mm3       \n\t"
01859             "pmaddwd                 %%mm6, %%mm0       \n\t" // U partial products
01860             "pmaddwd                 %%mm6, %%mm2       \n\t"
01861 #ifndef FAST_BGR2YV12
01862             "psrad                      $8, %%mm0       \n\t"
01863             "psrad                      $8, %%mm1       \n\t"
01864             "psrad                      $8, %%mm2       \n\t"
01865             "psrad                      $8, %%mm3       \n\t"
01866 #endif
01867             "packssdw                %%mm2, %%mm0       \n\t"
01868             "packssdw                %%mm3, %%mm1       \n\t"
01869             "pmaddwd                 %%mm5, %%mm0       \n\t"
01870             "pmaddwd                 %%mm5, %%mm1       \n\t"
01871             "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
01872             "psraw                      $7, %%mm0       \n\t"
01873 
01874 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
01875             "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
01876             "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
01877             "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
01878             "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
01879             PAVGB"                   %%mm1, %%mm4       \n\t"
01880             PAVGB"                   %%mm3, %%mm2       \n\t"
01881             "movq                    %%mm4, %%mm1       \n\t"
01882             "movq                    %%mm2, %%mm3       \n\t"
01883             "psrlq                     $24, %%mm4       \n\t"
01884             "psrlq                     $24, %%mm2       \n\t"
01885             PAVGB"                   %%mm1, %%mm4       \n\t"
01886             PAVGB"                   %%mm3, %%mm2       \n\t"
01887             "punpcklbw               %%mm7, %%mm4       \n\t"
01888             "punpcklbw               %%mm7, %%mm2       \n\t"
01889 #else
01890             "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
01891             "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
01892             "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
01893             "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
01894             "punpcklbw               %%mm7, %%mm4       \n\t"
01895             "punpcklbw               %%mm7, %%mm1       \n\t"
01896             "punpcklbw               %%mm7, %%mm2       \n\t"
01897             "punpcklbw               %%mm7, %%mm3       \n\t"
01898             "paddw                   %%mm1, %%mm4       \n\t"
01899             "paddw                   %%mm3, %%mm2       \n\t"
01900             "paddw                   %%mm2, %%mm4       \n\t"
01901             "movd        18(%0, %%"REG_d"), %%mm5       \n\t" // mm5 is borrowed as a temporary here
01902             "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
01903             "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
01904             "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
01905             "punpcklbw               %%mm7, %%mm5       \n\t"
01906             "punpcklbw               %%mm7, %%mm1       \n\t"
01907             "punpcklbw               %%mm7, %%mm2       \n\t"
01908             "punpcklbw               %%mm7, %%mm3       \n\t"
01909             "paddw                   %%mm1, %%mm5       \n\t"
01910             "paddw                   %%mm3, %%mm2       \n\t"
01911             "paddw                   %%mm5, %%mm2       \n\t"
01912             "movq       "MANGLE(ff_w1111)", %%mm5       \n\t" // restore ff_w1111 after using mm5 as a temporary
01913             "psrlw                      $2, %%mm4       \n\t"
01914             "psrlw                      $2, %%mm2       \n\t"
01915 #endif
01916             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
01917             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
01918 
01919             "pmaddwd                 %%mm4, %%mm1       \n\t"
01920             "pmaddwd                 %%mm2, %%mm3       \n\t"
01921             "pmaddwd                 %%mm6, %%mm4       \n\t"
01922             "pmaddwd                 %%mm6, %%mm2       \n\t"
01923 #ifndef FAST_BGR2YV12
01924             "psrad                      $8, %%mm4       \n\t"
01925             "psrad                      $8, %%mm1       \n\t"
01926             "psrad                      $8, %%mm2       \n\t"
01927             "psrad                      $8, %%mm3       \n\t"
01928 #endif
01929             "packssdw                %%mm2, %%mm4       \n\t"
01930             "packssdw                %%mm3, %%mm1       \n\t"
01931             "pmaddwd                 %%mm5, %%mm4       \n\t"
01932             "pmaddwd                 %%mm5, %%mm1       \n\t"
01933             "add                       $24, %%"REG_d"   \n\t" // advance 8 source pixels (4 chroma samples)
01934             "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
01935             "psraw                      $7, %%mm4       \n\t"
01936 
01937             "movq                    %%mm0, %%mm1           \n\t"
01938             "punpckldq               %%mm4, %%mm0           \n\t" // U3 U2 U1 U0
01939             "punpckhdq               %%mm4, %%mm1           \n\t" // V3 V2 V1 V0
01940             "packsswb                %%mm1, %%mm0           \n\t"
01941             "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
01942             "movd                    %%mm0, (%2, %%"REG_a") \n\t" // store 4 U samples
01943             "punpckhdq               %%mm0, %%mm0           \n\t"
01944             "movd                    %%mm0, (%3, %%"REG_a") \n\t" // store 4 V samples
01945             "add                        $4, %%"REG_a"       \n\t"
01946             " js                        1b                  \n\t"
01947             : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
01948             : "%"REG_a, "%"REG_d
01949         );
01950 
01951         udst += chromStride;
01952         vdst += chromStride;
01953         src  += srcStride*2;
01954     }
01955 
01956     __asm__ volatile(EMMS"       \n\t"
01957                      SFENCE"     \n\t"
01958                      :::"memory");
01959 
    /* Remaining height-y lines (at least the final pair) go through the C version. */
01960      rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
01961 }
01962 #endif /* !COMPILE_TEMPLATE_SSE2 */
01963 
01964 #if !COMPILE_TEMPLATE_AMD3DNOW
01965 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
01966                                     int width, int height, int src1Stride,
01967                                     int src2Stride, int dstStride)
01968 {
01969     int h;
01970 
01971     for (h=0; h < height; h++) {
01972         int w;
01973 
01974 #if COMPILE_TEMPLATE_SSE2
01975         __asm__(
01976             "xor              %%"REG_a", %%"REG_a"  \n\t"
01977             "1:                                     \n\t"
01978             PREFETCH" 64(%1, %%"REG_a")             \n\t"
01979             PREFETCH" 64(%2, %%"REG_a")             \n\t"
01980             "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
01981             "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
01982             "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
01983             "punpcklbw           %%xmm2, %%xmm0     \n\t"
01984             "punpckhbw           %%xmm2, %%xmm1     \n\t"
01985             "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
01986             "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
01987             "add                    $16, %%"REG_a"  \n\t"
01988             "cmp                     %3, %%"REG_a"  \n\t"
01989             " jb                     1b             \n\t"
01990             ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
01991             : "memory", "%"REG_a""
01992         );
01993 #else
01994         __asm__(
01995             "xor %%"REG_a", %%"REG_a"               \n\t"
01996             "1:                                     \n\t"
01997             PREFETCH" 64(%1, %%"REG_a")             \n\t"
01998             PREFETCH" 64(%2, %%"REG_a")             \n\t"
01999             "movq       (%1, %%"REG_a"), %%mm0      \n\t"
02000             "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
02001             "movq                 %%mm0, %%mm1      \n\t"
02002             "movq                 %%mm2, %%mm3      \n\t"
02003             "movq       (%2, %%"REG_a"), %%mm4      \n\t"
02004             "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
02005             "punpcklbw            %%mm4, %%mm0      \n\t"
02006             "punpckhbw            %%mm4, %%mm1      \n\t"
02007             "punpcklbw            %%mm5, %%mm2      \n\t"
02008             "punpckhbw            %%mm5, %%mm3      \n\t"
02009             MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
02010             MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
02011             MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
02012             MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
02013             "add                    $16, %%"REG_a"  \n\t"
02014             "cmp                     %3, %%"REG_a"  \n\t"
02015             " jb                     1b             \n\t"
02016             ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
02017             : "memory", "%"REG_a
02018         );
02019 #endif
02020         for (w= (width&(~15)); w < width; w++) {
02021             dest[2*w+0] = src1[w];
02022             dest[2*w+1] = src2[w];
02023         }
02024         dest += dstStride;
02025         src1 += src1Stride;
02026         src2 += src2Stride;
02027     }
02028     __asm__(
02029             EMMS"       \n\t"
02030             SFENCE"     \n\t"
02031             ::: "memory"
02032             );
02033 }
02034 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02035 
02036 #if !COMPILE_TEMPLATE_SSE2
02037 #if !COMPILE_TEMPLATE_AMD3DNOW
02038 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
02039                                        uint8_t *dst1, uint8_t *dst2,
02040                                        int width, int height,
02041                                        int srcStride1, int srcStride2,
02042                                        int dstStride1, int dstStride2)
02043 {
02044     x86_reg y;
02045     int x,w,h;
02046     w=width/2; h=height/2;
02047     __asm__ volatile(
02048         PREFETCH" %0    \n\t"
02049         PREFETCH" %1    \n\t"
02050         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
02051     for (y=0;y<h;y++) {
02052         const uint8_t* s1=src1+srcStride1*(y>>1);
02053         uint8_t* d=dst1+dstStride1*y;
02054         x=0;
02055         for (;x<w-31;x+=32) {
02056             __asm__ volatile(
02057                 PREFETCH"   32%1        \n\t"
02058                 "movq         %1, %%mm0 \n\t"
02059                 "movq        8%1, %%mm2 \n\t"
02060                 "movq       16%1, %%mm4 \n\t"
02061                 "movq       24%1, %%mm6 \n\t"
02062                 "movq      %%mm0, %%mm1 \n\t"
02063                 "movq      %%mm2, %%mm3 \n\t"
02064                 "movq      %%mm4, %%mm5 \n\t"
02065                 "movq      %%mm6, %%mm7 \n\t"
02066                 "punpcklbw %%mm0, %%mm0 \n\t"
02067                 "punpckhbw %%mm1, %%mm1 \n\t"
02068                 "punpcklbw %%mm2, %%mm2 \n\t"
02069                 "punpckhbw %%mm3, %%mm3 \n\t"
02070                 "punpcklbw %%mm4, %%mm4 \n\t"
02071                 "punpckhbw %%mm5, %%mm5 \n\t"
02072                 "punpcklbw %%mm6, %%mm6 \n\t"
02073                 "punpckhbw %%mm7, %%mm7 \n\t"
02074                 MOVNTQ"    %%mm0,   %0  \n\t"
02075                 MOVNTQ"    %%mm1,  8%0  \n\t"
02076                 MOVNTQ"    %%mm2, 16%0  \n\t"
02077                 MOVNTQ"    %%mm3, 24%0  \n\t"
02078                 MOVNTQ"    %%mm4, 32%0  \n\t"
02079                 MOVNTQ"    %%mm5, 40%0  \n\t"
02080                 MOVNTQ"    %%mm6, 48%0  \n\t"
02081                 MOVNTQ"    %%mm7, 56%0"
02082                 :"=m"(d[2*x])
02083                 :"m"(s1[x])
02084                 :"memory");
02085         }
02086         for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
02087     }
02088     for (y=0;y<h;y++) {
02089         const uint8_t* s2=src2+srcStride2*(y>>1);
02090         uint8_t* d=dst2+dstStride2*y;
02091         x=0;
02092         for (;x<w-31;x+=32) {
02093             __asm__ volatile(
02094                 PREFETCH"   32%1        \n\t"
02095                 "movq         %1, %%mm0 \n\t"
02096                 "movq        8%1, %%mm2 \n\t"
02097                 "movq       16%1, %%mm4 \n\t"
02098                 "movq       24%1, %%mm6 \n\t"
02099                 "movq      %%mm0, %%mm1 \n\t"
02100                 "movq      %%mm2, %%mm3 \n\t"
02101                 "movq      %%mm4, %%mm5 \n\t"
02102                 "movq      %%mm6, %%mm7 \n\t"
02103                 "punpcklbw %%mm0, %%mm0 \n\t"
02104                 "punpckhbw %%mm1, %%mm1 \n\t"
02105                 "punpcklbw %%mm2, %%mm2 \n\t"
02106                 "punpckhbw %%mm3, %%mm3 \n\t"
02107                 "punpcklbw %%mm4, %%mm4 \n\t"
02108                 "punpckhbw %%mm5, %%mm5 \n\t"
02109                 "punpcklbw %%mm6, %%mm6 \n\t"
02110                 "punpckhbw %%mm7, %%mm7 \n\t"
02111                 MOVNTQ"    %%mm0,   %0  \n\t"
02112                 MOVNTQ"    %%mm1,  8%0  \n\t"
02113                 MOVNTQ"    %%mm2, 16%0  \n\t"
02114                 MOVNTQ"    %%mm3, 24%0  \n\t"
02115                 MOVNTQ"    %%mm4, 32%0  \n\t"
02116                 MOVNTQ"    %%mm5, 40%0  \n\t"
02117                 MOVNTQ"    %%mm6, 48%0  \n\t"
02118                 MOVNTQ"    %%mm7, 56%0"
02119                 :"=m"(d[2*x])
02120                 :"m"(s2[x])
02121                 :"memory");
02122         }
02123         for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
02124     }
02125     __asm__(
02126             EMMS"       \n\t"
02127             SFENCE"     \n\t"
02128             ::: "memory"
02129         );
02130 }
02131 
/**
 * Interleave one luma plane and two chroma planes into packed
 * Y U Y V byte order (MMX).
 *
 * For every index x the code consumes four luma bytes (yp[4*x .. 4*x+3]) and
 * one byte from each chroma plane (up[x], vp[x]) and emits eight output
 * bytes: Y U Y V Y U Y V, reusing the same U/V pair for all four luma
 * samples. Chroma rows are taken from row y/4 of src2/src3.
 *
 * @param src1       luma plane
 * @param src2,src3  chroma planes, written to the U and V positions
 *                   respectively
 * @param dst        packed output
 * @param width      the per-row loop runs x from 0 to width/2
 * @param height     number of output rows
 * @param srcStride1,srcStride2,srcStride3 byte distance between rows of
 *                   src1, src2 and src3
 * @param dstStride  byte distance between output rows
 *
 * NOTE(review): with the loop bound width/2, each row reads 2*width luma
 * bytes and writes 4*width output bytes -- confirm against the callers
 * which unit `width` is expected to be in.
 */
02132 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
02133                                         uint8_t *dst,
02134                                         int width, int height,
02135                                         int srcStride1, int srcStride2,
02136                                         int srcStride3, int dstStride)
02137 {
02138     x86_reg x;
02139     int y,w,h;
02140     w=width/2; h=height;
02141     for (y=0;y<h;y++) {
02142         const uint8_t* yp=src1+srcStride1*y;
02143         const uint8_t* up=src2+srcStride2*(y>>2);
02144         const uint8_t* vp=src3+srcStride3*(y>>2);
02145         uint8_t* d=dst+dstStride*y;
02146         x=0;
              /* Vector path: 8 chroma pairs + 32 luma bytes -> 64 output bytes
               * per iteration. x is passed as "+r" but the asm never modifies it. */
02147         for (;x<w-7;x+=8) {
02148             __asm__ volatile(
02149                 PREFETCH"   32(%1, %0)          \n\t"
02150                 PREFETCH"   32(%2, %0)          \n\t"
02151                 PREFETCH"   32(%3, %0)          \n\t"
02152                 "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
02153                 "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
02154                 "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
02155                 "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
02156                 "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
02157                 "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
02158                 "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
02159                 "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
02160                 "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
02161                 "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
02162
02163                 "movq            %%mm1, %%mm6   \n\t"
02164                 "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
02165                 "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
02166                 "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
02167                 MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
02168                 MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
02169
02170                 "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
02171                 "movq     8(%1, %0, 4), %%mm0   \n\t"
02172                 "movq            %%mm0, %%mm3   \n\t"
02173                 "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
02174                 "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
02175                 MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
02176                 MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
02177
02178                 "movq            %%mm4, %%mm6   \n\t"
02179                 "movq    16(%1, %0, 4), %%mm0   \n\t"
02180                 "movq            %%mm0, %%mm3   \n\t"
02181                 "punpcklbw       %%mm5, %%mm4   \n\t"
02182                 "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
02183                 "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
02184                 MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
02185                 MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
02186
02187                 "punpckhbw       %%mm5, %%mm6   \n\t"
02188                 "movq    24(%1, %0, 4), %%mm0   \n\t"
02189                 "movq            %%mm0, %%mm3   \n\t"
02190                 "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
02191                 "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
02192                 MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
02193                 MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
02194
02195                 : "+r" (x)
02196                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
02197                 :"memory");
02198         }
              /* Scalar tail: same layout, one chroma pair (four luma bytes) at a time. */
02199         for (; x<w; x++) {
02200             const int x2 = x<<2;
02201             d[8*x+0] = yp[x2];
02202             d[8*x+1] = up[x];
02203             d[8*x+2] = yp[x2+1];
02204             d[8*x+3] = vp[x];
02205             d[8*x+4] = yp[x2+2];
02206             d[8*x+5] = up[x];
02207             d[8*x+6] = yp[x2+3];
02208             d[8*x+7] = vp[x];
02209         }
02210     }
          /* Leave MMX state and order the preceding stores. */
02211     __asm__(
02212             EMMS"       \n\t"
02213             SFENCE"     \n\t"
02214             ::: "memory"
02215         );
02216 }
02217 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02218 
02219 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
02220 {
02221     dst +=   count;
02222     src += 2*count;
02223     count= - count;
02224 
02225     if(count <= -16) {
02226         count += 15;
02227         __asm__ volatile(
02228             "pcmpeqw       %%mm7, %%mm7        \n\t"
02229             "psrlw            $8, %%mm7        \n\t"
02230             "1:                                \n\t"
02231             "movq -30(%1, %0, 2), %%mm0        \n\t"
02232             "movq -22(%1, %0, 2), %%mm1        \n\t"
02233             "movq -14(%1, %0, 2), %%mm2        \n\t"
02234             "movq  -6(%1, %0, 2), %%mm3        \n\t"
02235             "pand          %%mm7, %%mm0        \n\t"
02236             "pand          %%mm7, %%mm1        \n\t"
02237             "pand          %%mm7, %%mm2        \n\t"
02238             "pand          %%mm7, %%mm3        \n\t"
02239             "packuswb      %%mm1, %%mm0        \n\t"
02240             "packuswb      %%mm3, %%mm2        \n\t"
02241             MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
02242             MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
02243             "add             $16, %0           \n\t"
02244             " js 1b                            \n\t"
02245             : "+r"(count)
02246             : "r"(src), "r"(dst)
02247         );
02248         count -= 15;
02249     }
02250     while(count<0) {
02251         dst[count]= src[2*count];
02252         count++;
02253     }
02254 }
02255 
02256 #if !COMPILE_TEMPLATE_AMD3DNOW
02257 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02258 {
02259     dst0+=   count;
02260     dst1+=   count;
02261     src += 4*count;
02262     count= - count;
02263     if(count <= -8) {
02264         count += 7;
02265         __asm__ volatile(
02266             "pcmpeqw       %%mm7, %%mm7        \n\t"
02267             "psrlw            $8, %%mm7        \n\t"
02268             "1:                                \n\t"
02269             "movq -28(%1, %0, 4), %%mm0        \n\t"
02270             "movq -20(%1, %0, 4), %%mm1        \n\t"
02271             "movq -12(%1, %0, 4), %%mm2        \n\t"
02272             "movq  -4(%1, %0, 4), %%mm3        \n\t"
02273             "pand          %%mm7, %%mm0        \n\t"
02274             "pand          %%mm7, %%mm1        \n\t"
02275             "pand          %%mm7, %%mm2        \n\t"
02276             "pand          %%mm7, %%mm3        \n\t"
02277             "packuswb      %%mm1, %%mm0        \n\t"
02278             "packuswb      %%mm3, %%mm2        \n\t"
02279             "movq          %%mm0, %%mm1        \n\t"
02280             "movq          %%mm2, %%mm3        \n\t"
02281             "psrlw            $8, %%mm0        \n\t"
02282             "psrlw            $8, %%mm2        \n\t"
02283             "pand          %%mm7, %%mm1        \n\t"
02284             "pand          %%mm7, %%mm3        \n\t"
02285             "packuswb      %%mm2, %%mm0        \n\t"
02286             "packuswb      %%mm3, %%mm1        \n\t"
02287             MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
02288             MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
02289             "add              $8, %0           \n\t"
02290             " js 1b                            \n\t"
02291             : "+r"(count)
02292             : "r"(src), "r"(dst0), "r"(dst1)
02293         );
02294         count -= 7;
02295     }
02296     while(count<0) {
02297         dst0[count]= src[4*count+0];
02298         dst1[count]= src[4*count+2];
02299         count++;
02300     }
02301 }
02302 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02303 
02304 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02305 {
02306     dst0 +=   count;
02307     dst1 +=   count;
02308     src0 += 4*count;
02309     src1 += 4*count;
02310     count= - count;
02311 #ifdef PAVGB
02312     if(count <= -8) {
02313         count += 7;
02314         __asm__ volatile(
02315             "pcmpeqw        %%mm7, %%mm7        \n\t"
02316             "psrlw             $8, %%mm7        \n\t"
02317             "1:                                \n\t"
02318             "movq  -28(%1, %0, 4), %%mm0        \n\t"
02319             "movq  -20(%1, %0, 4), %%mm1        \n\t"
02320             "movq  -12(%1, %0, 4), %%mm2        \n\t"
02321             "movq   -4(%1, %0, 4), %%mm3        \n\t"
02322             PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
02323             PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
02324             PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
02325             PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
02326             "pand           %%mm7, %%mm0        \n\t"
02327             "pand           %%mm7, %%mm1        \n\t"
02328             "pand           %%mm7, %%mm2        \n\t"
02329             "pand           %%mm7, %%mm3        \n\t"
02330             "packuswb       %%mm1, %%mm0        \n\t"
02331             "packuswb       %%mm3, %%mm2        \n\t"
02332             "movq           %%mm0, %%mm1        \n\t"
02333             "movq           %%mm2, %%mm3        \n\t"
02334             "psrlw             $8, %%mm0        \n\t"
02335             "psrlw             $8, %%mm2        \n\t"
02336             "pand           %%mm7, %%mm1        \n\t"
02337             "pand           %%mm7, %%mm3        \n\t"
02338             "packuswb       %%mm2, %%mm0        \n\t"
02339             "packuswb       %%mm3, %%mm1        \n\t"
02340             MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
02341             MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
02342             "add               $8, %0           \n\t"
02343             " js 1b                            \n\t"
02344             : "+r"(count)
02345             : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
02346         );
02347         count -= 7;
02348     }
02349 #endif
02350     while(count<0) {
02351         dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
02352         dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
02353         count++;
02354     }
02355 }
02356 
02357 #if !COMPILE_TEMPLATE_AMD3DNOW
02358 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02359 {
02360     dst0+=   count;
02361     dst1+=   count;
02362     src += 4*count;
02363     count= - count;
02364     if(count <= -8) {
02365         count += 7;
02366         __asm__ volatile(
02367             "pcmpeqw       %%mm7, %%mm7        \n\t"
02368             "psrlw            $8, %%mm7        \n\t"
02369             "1:                                \n\t"
02370             "movq -28(%1, %0, 4), %%mm0        \n\t"
02371             "movq -20(%1, %0, 4), %%mm1        \n\t"
02372             "movq -12(%1, %0, 4), %%mm2        \n\t"
02373             "movq  -4(%1, %0, 4), %%mm3        \n\t"
02374             "psrlw            $8, %%mm0        \n\t"
02375             "psrlw            $8, %%mm1        \n\t"
02376             "psrlw            $8, %%mm2        \n\t"
02377             "psrlw            $8, %%mm3        \n\t"
02378             "packuswb      %%mm1, %%mm0        \n\t"
02379             "packuswb      %%mm3, %%mm2        \n\t"
02380             "movq          %%mm0, %%mm1        \n\t"
02381             "movq          %%mm2, %%mm3        \n\t"
02382             "psrlw            $8, %%mm0        \n\t"
02383             "psrlw            $8, %%mm2        \n\t"
02384             "pand          %%mm7, %%mm1        \n\t"
02385             "pand          %%mm7, %%mm3        \n\t"
02386             "packuswb      %%mm2, %%mm0        \n\t"
02387             "packuswb      %%mm3, %%mm1        \n\t"
02388             MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
02389             MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
02390             "add              $8, %0           \n\t"
02391             " js 1b                            \n\t"
02392             : "+r"(count)
02393             : "r"(src), "r"(dst0), "r"(dst1)
02394         );
02395         count -= 7;
02396     }
02397     src++;
02398     while(count<0) {
02399         dst0[count]= src[4*count+0];
02400         dst1[count]= src[4*count+2];
02401         count++;
02402     }
02403 }
02404 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02405 
02406 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02407 {
02408     dst0 +=   count;
02409     dst1 +=   count;
02410     src0 += 4*count;
02411     src1 += 4*count;
02412     count= - count;
02413 #ifdef PAVGB
02414     if(count <= -8) {
02415         count += 7;
02416         __asm__ volatile(
02417             "pcmpeqw        %%mm7, %%mm7        \n\t"
02418             "psrlw             $8, %%mm7        \n\t"
02419             "1:                                \n\t"
02420             "movq  -28(%1, %0, 4), %%mm0        \n\t"
02421             "movq  -20(%1, %0, 4), %%mm1        \n\t"
02422             "movq  -12(%1, %0, 4), %%mm2        \n\t"
02423             "movq   -4(%1, %0, 4), %%mm3        \n\t"
02424             PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
02425             PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
02426             PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
02427             PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
02428             "psrlw             $8, %%mm0        \n\t"
02429             "psrlw             $8, %%mm1        \n\t"
02430             "psrlw             $8, %%mm2        \n\t"
02431             "psrlw             $8, %%mm3        \n\t"
02432             "packuswb       %%mm1, %%mm0        \n\t"
02433             "packuswb       %%mm3, %%mm2        \n\t"
02434             "movq           %%mm0, %%mm1        \n\t"
02435             "movq           %%mm2, %%mm3        \n\t"
02436             "psrlw             $8, %%mm0        \n\t"
02437             "psrlw             $8, %%mm2        \n\t"
02438             "pand           %%mm7, %%mm1        \n\t"
02439             "pand           %%mm7, %%mm3        \n\t"
02440             "packuswb       %%mm2, %%mm0        \n\t"
02441             "packuswb       %%mm3, %%mm1        \n\t"
02442             MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
02443             MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
02444             "add               $8, %0           \n\t"
02445             " js 1b                            \n\t"
02446             : "+r"(count)
02447             : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
02448         );
02449         count -= 7;
02450     }
02451 #endif
02452     src0++;
02453     src1++;
02454     while(count<0) {
02455         dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
02456         dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
02457         count++;
02458     }
02459 }
02460 
02461 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02462                                  int width, int height,
02463                                  int lumStride, int chromStride, int srcStride)
02464 {
02465     int y;
02466     const int chromWidth= -((-width)>>1);
02467 
02468     for (y=0; y<height; y++) {
02469         RENAME(extract_even)(src, ydst, width);
02470         if(y&1) {
02471             RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
02472             udst+= chromStride;
02473             vdst+= chromStride;
02474         }
02475 
02476         src += srcStride;
02477         ydst+= lumStride;
02478     }
02479     __asm__(
02480             EMMS"       \n\t"
02481             SFENCE"     \n\t"
02482             ::: "memory"
02483         );
02484 }
02485 
02486 #if !COMPILE_TEMPLATE_AMD3DNOW
02487 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02488                                  int width, int height,
02489                                  int lumStride, int chromStride, int srcStride)
02490 {
02491     int y;
02492     const int chromWidth= -((-width)>>1);
02493 
02494     for (y=0; y<height; y++) {
02495         RENAME(extract_even)(src, ydst, width);
02496         RENAME(extract_odd2)(src, udst, vdst, chromWidth);
02497 
02498         src += srcStride;
02499         ydst+= lumStride;
02500         udst+= chromStride;
02501         vdst+= chromStride;
02502     }
02503     __asm__(
02504             EMMS"       \n\t"
02505             SFENCE"     \n\t"
02506             ::: "memory"
02507         );
02508 }
02509 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02510 
02511 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02512                                  int width, int height,
02513                                  int lumStride, int chromStride, int srcStride)
02514 {
02515     int y;
02516     const int chromWidth= -((-width)>>1);
02517 
02518     for (y=0; y<height; y++) {
02519         RENAME(extract_even)(src+1, ydst, width);
02520         if(y&1) {
02521             RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
02522             udst+= chromStride;
02523             vdst+= chromStride;
02524         }
02525 
02526         src += srcStride;
02527         ydst+= lumStride;
02528     }
02529     __asm__(
02530             EMMS"       \n\t"
02531             SFENCE"     \n\t"
02532             ::: "memory"
02533         );
02534 }
02535 
02536 #if !COMPILE_TEMPLATE_AMD3DNOW
02537 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02538                                  int width, int height,
02539                                  int lumStride, int chromStride, int srcStride)
02540 {
02541     int y;
02542     const int chromWidth= -((-width)>>1);
02543 
02544     for (y=0; y<height; y++) {
02545         RENAME(extract_even)(src+1, ydst, width);
02546         RENAME(extract_even2)(src, udst, vdst, chromWidth);
02547 
02548         src += srcStride;
02549         ydst+= lumStride;
02550         udst+= chromStride;
02551         vdst+= chromStride;
02552     }
02553     __asm__(
02554             EMMS"       \n\t"
02555             SFENCE"     \n\t"
02556             ::: "memory"
02557         );
02558 }
02559 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02560 #endif /* !COMPILE_TEMPLATE_SSE2 */
02561 
/**
 * Install this template instantiation's converters into the converter
 * entry points (rgb15to16, yv12toyuy2, ...), which are declared outside
 * this chunk.
 *
 * Which entries are assigned depends on the template flags:
 *  - SSE2 build: only interleaveBytes.
 *  - non-SSE2, non-3DNow! build: the full list below plus interleaveBytes.
 *  - 3DNow! build: only planar2x, rgb24toyv12, yuyvtoyuv420 and uyvytoyuv420.
 *  - planar2x is assigned only for MMX2 or 3DNow! builds.
 */
02562 static inline void RENAME(rgb2rgb_init)(void)
02563 {
02564 #if !COMPILE_TEMPLATE_SSE2
02565 #if !COMPILE_TEMPLATE_AMD3DNOW
02566     rgb15to16          = RENAME(rgb15to16);
02567     rgb15tobgr24       = RENAME(rgb15tobgr24);
02568     rgb15to32          = RENAME(rgb15to32);
02569     rgb16tobgr24       = RENAME(rgb16tobgr24);
02570     rgb16to32          = RENAME(rgb16to32);
02571     rgb16to15          = RENAME(rgb16to15);
02572     rgb24tobgr16       = RENAME(rgb24tobgr16);
02573     rgb24tobgr15       = RENAME(rgb24tobgr15);
02574     rgb24tobgr32       = RENAME(rgb24tobgr32);
02575     rgb32to16          = RENAME(rgb32to16);
02576     rgb32to15          = RENAME(rgb32to15);
02577     rgb32tobgr24       = RENAME(rgb32tobgr24);
02578     rgb24to15          = RENAME(rgb24to15);
02579     rgb24to16          = RENAME(rgb24to16);
02580     rgb24tobgr24       = RENAME(rgb24tobgr24);
02581     shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
02582     rgb32tobgr16       = RENAME(rgb32tobgr16);
02583     rgb32tobgr15       = RENAME(rgb32tobgr15);
02584     yv12toyuy2         = RENAME(yv12toyuy2);
02585     yv12touyvy         = RENAME(yv12touyvy);
02586     yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
02587     yuv422ptouyvy      = RENAME(yuv422ptouyvy);
02588     yuy2toyv12         = RENAME(yuy2toyv12);
02589     vu9_to_vu12        = RENAME(vu9_to_vu12);
02590     yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
02591     uyvytoyuv422       = RENAME(uyvytoyuv422);
02592     yuyvtoyuv422       = RENAME(yuyvtoyuv422);
02593 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02594 
02595 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
02596     planar2x           = RENAME(planar2x);
02597 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
02598     rgb24toyv12        = RENAME(rgb24toyv12);
02599 
02600     yuyvtoyuv420       = RENAME(yuyvtoyuv420);
02601     uyvytoyuv420       = RENAME(uyvytoyuv420);
02602 #endif /* !COMPILE_TEMPLATE_SSE2 */
02603 
02604 #if !COMPILE_TEMPLATE_AMD3DNOW
02605     interleaveBytes    = RENAME(interleaveBytes);
02606 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02607 }