Libav 0.7.1
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "dsputil_alpha.h"
#include "asm.h"

void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);
void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);

#if 0
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes. */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block  += 8;
    } while (--i);
}

void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-). */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw). */
        signs0  = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4. */
        pix1    = unpkbw(ldl(pixels + 4));
        signs1  = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block  += 8;
    } while (--h);
}
#endif
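/* Notes on the constants used above: zap(x, m) zeroes byte i of x for
 * each set bit i of m.  So zap(-1, 0xaa) clears bytes 1, 3, 5 and 7 of
 * an all-ones quadword, giving the per-word clamp mask
 * 0x00ff00ff00ff00ff, and zap(-1, 0x33) gives 0xffff0000ffff0000,
 * which "x ^= x >> 1" reduces to the per-word sign mask
 * 0x8000800080008000.  The "signed subword add" works because, with
 * the sign bits masked out, an ordinary 64-bit add cannot carry across
 * a 16-bit lane (at most 0x7fff + 0xff = 0x80fe); XORing the saved
 * sign bits back in then yields each lane's sum modulo 2^16, exactly
 * like MMX paddw. */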
static void clear_blocks_axp(DCTELEM *blocks) {
    uint64_t *p = (uint64_t *) blocks;
    int n = sizeof(DCTELEM) * 6 * 64;

    do {
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = 0;
        p[4] = 0;
        p[5] = 0;
        p[6] = 0;
        p[7] = 0;
        p += 8;
        n -= 8 * 8;
    } while (n);
}

static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
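/* avg2() and avg2_no_rnd() compute eight byte-wise averages in one
 * 64-bit operation, using the identities
 *     a + b = 2 * (a & b) + (a ^ b) = 2 * (a | b) - (a ^ b).
 * Masking the XOR term with BYTE_VEC(0xfe) before the shift drops each
 * byte's lowest bit so the shift cannot leak between lanes; avg2 thus
 * returns (a + b + 1) >> 1 per byte (rounding up), while avg2_no_rnd
 * returns (a + b) >> 1 (rounding down).  For example, avg2(3, 4) = 4
 * and avg2_no_rnd(3, 4) = 3 in each byte.  avg4 splits each byte into
 * its low two bits and the rest; the low-bit sums of four bytes plus
 * the rounder fit comfortably in an 8-bit lane (at most 3 * 4 + 2 =
 * 14), so they too can be accumulated with plain 64-bit adds. */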
#define OP(LOAD, STORE)                         \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block  += line_size;                    \
    } while (--h)

#define OP_X2(LOAD, STORE)                                      \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block  += line_size;                                    \
    } while (--h)

#define OP_Y2(LOAD, STORE)                      \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += line_size;                 \
            pix    = next_pix;                  \
        } while (--h);                          \
    } while (0)

#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t pix1  = LOAD(pixels);                                      \
        uint64_t pix2  = pix1 >> 8 | ((uint64_t) pixels[8] << 56);          \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1  = LOAD(pixels);                                          \
            npix2  = npix1 >> 8 | ((uint64_t) pixels[8] << 56);             \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg    = (((pix_l + npix_l + AVG4_ROUNDER) >> 2)                \
                      & BYTE_VEC(0x03))                                     \
                   + pix_h + npix_h;                                        \
            STORE(avg, block);                                              \
                                                                            \
            block += line_size;                                             \
            pix_l  = npix_l;                                                \
            pix_h  = npix_h;                                                \
        } while (--h);                                                      \
    } while (0)

#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    if ((size_t) pixels & 0x7) {                                            \
        OPKIND(uldq, STORE);                                                \
    } else {                                                                \
        OPKIND(ldq, STORE);                                                 \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}

#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME, ,     OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)

/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);

/* Non-rounding primitives. */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);
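/* Each PIXOP() expansion generates eight functions: 8- and
 * 16-pixel-wide versions of the straight copy and of the x2, y2 and
 * xy2 half-pel interpolations.  The source alignment is tested once
 * per call, and the entire loop then runs with either ldq (aligned
 * 64-bit load) or uldq (unaligned load); the destination is written
 * with stq and is therefore assumed to be 8-byte aligned.  The x2 and
 * xy2 kernels synthesize the one-pixel-right neighbour as
 * pix1 >> 8 | pixels[8] << 56 instead of issuing a second load, and
 * OP_XY2 carries the previous row's partial sums (pix_l, pix_h)
 * across iterations so every row is decomposed only once for the
 * four-way average. */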
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 int line_size, int h)
{
    put_pixels_axp_asm(block,     pixels,     line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}

void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 &&
                               avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
        c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;

        c->avg_pixels_tab[0][0] = avg_pixels16_axp;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;

        c->put_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_pixels_tab[1][1] = put_pixels_x2_axp;
        c->put_pixels_tab[1][2] = put_pixels_y2_axp;
        c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

        c->avg_pixels_tab[1][0] = avg_pixels_axp;
        c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
        c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
        c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;

        c->clear_blocks = clear_blocks_axp;
    }

    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;

        c->get_pixels    = get_pixels_mvi;
        c->diff_pixels   = diff_pixels_mvi;
        c->sad[0]        = pix_abs16x16_mvi_asm;
        c->sad[1]        = pix_abs8x8_mvi;
        c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
        c->pix_abs[1][0] = pix_abs8x8_mvi;
        c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
    }

    put_pixels_clamped_axp_p = c->put_pixels_clamped;
    add_pixels_clamped_axp_p = c->add_pixels_clamped;

    if (!avctx->lowres &&
        (avctx->idct_algo == FF_IDCT_AUTO ||
         avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) {
        c->idct_put = ff_simple_idct_put_axp;
        c->idct_add = ff_simple_idct_add_axp;
        c->idct     = ff_simple_idct_axp;
    }
}
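/* dsputil_init_alpha() is invoked from the generic dsputil_init() after
 * the portable C defaults have been installed, so every table entry not
 * overridden here keeps its C implementation.  The MVI block only runs
 * on CPUs that implement the motion-video extension: amask() returns
 * its argument with the bits of all present features cleared, so a
 * result of 0 means MVI is available. */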