Libav 0.7.1
/*
 * Windows Media Audio Voice decoder.
 * Copyright (c) 2009 Ronald S. Bultje
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <math.h>
#include "avcodec.h"
#include "get_bits.h"
#include "put_bits.h"
#include "wmavoice_data.h"
#include "celp_math.h"
#include "celp_filters.h"
#include "acelp_vectors.h"
#include "acelp_filters.h"
#include "lsp.h"
#include "libavutil/lzo.h"
#include "dct.h"
#include "rdft.h"
#include "sinewin.h"

#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
#define MAX_LSPS            16   ///< maximum filter order
#define MAX_LSPS_ALIGN16    16   ///< same as #MAX_LSPS; needs to be a multiple of 16

#define MAX_FRAMES           3   ///< maximum number of frames per superframe
#define MAX_FRAMESIZE      160   ///< maximum number of samples per frame
#define MAX_SIGNAL_HISTORY 416   ///< maximum excitation signal history
#define MAX_SFRAMESIZE     (MAX_FRAMESIZE * MAX_FRAMES)

#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that spills over into the next packet

#define VLC_NBITS            6   ///< number of bits to read per VLC iteration

static VLC frame_type_vlc;

enum {
    ACB_TYPE_NONE       = 0,
    ACB_TYPE_ASYMMETRIC = 1,
    ACB_TYPE_HAMMING    = 2
};

enum {
    FCB_TYPE_SILENCE    = 0,
    FCB_TYPE_HARDCODED  = 1,
    FCB_TYPE_AW_PULSES  = 2,
    FCB_TYPE_EXC_PULSES = 3,
};

static const struct frame_type_desc {
    uint8_t n_blocks;
    uint8_t log_n_blocks;
    uint8_t acb_type;
    uint8_t fcb_type;
    uint8_t dbl_pulses;
    uint16_t frame_size;
} frame_descs[17] = {
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0,   0 },
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0,  28 },
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0,  46 },
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2,  80 },
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0,  64 },
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2,  80 },
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 104 },
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 108 },
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 132 },
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 168 },
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 176 },
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 208 },
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 256 }
};

typedef struct {
    GetBitContext gb;
    int8_t vbm_tree[25];
    int spillover_bitsize;
    int history_nsamples;

    /* postfilter specific values */
    int do_apf;
    int denoise_strength;
    int denoise_tilt_corr;
    int dc_level;

    int lsps;
    int lsp_q_mode;
    int lsp_def_mode;
    int frame_lsp_bitsize;
    int sframe_lsp_bitsize;

    int min_pitch_val;
    int max_pitch_val;
    int pitch_nbits;
    int block_pitch_nbits;
    int block_pitch_range;
    int block_delta_pitch_nbits;
    int block_delta_pitch_hrange;
    uint16_t block_conv_table[4];

    int spillover_nbits;
    int has_residual_lsps;
    int skip_bits_next;
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
    int sframe_cache_size;
    PutBitContext pb;

    double prev_lsps[MAX_LSPS];
    int last_pitch_val;
    int last_acb_type;
    int pitch_diff_sh16;
    float silence_gain;
    int aw_idx_is_ext;
    int aw_pulse_range;
    int aw_n_pulses[2];
    int aw_first_pulse_off[2];
    int aw_next_pulse_off_cache;

    int frame_cntr;
    float gain_pred_err[6];
    float excitation_history[MAX_SIGNAL_HISTORY];
    float synth_history[MAX_LSPS];

    RDFTContext rdft, irdft;
    DCTContext dct, dst;
    float sin[511], cos[511];
    float postfilter_agc;
    float dcf_mem[2];
    float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
    float denoise_filter_cache[MAX_FRAMESIZE];
    int denoise_filter_cache_size;
    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
} WMAVoiceContext;

static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
{
    static const uint8_t bits[] = {
         2,  2,  2,  4,  4,  4,
         6,  6,  6,  8,  8,  8,
        10, 10, 10, 12, 12, 12,
        14, 14, 14, 14
    };
    static const uint16_t codes[] = {
        0x0000, 0x0001, 0x0002,        //              00/01/10
        0x000c, 0x000d, 0x000e,        //           11+00/01/10
        0x003c, 0x003d, 0x003e,        //         1111+00/01/10
        0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
        0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
        0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
        0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
    };
    int cntr[8], n, res;

    memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
    memset(cntr,     0,    sizeof(cntr));
    for (n = 0; n < 17; n++) {
        res = get_bits(gb, 3);
        if (cntr[res] > 3) // should be >= 3 + (res == 7))
            return -1;
        vbm_tree[res * 3 + cntr[res]++] = n;
    }
    INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
                    bits, 1, 1, codes, 2, 2, 132);
    return 0;
}

static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
{
    int n, flags, pitch_range, lsp16_flag;
    WMAVoiceContext *s = ctx->priv_data;
    if (ctx->extradata_size != 46) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid extradata size %d (should be 46)\n",
               ctx->extradata_size);
        return -1;
    }
    flags                = AV_RL32(ctx->extradata + 18);
    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
    s->do_apf            = flags & 0x1;
    if (s->do_apf) {
        ff_rdft_init(&s->rdft,  7, DFT_R2C);
        ff_rdft_init(&s->irdft, 7, IDFT_C2R);
        ff_dct_init(&s->dct, 6, DCT_I);
        ff_dct_init(&s->dst, 6, DST_I);

        ff_sine_window_init(s->cos, 256);
        memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
        for (n = 0; n < 255; n++) {
            s->sin[n]       = -s->sin[510 - n];
            s->cos[510 - n] =  s->cos[n];
        }
    }
    s->denoise_strength = (flags >> 2) & 0xF;
    if (s->denoise_strength >= 12) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid denoise filter strength %d (max=11)\n",
               s->denoise_strength);
        return -1;
    }
    s->denoise_tilt_corr = !!(flags & 0x40);
    s->dc_level          = (flags >> 7) & 0xF;
    s->lsp_q_mode        = !!(flags & 0x2000);
    s->lsp_def_mode      = !!(flags & 0x4000);
    lsp16_flag           = flags & 0x1000;
    if (lsp16_flag) {
        s->lsps               = 16;
        s->frame_lsp_bitsize  = 34;
        s->sframe_lsp_bitsize = 60;
    } else {
        s->lsps               = 10;
        s->frame_lsp_bitsize  = 24;
        s->sframe_lsp_bitsize = 48;
    }
    for (n = 0; n < s->lsps; n++)
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);

    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
        return -1;
    }

    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
    pitch_range         = s->max_pitch_val - s->min_pitch_val;
    s->pitch_nbits      = av_ceil_log2(pitch_range);
    s->last_pitch_val   = 40;
    s->last_acb_type    = ACB_TYPE_NONE;
    s->history_nsamples = s->max_pitch_val + 8;

    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;

        av_log(ctx, AV_LOG_ERROR,
               "Unsupported samplerate %d (min=%d, max=%d)\n",
               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz

        return -1;
    }

    s->block_conv_table[0]      = s->min_pitch_val;
    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
    s->block_conv_table[3]      = s->max_pitch_val - 1;
    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
    s->block_pitch_range        = s->block_conv_table[2] +
                                  s->block_conv_table[3] + 1 +
                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);

    ctx->sample_fmt = AV_SAMPLE_FMT_FLT;

    return 0;
}

static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    int i;
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
    float mem = *gain_mem;

    for (i = 0; i < size; i++) {
        speech_energy     += fabsf(speech_synth[i]);
        postfilter_energy += fabsf(in[i]);
    }
    gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;

    for (i = 0; i < size; i++) {
        mem    = alpha * mem + gain_scale_factor;
        out[i] = in[i] * mem;
    }

    *gain_mem = mem;
}

static int kalman_smoothen(WMAVoiceContext *s, int pitch,
                           const float *in, float *out, int size)
{
    int n;
    float optimal_gain = 0, dot;
    const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
                *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
                *best_hist_ptr;

    /* find best fitting point in history */
    do {
        dot = ff_dot_productf(in, ptr, size);
        if (dot > optimal_gain) {
            optimal_gain  = dot;
            best_hist_ptr = ptr;
        }
    } while (--ptr >= end);

    if (optimal_gain <= 0)
        return -1;
    dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
    if (dot <= 0) // would be 1.0
        return -1;

    if (optimal_gain <= dot) {
        dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
    } else
        dot = 0.625;

    /* actual smoothing */
    for (n = 0; n < size; n++)
        out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);

    return 0;
}

static float tilt_factor(const float *lpcs, int n_lpcs)
{
    float rh0, rh1;

    rh0 = 1.0     + ff_dot_productf(lpcs,  lpcs,    n_lpcs);
    rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);

    return rh1 / rh0;
}

static void calc_input_response(WMAVoiceContext *s, float *lpcs,
                                int fcb_type, float *coeffs, int remainder)
{
    float last_coeff, min = 15.0, max = -15.0;
    float irange, angle_mul, gain_mul, range, sq;
    int n, idx;

    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
    s->rdft.rdft_calc(&s->rdft, lpcs);
#define log_range(var, assign) do { \
        float tmp = log10f(assign);  var = tmp; \
        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
    } while (0)
    log_range(last_coeff,  lpcs[1]         * lpcs[1]);
    for (n = 1; n < 64; n++)
        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
#undef log_range
    range    = max - min;
    lpcs[64] = last_coeff;

    /* Now, use this spectrum to pick out these frequencies with higher
     * (relative) power/energy (which we then take to be "not noise"),
     * and set up a table (still in lpc[]) of (relative) gains per frequency.
     * These frequencies will be maintained, while others ("noise") will be
     * decreased in the filter output. */
    irange   = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
    gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
                                                         (5.0 / 14.7));
    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
    for (n = 0; n <= 64; n++) {
        float pwr;

        idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
        pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
        lpcs[n] = angle_mul * pwr;

        /* 70.57 =~ 1/log10(1.0331663) */
        idx = (pwr * gain_mul - 0.0295) * 70.570526123;
        if (idx > 127) { // fallback if index falls outside table range
            coeffs[n] = wmavoice_energy_table[127] *
                        powf(1.0331663, idx - 127);
        } else
            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
    }

    /* calculate the Hilbert transform of the gains, which we do (since this
     * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).
     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
     * "moment" of the LPCs in this filter. */
    s->dct.dct_calc(&s->dct, lpcs);
    s->dst.dct_calc(&s->dst, lpcs);

    /* Split out the coefficient indexes into phase/magnitude pairs */
    idx = 255 + av_clip(lpcs[64],                -255, 255);
    coeffs[0]  = coeffs[0]  * s->cos[idx];
    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
    last_coeff = coeffs[64] * s->cos[idx];
    for (n = 63;; n--) {
        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];

        if (!--n) break;

        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
    }
    coeffs[1] = last_coeff;

    /* move into real domain */
    s->irdft.rdft_calc(&s->irdft, coeffs);

    /* tilt correction and normalize scale */
    memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
    if (s->denoise_tilt_corr) {
        float tilt_mem = 0;

        coeffs[remainder - 1] = 0;
        ff_tilt_compensation(&tilt_mem,
                             -1.8 * tilt_factor(coeffs, remainder - 1),
                             coeffs, remainder);
    }
    sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
    for (n = 0; n < remainder; n++)
        coeffs[n] *= sq;
}

static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
                           float *synth_pf, int size,
                           const float *lpcs)
{
    int remainder, lim, n;

    if (fcb_type != FCB_TYPE_SILENCE) {
        float *tilted_lpcs = s->tilted_lpcs_pf,
              *coeffs      = s->denoise_coeffs_pf, tilt_mem = 0;

        tilted_lpcs[0] = 1.0;
        memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
        memset(&tilted_lpcs[s->lsps + 1], 0,
               sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
        ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
                             tilted_lpcs, s->lsps + 2);

        /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
         * size is applied to the next frame. All input beyond this is zero,
         * and thus all output beyond this will go towards zero, hence we can
         * limit to min(size-1, 127-size) as a performance consideration. */
        remainder = FFMIN(127 - size, size - 1);
        calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);

        /* apply coefficients (in frequency spectrum domain), i.e. complex
         * number multiplication */
        memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
        s->rdft.rdft_calc(&s->rdft, synth_pf);
        s->rdft.rdft_calc(&s->rdft, coeffs);
        synth_pf[0] *= coeffs[0];
        synth_pf[1] *= coeffs[1];
        for (n = 1; n < 64; n++) {
            float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
            synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
            synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
        }
        s->irdft.rdft_calc(&s->irdft, synth_pf);
    }

    /* merge filter output with the history of previous runs */
    if (s->denoise_filter_cache_size) {
        lim = FFMIN(s->denoise_filter_cache_size, size);
        for (n = 0; n < lim; n++)
            synth_pf[n] += s->denoise_filter_cache[n];
        s->denoise_filter_cache_size -= lim;
        memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
                sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
    }

    /* move remainder of filter output into a cache for future runs */
    if (fcb_type != FCB_TYPE_SILENCE) {
        lim = FFMIN(remainder, s->denoise_filter_cache_size);
        for (n = 0; n < lim; n++)
            s->denoise_filter_cache[n] += synth_pf[size + n];
        if (lim < remainder) {
            memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
                   sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
            s->denoise_filter_cache_size = remainder;
        }
    }
}

static void postfilter(WMAVoiceContext *s, const float *synth,
                       float *samples,    int size,
                       const float *lpcs, float *zero_exc_pf,
                       int fcb_type,      int pitch)
{
    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
         *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
         *synth_filter_in = zero_exc_pf;

    assert(size <= MAX_FRAMESIZE / 2);

    /* generate excitation from input signal */
    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);

    if (fcb_type >= FCB_TYPE_AW_PULSES &&
        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
        synth_filter_in = synth_filter_in_buf;

    /* re-synthesize speech after smoothening, and keep history */
    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
                                 synth_filter_in, size, s->lsps);
    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
           sizeof(synth_pf[0]) * s->lsps);

    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);

    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
                          &s->postfilter_agc);

    if (s->dc_level > 8) {
        /* remove ultra-low frequency DC noise / highpass filter;
         * coefficients are identical to those used in SIPR decoding,
         * and very closely resemble those used in AMR-NB decoding. */
        ff_acelp_apply_order_2_transfer_function(samples, samples,
            (const float[2]) { -1.99997,      1.0 },
            (const float[2]) { -1.9330735188, 0.93589198496 },
            0.93980580475, s->dcf_mem, size);
    }
}

static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    int n, m;

    memset(lsps, 0, num * sizeof(*lsps));
    for (n = 0; n < n_stages; n++) {
        const uint8_t *t_off = &table[values[n] * num];
        double base = base_q[n], mul = mul_q[n];

        for (m = 0; m < num; m++)
            lsps[m] += base + mul * t_off[m];

        table += sizes[n] * num;
    }
}

static void dequant_lsp10i(GetBitContext *gb, double *lsps)
{
    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
    static const double mul_lsf[4] = {
        5.2187144800e-3,    1.4626986422e-3,
        9.6179549166e-4,    1.1325736225e-3
    };
    static const double base_lsf[4] = {
        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
    };
    uint16_t v[4];

    v[0] = get_bits(gb, 8);
    v[1] = get_bits(gb, 6);
    v[2] = get_bits(gb, 5);
    v[3] = get_bits(gb, 5);

    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
                 mul_lsf, base_lsf);
}

static void dequant_lsp10r(GetBitContext *gb,
                           double *i_lsps, const double *old,
                           double *a1, double *a2, int q_mode)
{
    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
    static const double mul_lsf[3] = {
        2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
    };
    static const double base_lsf[3] = {
        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
    };
    const float (*ipol_tab)[2][10] = q_mode ?
        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
    uint16_t interpol, v[3];
    int n;

    dequant_lsp10i(gb, i_lsps);

    interpol = get_bits(gb, 5);
    v[0]     = get_bits(gb, 7);
    v[1]     = get_bits(gb, 6);
    v[2]     = get_bits(gb, 6);

    for (n = 0; n < 10; n++) {
        double delta = old[n] - i_lsps[n];
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
        a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
    }

    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
                 mul_lsf, base_lsf);
}

static void dequant_lsp16i(GetBitContext *gb, double *lsps)
{
    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
    static const double mul_lsf[5] = {
        3.3439586280e-3, 6.9908173703e-4,
        3.3216608306e-3, 1.0334960326e-3,
        3.1899104283e-3
    };
    static const double base_lsf[5] = {
        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
        M_PI * -1.29816e-1
    };
    uint16_t v[5];

    v[0] = get_bits(gb, 8);
    v[1] = get_bits(gb, 6);
    v[2] = get_bits(gb, 7);
    v[3] = get_bits(gb, 6);
    v[4] = get_bits(gb, 7);

    dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
                 wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
    dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
                 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
    dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
                 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
}

static void dequant_lsp16r(GetBitContext *gb,
                           double *i_lsps, const double *old,
                           double *a1, double *a2, int q_mode)
{
    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
    static const double mul_lsf[3] = {
        1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
    };
    static const double base_lsf[3] = {
        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
    };
    const float (*ipol_tab)[2][16] = q_mode ?
        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
    uint16_t interpol, v[3];
    int n;

    dequant_lsp16i(gb, i_lsps);

    interpol = get_bits(gb, 5);
    v[0]     = get_bits(gb, 7);
    v[1]     = get_bits(gb, 7);
    v[2]     = get_bits(gb, 7);

    for (n = 0; n < 16; n++) {
        double delta = old[n] - i_lsps[n];
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
        a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
    }

    dequant_lsps( a2,     10,  v,     vec_sizes,    1,
                 wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
    dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
                 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
    dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
                 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
}

static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
                            const int *pitch)
{
    static const int16_t start_offset[94] = {
        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
    };
    int bits, offset;

    /* position of pulse */
    s->aw_idx_is_ext = 0;
    if ((bits = get_bits(gb, 6)) >= 54) {
        s->aw_idx_is_ext = 1;
        bits += (bits - 54) * 3 + get_bits(gb, 2);
    }

    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
     * the distribution of the pulses in each block contained in this frame. */
    s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
    for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
    offset                  += s->aw_n_pulses[0] * pitch[0];
    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;

    /* if continuing from a position before the block, reset position to
     * start of block (when corrected for the range over which it can be
     * spread in aw_pulse_set1()). */
    if (start_offset[bits] < MAX_FRAMESIZE / 2) {
        while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
            s->aw_first_pulse_off[1] -= pitch[1];
        if (start_offset[bits] < 0)
            while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
                s->aw_first_pulse_off[0] -= pitch[0];
    }
}

static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
                          int block_idx, AMRFixed *fcb)
{
    uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
    uint16_t *use_mask = use_mask_mem + 2;
    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
     * of idx are the position of the bit within a particular item in the
     * array (0 being the most significant bit, and 15 being the least
     * significant bit), and the remainder (>> 4) is the index in the
     * use_mask[]-array. This is faster and uses less memory than using a
     * 80-byte/80-int array. */
    int pulse_off = s->aw_first_pulse_off[block_idx],
        pulse_start, n, idx, range, aidx, start_off = 0;

    /* set offset of first pulse to within this block */
    if (s->aw_n_pulses[block_idx] > 0)
        while (pulse_off + s->aw_pulse_range < 1)
            pulse_off += fcb->pitch_lag;

    /* find range per pulse */
    if (s->aw_n_pulses[0] > 0) {
        if (block_idx == 0) {
            range = 32;
        } else /* block_idx = 1 */ {
            range = 8;
            if (s->aw_n_pulses[block_idx] > 0)
                pulse_off = s->aw_next_pulse_off_cache;
        }
    } else
        range = 16;
    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;

    /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
     * in the range of [pulse_off, pulse_off + s->aw_pulse_range]), and thus
     * we exclude that range from being pulsed again in this function. */
    memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
    memset( use_mask,    -1, 5 * sizeof(use_mask[0]));
    memset(&use_mask[5],  0, 2 * sizeof(use_mask[0]));
    if (s->aw_n_pulses[block_idx] > 0)
        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
            int excl_range         = s->aw_pulse_range; // always 16 or 24
            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
            int first_sh           = 16 - (idx & 15);
            *use_mask_ptr++       &= 0xFFFF << first_sh;
            excl_range            -= first_sh;
            if (excl_range >= 16) {
                *use_mask_ptr++    = 0;
                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
            } else
                *use_mask_ptr     &= 0xFFFF >> excl_range;
        }

    /* find the 'aidx'th offset that is not excluded */
    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
    for (n = 0; n <= aidx; pulse_start++) {
        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
            if (use_mask[0])      idx = 0x0F;
            else if (use_mask[1]) idx = 0x1F;
            else if (use_mask[2]) idx = 0x2F;
            else if (use_mask[3]) idx = 0x3F;
            else if (use_mask[4]) idx = 0x4F;
            else                  return;
            idx -= av_log2_16bit(use_mask[idx >> 4]);
        }
        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
            n++;
            start_off = idx;
        }
    }

    fcb->x[fcb->n] = start_off;
    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
    fcb->n++;

    /* set offset for next block, relative to start of that block */
    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
}

static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
                          int block_idx, AMRFixed *fcb)
{
    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
    float v;

    if (s->aw_n_pulses[block_idx] > 0) {
        int n, v_mask, i_mask, sh, n_pulses;

        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
            n_pulses = 3;
            v_mask   = 8;
            i_mask   = 7;
            sh       = 4;
        } else { // 4 pulses, 1:sign + 2:index each
            n_pulses = 4;
            v_mask   = 4;
            i_mask   = 3;
            sh       = 3;
        }

        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
            fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
                             s->aw_first_pulse_off[block_idx];
            while (fcb->x[fcb->n] < 0)
                fcb->x[fcb->n] += fcb->pitch_lag;
            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
                fcb->n++;
        }
    } else {
        int num2 = (val & 0x1FF) >> 1, delta, idx;

        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
        v = (val & 0x200) ? -1.0 : 1.0;

        fcb->no_repeat_mask |= 3 << fcb->n;
        fcb->x[fcb->n]       = idx - delta;
        fcb->y[fcb->n]       = v;
        fcb->x[fcb->n + 1]   = idx;
        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
        fcb->n              += 2;
    }
}

static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* array to simplify the calculation of z:
     * y = (x % 9) * 5 + 6;
     * z = (49995 * x) / y;
     * Since y only has 9 values, we can remove the division by using a
     * LUT and using FASTDIV-style divisions. For each of the 9 values
     * of y, we can rewrite z as:
     * z = x * (49995 / y) + x * ((49995 % y) / y)
     * In this table, each col represents one possible value of y, the
     * first number is 49995 / y, and the second is the FASTDIV variant
     * of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
                                    // so this is effectively a modulo (%)
    y = x - 9 * MULH(477218589, x); // x % 9
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
                                    // z = x * 49995 / (y * 5 + 6)
    return z % (1000 - block_size);
}

static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
                                  int block_idx, int size,
                                  const struct frame_type_desc *frame_desc,
                                  float *excitation)
{
    float gain;
    int n, r_idx;

    assert(size <= MAX_FRAMESIZE);

    /* Set the offset from which we start reading wmavoice_std_codebook */
    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
        r_idx = pRNG(s->frame_cntr, block_idx, size);
        gain  = s->silence_gain;
    } else /* FCB_TYPE_HARDCODED */ {
        r_idx = get_bits(gb, 8);
        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
    }

    /* Clear gain prediction parameters */
    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));

    /* Apply gain to hardcoded codebook and use that as excitation signal */
    for (n = 0; n < size; n++)
        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
}

static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
                                int block_idx, int size,
                                int block_pitch_sh2,
                                const struct frame_type_desc *frame_desc,
                                float *excitation)
{
    static const float gain_coeff[6] = {
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
    };
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
    int n, idx, gain_weight;
    AMRFixed fcb;

    assert(size <= MAX_FRAMESIZE / 2);
    memset(pulses, 0, sizeof(*pulses) * size);

    fcb.pitch_lag      = block_pitch_sh2 >> 2;
    fcb.pitch_fac      = 1.0;
    fcb.no_repeat_mask = 0;
    fcb.n              = 0;

    /* For the other frame types, this is where we apply the innovation
     * (fixed) codebook pulses of the speech signal. */
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
        aw_pulse_set1(s, gb, block_idx, &fcb);
        aw_pulse_set2(s, gb, block_idx, &fcb);
    } else /* FCB_TYPE_EXC_PULSES */ {
        int offset_nbits = 5 - frame_desc->log_n_blocks;

        fcb.no_repeat_mask = -1;
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
         * (instead of double) for a subset of pulses */
        for (n = 0; n < 5; n++) {
            float sign;
            int pos1, pos2;

            sign           = get_bits1(gb) ? 1.0 : -1.0;
            pos1           = get_bits(gb, offset_nbits);
            fcb.x[fcb.n]   = n + 5 * pos1;
            fcb.y[fcb.n++] = sign;
            if (n < frame_desc->dbl_pulses) {
                pos2           = get_bits(gb, offset_nbits);
                fcb.x[fcb.n]   = n + 5 * pos2;
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
            }
        }
    }
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);

    /* Calculate gain for adaptive & fixed codebook signal.
     * see ff_amr_set_fixed_gain(). */
    idx = get_bits(gb, 7);
    fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
    acb_gain = wmavoice_gain_codebook_acb[idx];
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
                        -2.9957322736 /* log(0.05) */,
                         1.6094379124 /* log(5.0) */);

    gain_weight = 8 >> frame_desc->log_n_blocks;
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
    for (n = 0; n < gain_weight; n++)
        s->gain_pred_err[n] = pred_err;

    /* Calculation of adaptive codebook */
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
        int len;
        for (n = 0; n < size; n += len) {
            int next_idx_sh16;
            int abs_idx    = block_idx * size + n;
            int pitch_sh16 = (s->last_pitch_val << 16) +
                             s->pitch_diff_sh16 * abs_idx;
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
            idx            = idx_sh16 >> 16;
            if (s->pitch_diff_sh16) {
                if (s->pitch_diff_sh16 > 0) {
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
                } else
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
                              1, size - n);
            } else
                len = size;

            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
                                  wmavoice_ipol1_coeffs, 17,
                                  idx, 9, len);
        }
    } else /* ACB_TYPE_HAMMING */ {
        int block_pitch = block_pitch_sh2 >> 2;
        idx             = block_pitch_sh2 & 3;
        if (idx) {
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
                                  wmavoice_ipol2_coeffs, 4,
                                  idx, 8, size);
        } else
            av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
                              sizeof(float) * size);
    }

    /* Interpolate ACB/FCB and use as excitation signal */
    ff_weighted_vector_sumf(excitation, excitation, pulses,
                            acb_gain, fcb_gain, size);
}

static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
                        int block_idx, int size,
                        int block_pitch_sh2,
                        const double *lsps, const double *prev_lsps,
                        const struct frame_type_desc *frame_desc,
                        float *excitation, float *synth)
{
    double i_lsps[MAX_LSPS];
    float lpcs[MAX_LSPS];
    float fac;
    int n;

    if (frame_desc->acb_type == ACB_TYPE_NONE)
        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
    else
        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
                            frame_desc, excitation);

    /* convert interpolated LSPs to LPCs */
    fac = (block_idx + 0.5) / frame_desc->n_blocks;
    for (n = 0; n < s->lsps; n++) // LSF -> LSP
        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);

    /* Speech synthesis */
    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
}

static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
                       float *samples,
                       const double *lsps, const double *prev_lsps,
                       float *excitation, float *synth)
{
    WMAVoiceContext *s = ctx->priv_data;
    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
    int pitch[MAX_BLOCKS], last_block_pitch;

    /* Parse frame type ("frame header"), see frame_descs */
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
        block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;

    if (bd_idx < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid frame type VLC code, skipping\n");
        return -1;
    }

    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
        /* Pitch is provided per frame, which is interpreted as the pitch of
         * the last sample of the last block of this frame. We can interpolate
         * the pitch of other blocks (and even pitch-per-sample) by gradually
         * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
        n_blocks_x2     = frame_descs[bd_idx].n_blocks << 1;
        log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
        cur_pitch_val   = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
        cur_pitch_val   = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
        if (s->last_acb_type == ACB_TYPE_NONE ||
            20 * abs(cur_pitch_val - s->last_pitch_val) >
                (cur_pitch_val + s->last_pitch_val))
            s->last_pitch_val = cur_pitch_val;

        /* pitch per block */
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
            int fac = n * 2 + 1;

            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
        }

        /* "pitch-diff-per-sample" for calculation of pitch per sample */
        s->pitch_diff_sh16 =
            ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
    }

    /* Global gain (if silence) and pitch-adaptive window coordinates */
    switch (frame_descs[bd_idx].fcb_type) {
    case FCB_TYPE_SILENCE:
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
        break;
    case FCB_TYPE_AW_PULSES:
        aw_parse_coords(s, gb, pitch);
        break;
    }

    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
        int bl_pitch_sh2;

        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
        switch (frame_descs[bd_idx].acb_type) {
        case ACB_TYPE_HAMMING: {
            /* Pitch is given per block. Per-block pitches are encoded as an
             * absolute value for the first block, and then delta values
             * (relative to this value) for all subsequent blocks. The scale of
             * this pitch value is semi-logarithmic compared to its use in the
             * decoder, so we convert it to normal scale also. */
            int block_pitch,
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;

            if (n == 0) {
                block_pitch = get_bits(gb, s->block_pitch_nbits);
            } else
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
                              get_bits(gb, s->block_delta_pitch_nbits);
            /* Convert last_ so that any next delta is within _range */
            last_block_pitch = av_clip(block_pitch,
                                       s->block_delta_pitch_hrange,
                                       s->block_pitch_range -
                                           s->block_delta_pitch_hrange);

            /* Convert semi-log-style scale back to normal scale */
            if (block_pitch < t1) {
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
            } else {
                block_pitch -= t1;
                if (block_pitch < t2) {
                    bl_pitch_sh2 =
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
                } else {
                    block_pitch -= t2;
                    if (block_pitch < t3) {
                        bl_pitch_sh2 =
                            (s->block_conv_table[2] + block_pitch) << 2;
                    } else
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
                }
            }
            pitch[n] = bl_pitch_sh2 >> 2;
            break;
        }

        case ACB_TYPE_ASYMMETRIC: {
            bl_pitch_sh2 = pitch[n] << 2;
            break;
        }

        default: // ACB_TYPE_NONE has no pitch
            bl_pitch_sh2 = 0;
            break;
        }

        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
                    lsps, prev_lsps, &frame_descs[bd_idx],
                    &excitation[n * block_nsamples],
                    &synth[n * block_nsamples]);
    }

    /* Averaging projection filter, if applicable. Else, just copy samples
     * from synthesis buffer */
    if (s->do_apf) {
        double i_lsps[MAX_LSPS];
        float lpcs[MAX_LSPS];

        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, synth, samples, 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
                   frame_descs[bd_idx].fcb_type, pitch[0]);

        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(lsps[n]);
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, &synth[80], &samples[80], 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
                   frame_descs[bd_idx].fcb_type, pitch[0]);
    } else
        memcpy(samples, synth, 160 * sizeof(synth[0]));

    /* Cache values for next frame */
    s->frame_cntr++;
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
    s->last_acb_type = frame_descs[bd_idx].acb_type;
    switch (frame_descs[bd_idx].acb_type) {
    case ACB_TYPE_NONE:
        s->last_pitch_val = 0;
        break;
    case ACB_TYPE_ASYMMETRIC:
        s->last_pitch_val = cur_pitch_val;
        break;
    case ACB_TYPE_HAMMING:
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
        break;
    }

    return 0;
}

static void stabilize_lsps(double *lsps, int num)
{
    int n, m, l;

    /* set minimum value for first, maximum value for last and minimum
     * spacing between LSF values.
     * Very similar to ff_set_min_dist_lsf(), but in double. */
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
    for (n = 1; n < num; n++)
        lsps[n]   = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);

    /* reorder (looks like one-time / non-recursed bubblesort).
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
    for (n = 1; n < num; n++) {
        if (lsps[n] < lsps[n - 1]) {
            for (m = 1; m < num; m++) {
                double tmp = lsps[m];
                for (l = m - 1; l >= 0; l--) {
                    if (lsps[l] <= tmp) break;
                    lsps[l + 1] = lsps[l];
                }
                lsps[l + 1] = tmp;
            }
            break;
        }
    }
}

static int check_bits_for_superframe(GetBitContext *orig_gb,
                                     WMAVoiceContext *s)
{
    GetBitContext s_gb, *gb = &s_gb;
    int n, need_bits, bd_idx;
    const struct frame_type_desc *frame_desc;

    /* initialize a copy */
    init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
    skip_bits_long(gb, get_bits_count(orig_gb));
    assert(get_bits_left(gb) == get_bits_left(orig_gb));

    /* superframe header */
    if (get_bits_left(gb) < 14)
        return 1;
    if (!get_bits1(gb))
        return -1;                        // WMAPro-in-WMAVoice superframe
    if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
    if (s->has_residual_lsps) {           // residual LSPs (for all frames)
        if (get_bits_left(gb) < s->sframe_lsp_bitsize)
            return 1;
        skip_bits_long(gb, s->sframe_lsp_bitsize);
    }

    /* frames */
    for (n = 0; n < MAX_FRAMES; n++) {
        int aw_idx_is_ext = 0;

        if (!s->has_residual_lsps) {      // independent LSPs (per-frame)
            if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
            skip_bits_long(gb, s->frame_lsp_bitsize);
        }
        bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
        if (bd_idx < 0)
            return -1;                    // invalid frame type VLC code
        frame_desc = &frame_descs[bd_idx];
        if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
            if (get_bits_left(gb) < s->pitch_nbits)
                return 1;
            skip_bits_long(gb, s->pitch_nbits);
        }
        if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
            skip_bits(gb, 8);
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            int tmp = get_bits(gb, 6);
            if (tmp >= 0x36) {
                skip_bits(gb, 2);
                aw_idx_is_ext = 1;
            }
        }

        /* blocks */
        if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
            need_bits = s->block_pitch_nbits +
                (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            need_bits = 2 * !aw_idx_is_ext;
        } else
            need_bits = 0;
        need_bits += frame_desc->frame_size;
        if (get_bits_left(gb) < need_bits)
            return 1;
        skip_bits_long(gb, need_bits);
    }

    return 0;
}

static int synth_superframe(AVCodecContext *ctx,
                            float *samples, int *data_size)
{
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb, s_gb;
    int n, res, n_samples = 480;
    double lsps[MAX_FRAMES][MAX_LSPS];
    const double *mean_lsf = s->lsps == 16 ?
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
    float synth[MAX_LSPS + MAX_SFRAMESIZE];

    memcpy(synth,      s->synth_history,
           s->lsps             * sizeof(*synth));
    memcpy(excitation, s->excitation_history,
           s->history_nsamples * sizeof(*excitation));

    if (s->sframe_cache_size > 0) {
        gb = &s_gb;
        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
        s->sframe_cache_size = 0;
    }

    if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;

    /* First bit is speech/music bit; it differentiates between WMAVoice
     * speech samples (the actual codec) and WMAVoice music samples, which
     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
     * the wild yet. */
    if (!get_bits1(gb)) {
        av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
        return -1;
    }

    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
    if (get_bits1(gb)) {
        if ((n_samples = get_bits(gb, 12)) > 480) {
            av_log(ctx, AV_LOG_ERROR,
                   "Superframe encodes >480 samples (%d), not allowed\n",
                   n_samples);
            return -1;
        }
    }
    /* Parse LSPs, if global for the superframe (can also be per-frame). */
    if (s->has_residual_lsps) {
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];

        for (n = 0; n < s->lsps; n++)
            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];

        if (s->lsps == 10) {
            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
        } else /* s->lsps == 16 */
            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);

        for (n = 0; n < s->lsps; n++) {
            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
            lsps[2][n] += mean_lsf[n];
        }
        for (n = 0; n < 3; n++)
            stabilize_lsps(lsps[n], s->lsps);
    }

    /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
    for (n = 0; n < 3; n++) {
        if (!s->has_residual_lsps) {
            int m;

            if (s->lsps == 10) {
                dequant_lsp10i(gb, lsps[n]);
            } else /* s->lsps == 16 */
                dequant_lsp16i(gb, lsps[n]);

            for (m = 0; m < s->lsps; m++)
                lsps[n][m] += mean_lsf[m];
            stabilize_lsps(lsps[n], s->lsps);
        }

        if ((res = synth_frame(ctx, gb, n,
                               &samples[n * MAX_FRAMESIZE],
                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
                               &synth[s->lsps + n * MAX_FRAMESIZE])))
            return res;
    }

    /* Statistics? FIXME - we don't check for length, a slight overrun
     * will be caught by internal buffer padding, and anything else
     * will be skipped, not read. */
    if (get_bits1(gb)) {
        res = get_bits(gb, 4);
        skip_bits(gb, 10 * (res + 1));
    }

    /* Specify nr. of output samples */
    *data_size = n_samples * sizeof(float);

    /* Update history */
    memcpy(s->prev_lsps,           lsps[2],
           s->lsps             * sizeof(*s->prev_lsps));
    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
           s->lsps             * sizeof(*synth));
    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
           s->history_nsamples * sizeof(*excitation));
    if (s->do_apf)
        memmove(s->zero_exc_pf,   &s->zero_exc_pf[MAX_SFRAMESIZE],
                s->history_nsamples * sizeof(*s->zero_exc_pf));

    return 0;
}

static int parse_packet_header(WMAVoiceContext *s)
{
    GetBitContext *gb = &s->gb;
    unsigned int res;

    if (get_bits_left(gb) < 11)
        return 1;
    skip_bits(gb, 4);          // packet sequence number
    s->has_residual_lsps = get_bits1(gb);
    do {
        res = get_bits(gb, 6); // number of superframes per packet
                               // (minus first one if there is spillover)
        if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
            return 1;
    } while (res == 0x3F);
    s->spillover_nbits = get_bits(gb, s->spillover_bitsize);

    return 0;
}

static void copy_bits(PutBitContext *pb,
                      const uint8_t *data, int size,
                      GetBitContext *gb, int nbits)
{
    int rmn_bytes, rmn_bits;

    rmn_bits = rmn_bytes = get_bits_left(gb);
    if (rmn_bits < nbits)
        return;
    rmn_bits &= 7; rmn_bytes >>= 3;
    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
    ff_copy_bits(pb, data + size - rmn_bytes,
                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
}

static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                                  int *data_size, AVPacket *avpkt)
{
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb;
    int size, res, pos;

    if (*data_size < 480 * sizeof(float)) {
        av_log(ctx, AV_LOG_ERROR,
               "Output buffer too small (%d given - %zu needed)\n",
               *data_size, 480 * sizeof(float));
        return -1;
    }
    *data_size = 0;

    /* Packets are sometimes a multiple of ctx->block_align, with a packet
     * header at each ctx->block_align bytes. However, Libav's ASF demuxer
     * feeds us ASF packets, which may concatenate multiple "codec" packets
     * in a single "muxer" packet, so we artificially emulate that by
     * capping the packet size at ctx->block_align. */
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
    if (!size)
        return 0;
    init_get_bits(&s->gb, avpkt->data, size << 3);

    /* size == ctx->block_align is used to indicate whether we are dealing with
     * a new packet or a packet of which we already read the packet header
     * previously. */
    if (size == ctx->block_align) { // new packet header
        if ((res = parse_packet_header(s)) < 0)
            return res;

        /* If the packet header specifies a s->spillover_nbits, then we want
         * to push out all data of the previous packet (+ spillover) before
         * continuing to parse new superframes in the current packet. */
        if (s->spillover_nbits > 0) {
            if (s->sframe_cache_size > 0) {
                int cnt = get_bits_count(gb);
                copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
                flush_put_bits(&s->pb);
                s->sframe_cache_size += s->spillover_nbits;
                if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
                    *data_size > 0) {
                    cnt += s->spillover_nbits;
                    s->skip_bits_next = cnt & 7;
                    return cnt >> 3;
                } else
                    skip_bits_long (gb, s->spillover_nbits - cnt +
                                    get_bits_count(gb)); // resync
            } else
                skip_bits_long(gb, s->spillover_nbits); // resync
        }
    } else if (s->skip_bits_next)
        skip_bits(gb, s->skip_bits_next);

    /* Try parsing superframes in current packet */
    s->sframe_cache_size = 0;
    s->skip_bits_next = 0;
    pos = get_bits_left(gb);
    if ((res = synth_superframe(ctx, data, data_size)) < 0) {
        return res;
    } else if (*data_size > 0) {
        int cnt = get_bits_count(gb);
        s->skip_bits_next = cnt & 7;
        return cnt >> 3;
    } else if ((s->sframe_cache_size = pos) > 0) {
        /* rewind bit reader to start of last (incomplete) superframe... */
        init_get_bits(gb, avpkt->data, size << 3);
        skip_bits_long(gb, (size << 3) - pos);
        assert(get_bits_left(gb) == pos);

        /* ...and cache it for spillover in next packet */
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
        // FIXME bad - just copy bytes as whole and add use the
        // skip_bits_next field
    }

    return size;
}

static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;

    if (s->do_apf) {
        ff_rdft_end(&s->rdft);
        ff_rdft_end(&s->irdft);
        ff_dct_end(&s->dct);
        ff_dct_end(&s->dst);
    }

    return 0;
}

static av_cold void wmavoice_flush(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;
    int n;

    s->postfilter_agc    = 0;
    s->sframe_cache_size = 0;
    s->skip_bits_next    = 0;
    for (n = 0; n < s->lsps; n++)
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
    memset(s->excitation_history, 0,
           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
    memset(s->synth_history,      0,
           sizeof(*s->synth_history)      * MAX_LSPS);
    memset(s->gain_pred_err,      0,
           sizeof(s->gain_pred_err));

    if (s->do_apf) {
        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
               sizeof(*s->synth_filter_out_buf) * s->lsps);
        memset(s->dcf_mem,              0,
               sizeof(*s->dcf_mem)              * 2);
        memset(s->zero_exc_pf,          0,
               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
    }
}

AVCodec ff_wmavoice_decoder = {
    "wmavoice",
    AVMEDIA_TYPE_AUDIO,
    CODEC_ID_WMAVOICE,
    sizeof(WMAVoiceContext),
    wmavoice_decode_init,
    NULL,
    wmavoice_decode_end,
    wmavoice_decode_packet,
    CODEC_CAP_SUBFRAMES,
    .flush     = wmavoice_flush,
    .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
};