wmavoice.c
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
28 #define UNCHECKED_BITSTREAM_READER 1
29 
30 #include <math.h>
31 #include "avcodec.h"
32 #include "get_bits.h"
33 #include "put_bits.h"
34 #include "wmavoice_data.h"
35 #include "celp_math.h"
36 #include "celp_filters.h"
37 #include "acelp_vectors.h"
38 #include "acelp_filters.h"
39 #include "lsp.h"
40 #include "libavutil/lzo.h"
41 #include "dct.h"
42 #include "rdft.h"
43 #include "sinewin.h"
44 
45 #define MAX_BLOCKS 8
46 #define MAX_LSPS 16
47 #define MAX_LSPS_ALIGN16 16
48 #define MAX_FRAMES 3
50 #define MAX_FRAMESIZE 160
51 #define MAX_SIGNAL_HISTORY 416
52 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
53 #define SFRAME_CACHE_MAXSIZE 256
55 #define VLC_NBITS 6
57 
58 
61 static VLC frame_type_vlc;
62 
66 enum {
67  ACB_TYPE_NONE = 0,
68  ACB_TYPE_ASYMMETRIC = 1,
69  ACB_TYPE_HAMMING = 2
74 };
77 
81 enum {
82  FCB_TYPE_SILENCE = 0,
83  FCB_TYPE_HARDCODED = 1,
86  FCB_TYPE_AW_PULSES = 2,
87  FCB_TYPE_EXC_PULSES = 3
90 };
93 
97 static const struct frame_type_desc {
98  uint8_t n_blocks;
99  uint8_t log_n_blocks;
101  uint8_t acb_type;
102  uint8_t fcb_type;
103  uint8_t dbl_pulses;
104  uint16_t frame_size;
107 } frame_descs[17] = {
109  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
110  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
111  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
112  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
113  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
114  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
115  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
116  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
117  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
118  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
119  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
120  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
121  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
122  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
123  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
124  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
125  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
126 };
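/* Illustration only (not part of the original file): a frame descriptor's
 * n_blocks determines how many samples each block covers, mirroring the
 * "block_nsamples = MAX_FRAMESIZE / n_blocks" computation in synth_frame()
 * further below. E.g. descriptors 14..16 use 8 blocks of 160 / 8 = 20
 * samples each. */
static int example_block_nsamples(int bd_idx)
{
    return MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
}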
127 
131 typedef struct {
138  int8_t vbm_tree[25];
142 
144  int history_nsamples;
147 
149  /* postfilter specific values */
150  int do_apf;
151  int denoise_strength;
153  int denoise_tilt_corr;
155  int dc_level;
157 
159  int lsps;
162  int frame_lsp_bitsize;
164  int sframe_lsp_bitsize;
166 
171  int block_pitch_nbits;
173  int block_pitch_range;
176  int block_delta_pitch_hrange;
180  uint16_t block_conv_table[4];
182 
194  int has_residual_lsps;
198  int skip_bits_next;
203 
210  PutBitContext pb;
215 
225  double prev_lsps[MAX_LSPS];
226  int last_pitch_val;
230  float silence_gain;
232 
234  int aw_pulse_range;
236  int aw_n_pulses[2];
242  int aw_first_pulse_off[2];
245  int aw_next_pulse_off_cache;
247 
253  float gain_pred_err[6];
255  float excitation_history[MAX_SIGNAL_HISTORY];
259  float synth_history[MAX_LSPS];
260 
269  RDFTContext rdft, irdft;
270  DCTContext dct, dst;
272  float sin[511], cos[511];
274  float postfilter_agc;
276  float dcf_mem[2];
278  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
281  float denoise_filter_cache[MAX_FRAMESIZE];
283  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
285  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
287  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
290 
293 } WMAVoiceContext;
294 
304 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
305 {
306  static const uint8_t bits[] = {
307  2, 2, 2, 4, 4, 4,
308  6, 6, 6, 8, 8, 8,
309  10, 10, 10, 12, 12, 12,
310  14, 14, 14, 14
311  };
312  static const uint16_t codes[] = {
313  0x0000, 0x0001, 0x0002, // 00/01/10
314  0x000c, 0x000d, 0x000e, // 11+00/01/10
315  0x003c, 0x003d, 0x003e, // 1111+00/01/10
316  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
317  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
318  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
319  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
320  };
321  int cntr[8], n, res;
322 
323  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
324  memset(cntr, 0, sizeof(cntr));
325  for (n = 0; n < 17; n++) {
326  res = get_bits(gb, 3);
327  if (cntr[res] > 3) // should be >= 3 + (res == 7)
328  return -1;
329  vbm_tree[res * 3 + cntr[res]++] = n;
330  }
331  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
332  bits, 1, 1, codes, 2, 2, 132);
333  return 0;
334 }
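/* Standalone sketch (not part of the original decoder): the bits[]/codes[]
 * tables above describe a prefix code made of repeated "11" pairs followed
 * by a final 2-bit symbol. This illustrative helper decodes one such symbol
 * from a plain array of 0/1 values, which is conceptually what get_vlc2()
 * does with frame_type_vlc; the result is then mapped through vbm_tree[]. */
static int example_decode_frame_type_code(const unsigned char *bitstr, int *pos)
{
    int pairs = 0, last2;

    /* each leading "11" pair selects the next group of three symbols */
    while (pairs < 6 && bitstr[*pos] == 1 && bitstr[*pos + 1] == 1) {
        pairs++;
        *pos += 2;
    }
    last2 = bitstr[*pos] * 2 + bitstr[*pos + 1];
    *pos += 2;
    return pairs * 3 + last2; /* 0..17 for pairs < 6, 18..21 for pairs == 6 */
}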
335 
339 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
340 {
341  int n, flags, pitch_range, lsp16_flag;
342  WMAVoiceContext *s = ctx->priv_data;
343 
352  if (ctx->extradata_size != 46) {
353  av_log(ctx, AV_LOG_ERROR,
354  "Invalid extradata size %d (should be 46)\n",
355  ctx->extradata_size);
356  return -1;
357  }
358  flags = AV_RL32(ctx->extradata + 18);
359  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
360  s->do_apf = flags & 0x1;
361  if (s->do_apf) {
362  ff_rdft_init(&s->rdft, 7, DFT_R2C);
363  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
364  ff_dct_init(&s->dct, 6, DCT_I);
365  ff_dct_init(&s->dst, 6, DST_I);
366 
367  ff_sine_window_init(s->cos, 256);
368  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
369  for (n = 0; n < 255; n++) {
370  s->sin[n] = -s->sin[510 - n];
371  s->cos[510 - n] = s->cos[n];
372  }
373  }
374  s->denoise_strength = (flags >> 2) & 0xF;
375  if (s->denoise_strength >= 12) {
376  av_log(ctx, AV_LOG_ERROR,
377  "Invalid denoise filter strength %d (max=11)\n",
378  s->denoise_strength);
379  return -1;
380  }
381  s->denoise_tilt_corr = !!(flags & 0x40);
382  s->dc_level = (flags >> 7) & 0xF;
383  s->lsp_q_mode = !!(flags & 0x2000);
384  s->lsp_def_mode = !!(flags & 0x4000);
385  lsp16_flag = flags & 0x1000;
386  if (lsp16_flag) {
387  s->lsps = 16;
388  s->frame_lsp_bitsize = 34;
389  s->sframe_lsp_bitsize = 60;
390  } else {
391  s->lsps = 10;
392  s->frame_lsp_bitsize = 24;
393  s->sframe_lsp_bitsize = 48;
394  }
395  for (n = 0; n < s->lsps; n++)
396  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
397 
398  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
399  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
400  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
401  return -1;
402  }
403 
404  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
405  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
406  pitch_range = s->max_pitch_val - s->min_pitch_val;
407  if (pitch_range <= 0) {
408  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
409  return -1;
410  }
411  s->pitch_nbits = av_ceil_log2(pitch_range);
412  s->last_pitch_val = 40;
414  s->history_nsamples = s->max_pitch_val + 8;
415 
416  if (s->history_nsamples > MAX_SIGNAL_HISTORY) {
417  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
418  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
419 
420  av_log(ctx, AV_LOG_ERROR,
421  "Unsupported samplerate %d (min=%d, max=%d)\n",
422  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
423 
424  return -1;
425  }
426 
427  s->block_conv_table[0] = s->min_pitch_val;
428  s->block_conv_table[1] = (pitch_range * 25) >> 6;
429  s->block_conv_table[2] = (pitch_range * 44) >> 6;
430  s->block_conv_table[3] = s->max_pitch_val - 1;
431  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
432  if (s->block_delta_pitch_hrange <= 0) {
433  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
434  return -1;
435  }
436  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
437  s->block_pitch_range = s->block_conv_table[2] +
438  s->block_conv_table[3] + 1 +
439  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
440  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
441 
442  ctx->sample_fmt = AV_SAMPLE_FMT_FLT;
443 
444  avcodec_get_frame_defaults(&s->frame);
445  ctx->coded_frame = &s->frame;
446 
447  return 0;
448 }
449 
471 static void adaptive_gain_control(float *out, const float *in,
472  const float *speech_synth,
473  int size, float alpha, float *gain_mem)
474 {
475  int i;
476  float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
477  float mem = *gain_mem;
478 
479  for (i = 0; i < size; i++) {
480  speech_energy += fabsf(speech_synth[i]);
481  postfilter_energy += fabsf(in[i]);
482  }
483  gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;
484 
485  for (i = 0; i < size; i++) {
486  mem = alpha * mem + gain_scale_factor;
487  out[i] = in[i] * mem;
488  }
489 
490  *gain_mem = mem;
491 }
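/* Standalone numerical check (illustration only): the AGC above computes
 * gain_scale_factor = (1 - alpha) * speech_energy / postfilter_energy and
 * smooths it with mem = alpha * mem + gain_scale_factor. In steady state
 * that recursion converges to speech_energy / postfilter_energy, i.e. the
 * postfiltered signal is scaled back to the energy of the plain synthesis. */
static float example_agc_steady_state(float target, float alpha, int iterations)
{
    float mem = 0.0f;
    int i;

    for (i = 0; i < iterations; i++)
        mem = alpha * mem + (1.0f - alpha) * target;
    return mem; /* approaches "target" as iterations grows */
}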
492 
511 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
512  const float *in, float *out, int size)
513 {
514  int n;
515  float optimal_gain = 0, dot;
516  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
517  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
518  *best_hist_ptr;
519 
520  /* find best fitting point in history */
521  do {
522  dot = ff_dot_productf(in, ptr, size);
523  if (dot > optimal_gain) {
524  optimal_gain = dot;
525  best_hist_ptr = ptr;
526  }
527  } while (--ptr >= end);
528 
529  if (optimal_gain <= 0)
530  return -1;
531  dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
532  if (dot <= 0) // would be 1.0
533  return -1;
534 
535  if (optimal_gain <= dot) {
536  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
537  } else
538  dot = 0.625;
539 
540  /* actual smoothing */
541  for (n = 0; n < size; n++)
542  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
543 
544  return 0;
545 }
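/* Standalone sketch (illustration only) of the crossfade weight used above:
 * w = E_hist / (E_hist + 0.6 * corr), clamped to 0.625 when the correlation
 * exceeds the history-segment energy. w == 1.0 means "keep the input
 * unchanged" (the original signals this by returning -1 so the caller skips
 * smoothing); smaller w pulls the output towards the best-matching history
 * segment, i.e. stronger pitch smoothing. */
static float example_kalman_weight(float hist_energy, float correlation)
{
    if (correlation <= 0.0f || hist_energy <= 0.0f)
        return 1.0f; /* no usable pitch match: no smoothing */
    if (correlation > hist_energy)
        return 0.625f;
    return hist_energy / (hist_energy + 0.6f * correlation);
}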
546 
557 static float tilt_factor(const float *lpcs, int n_lpcs)
558 {
559  float rh0, rh1;
560 
561  rh0 = 1.0 + ff_dot_productf(lpcs, lpcs, n_lpcs);
562  rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);
563 
564  return rh1 / rh0;
565 }
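/* Standalone sketch (illustration only): tilt_factor() above is the
 * lag-1 / lag-0 autocorrelation ratio of the coefficient sequence
 * { 1, lpcs[0], ..., lpcs[n_lpcs-1] }. Written out over an explicit array
 * (seq[0] is expected to be 1.0, len == n_lpcs + 1): */
static float example_tilt(const float *seq, int len)
{
    float r0 = 0.0f, r1 = 0.0f;
    int i;

    for (i = 0; i < len; i++)
        r0 += seq[i] * seq[i];          /* lag-0 autocorrelation */
    for (i = 0; i < len - 1; i++)
        r1 += seq[i] * seq[i + 1];      /* lag-1 autocorrelation */
    return r1 / r0;
}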
566 
570 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
571  int fcb_type, float *coeffs, int remainder)
572 {
573  float last_coeff, min = 15.0, max = -15.0;
574  float irange, angle_mul, gain_mul, range, sq;
575  int n, idx;
576 
577  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
578  s->rdft.rdft_calc(&s->rdft, lpcs);
579 #define log_range(var, assign) do { \
580  float tmp = log10f(assign); var = tmp; \
581  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
582  } while (0)
583  log_range(last_coeff, lpcs[1] * lpcs[1]);
584  for (n = 1; n < 64; n++)
585  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
586  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
587  log_range(lpcs[0], lpcs[0] * lpcs[0]);
588 #undef log_range
589  range = max - min;
590  lpcs[64] = last_coeff;
591 
592  /* Now, use this spectrum to pick out these frequencies with higher
593  * (relative) power/energy (which we then take to be "not noise"),
594  * and set up a table (still in lpc[]) of (relative) gains per frequency.
595  * These frequencies will be maintained, while others ("noise") will be
596  * decreased in the filter output. */
597  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
598  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
599  (5.0 / 14.7));
600  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
601  for (n = 0; n <= 64; n++) {
602  float pwr;
603 
604  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
605  pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
606  lpcs[n] = angle_mul * pwr;
607 
608  /* 70.57 =~ 1/log10(1.0331663) */
609  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
610  if (idx > 127) { // fallback if index falls outside table range
611  coeffs[n] = wmavoice_energy_table[127] *
612  powf(1.0331663, idx - 127);
613  } else
614  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
615  }
616 
617  /* calculate the Hilbert transform of the gains, which we do (since this
618  * is a sine input) by applying a phase shift (in theory, H(sin())=cos()).
619  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
620  * "moment" of the LPCs in this filter. */
621  s->dct.dct_calc(&s->dct, lpcs);
622  s->dst.dct_calc(&s->dst, lpcs);
623 
624  /* Split out the coefficient indexes into phase/magnitude pairs */
625  idx = 255 + av_clip(lpcs[64], -255, 255);
626  coeffs[0] = coeffs[0] * s->cos[idx];
627  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
628  last_coeff = coeffs[64] * s->cos[idx];
629  for (n = 63;; n--) {
630  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
631  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
632  coeffs[n * 2] = coeffs[n] * s->cos[idx];
633 
634  if (!--n) break;
635 
636  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
637  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
638  coeffs[n * 2] = coeffs[n] * s->cos[idx];
639  }
640  coeffs[1] = last_coeff;
641 
642  /* move into real domain */
643  s->irdft.rdft_calc(&s->irdft, coeffs);
644 
645  /* tilt correction and normalize scale */
646  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
647  if (s->denoise_tilt_corr) {
648  float tilt_mem = 0;
649 
650  coeffs[remainder - 1] = 0;
651  ff_tilt_compensation(&tilt_mem,
652  -1.8 * tilt_factor(coeffs, remainder - 1),
653  coeffs, remainder);
654  }
655  sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
656  for (n = 0; n < remainder; n++)
657  coeffs[n] *= sq;
658 }
659 
686 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
687  float *synth_pf, int size,
688  const float *lpcs)
689 {
690  int remainder, lim, n;
691 
692  if (fcb_type != FCB_TYPE_SILENCE) {
693  float *tilted_lpcs = s->tilted_lpcs_pf,
694  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
695 
696  tilted_lpcs[0] = 1.0;
697  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
698  memset(&tilted_lpcs[s->lsps + 1], 0,
699  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
700  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
701  tilted_lpcs, s->lsps + 2);
702 
703  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
704  * size is applied to the next frame. All input beyond this is zero,
705  * and thus all output beyond this will go towards zero, hence we can
706  * limit to min(size-1, 127-size) as a performance consideration. */
707  remainder = FFMIN(127 - size, size - 1);
708  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
709 
710  /* apply coefficients (in frequency spectrum domain), i.e. complex
711  * number multiplication */
712  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
713  s->rdft.rdft_calc(&s->rdft, synth_pf);
714  s->rdft.rdft_calc(&s->rdft, coeffs);
715  synth_pf[0] *= coeffs[0];
716  synth_pf[1] *= coeffs[1];
717  for (n = 1; n < 64; n++) {
718  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
719  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
720  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
721  }
722  s->irdft.rdft_calc(&s->irdft, synth_pf);
723  }
724 
725  /* merge filter output with the history of previous runs */
726  if (s->denoise_filter_cache_size) {
727  lim = FFMIN(s->denoise_filter_cache_size, size);
728  for (n = 0; n < lim; n++)
729  synth_pf[n] += s->denoise_filter_cache[n];
730  s->denoise_filter_cache_size -= lim;
731  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
732  sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
733  }
734 
735  /* move remainder of filter output into a cache for future runs */
736  if (fcb_type != FCB_TYPE_SILENCE) {
737  lim = FFMIN(remainder, s->denoise_filter_cache_size);
738  for (n = 0; n < lim; n++)
739  s->denoise_filter_cache[n] += synth_pf[size + n];
740  if (lim < remainder) {
741  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
742  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
743  s->denoise_filter_cache_size = remainder;
744  }
745  }
746 }
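/* Standalone sketch (illustration only) of the packed-spectrum multiply used
 * above: for FFmpeg-style RDFT output of length n (n/2 complex bins),
 * element [0] holds the DC bin and element [1] the Nyquist bin (both purely
 * real), while [2k]/[2k+1] hold the real/imaginary parts of bin k. Applying
 * the Wiener gains is then an element-wise complex multiplication. */
static void example_packed_spectrum_mul(float *spec, const float *gain, int n)
{
    int k;

    spec[0] *= gain[0];              /* DC bin, real only */
    spec[1] *= gain[1];              /* Nyquist bin, real only */
    for (k = 1; k < n / 2; k++) {
        float re = spec[2 * k], im = spec[2 * k + 1];
        spec[2 * k]     = re * gain[2 * k] - im * gain[2 * k + 1];
        spec[2 * k + 1] = im * gain[2 * k] + re * gain[2 * k + 1];
    }
}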
747 
768 static void postfilter(WMAVoiceContext *s, const float *synth,
769  float *samples, int size,
770  const float *lpcs, float *zero_exc_pf,
771  int fcb_type, int pitch)
772 {
773  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
774  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
775  *synth_filter_in = zero_exc_pf;
776 
777  assert(size <= MAX_FRAMESIZE / 2);
778 
779  /* generate excitation from input signal */
780  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
781 
782  if (fcb_type >= FCB_TYPE_AW_PULSES &&
783  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
784  synth_filter_in = synth_filter_in_buf;
785 
786  /* re-synthesize speech after smoothing, and keep history */
787  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
788  synth_filter_in, size, s->lsps);
789  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
790  sizeof(synth_pf[0]) * s->lsps);
791 
792  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
793 
794  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
795  &s->postfilter_agc);
796 
797  if (s->dc_level > 8) {
798  /* remove ultra-low frequency DC noise / highpass filter;
799  * coefficients are identical to those used in SIPR decoding,
800  * and very closely resemble those used in AMR-NB decoding. */
801  ff_acelp_apply_order_2_transfer_function(samples, samples,
802  (const float[2]) { -1.99997, 1.0 },
803  (const float[2]) { -1.9330735188, 0.93589198496 },
804  0.93980580475, s->dcf_mem, size);
805  }
806 }
822 static void dequant_lsps(double *lsps, int num,
823  const uint16_t *values,
824  const uint16_t *sizes,
825  int n_stages, const uint8_t *table,
826  const double *mul_q,
827  const double *base_q)
828 {
829  int n, m;
830 
831  memset(lsps, 0, num * sizeof(*lsps));
832  for (n = 0; n < n_stages; n++) {
833  const uint8_t *t_off = &table[values[n] * num];
834  double base = base_q[n], mul = mul_q[n];
835 
836  for (m = 0; m < num; m++)
837  lsps[m] += base + mul * t_off[m];
838 
839  table += sizes[n] * num;
840  }
841 }
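/* Worked example (hypothetical numbers, not from the real tables): two
 * dequantization stages over num = 2 LSPs. Stage 0 selects row values[0]
 * from the first sizes[0] * num table entries, stage 1 selects row
 * values[1] from the next sizes[1] * num entries, and each stage adds
 * base_q[n] + mul_q[n] * entry to every coefficient. */
static void example_dequant_two_stages(double *lsps /* [2] */)
{
    static const uint8_t  table[]  = { 10, 20,  30, 40,   /* stage 0, 2 rows */
                                        1,  2,   3,  4 }; /* stage 1, 2 rows */
    static const uint16_t values[] = { 1, 0 };            /* chosen rows */
    static const uint16_t sizes[]  = { 2, 2 };
    static const double   mul_q[]  = { 0.01, 0.001 };
    static const double   base_q[] = { 0.1,  0.0 };

    dequant_lsps(lsps, 2, values, sizes, 2, table, mul_q, base_q);
    /* lsps[0] = (0.1 + 0.01 * 30) + (0.0 + 0.001 * 1) = 0.401
     * lsps[1] = (0.1 + 0.01 * 40) + (0.0 + 0.001 * 2) = 0.502 */
}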
842 
854 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
855 {
856  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
857  static const double mul_lsf[4] = {
858  5.2187144800e-3, 1.4626986422e-3,
859  9.6179549166e-4, 1.1325736225e-3
860  };
861  static const double base_lsf[4] = {
862  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
863  M_PI * -3.3486e-2, M_PI * -5.7408e-2
864  };
865  uint16_t v[4];
866 
867  v[0] = get_bits(gb, 8);
868  v[1] = get_bits(gb, 6);
869  v[2] = get_bits(gb, 5);
870  v[3] = get_bits(gb, 5);
871 
872  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
873  mul_lsf, base_lsf);
874 }
875 
880 static void dequant_lsp10r(GetBitContext *gb,
881  double *i_lsps, const double *old,
882  double *a1, double *a2, int q_mode)
883 {
884  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
885  static const double mul_lsf[3] = {
886  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
887  };
888  static const double base_lsf[3] = {
889  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
890  };
891  const float (*ipol_tab)[2][10] = q_mode ?
892  wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
893  uint16_t interpol, v[3];
894  int n;
895 
896  dequant_lsp10i(gb, i_lsps);
897 
898  interpol = get_bits(gb, 5);
899  v[0] = get_bits(gb, 7);
900  v[1] = get_bits(gb, 6);
901  v[2] = get_bits(gb, 6);
902 
903  for (n = 0; n < 10; n++) {
904  double delta = old[n] - i_lsps[n];
905  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
906  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
907  }
908 
909  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
910  mul_lsf, base_lsf);
911 }
912 
916 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
917 {
918  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
919  static const double mul_lsf[5] = {
920  3.3439586280e-3, 6.9908173703e-4,
921  3.3216608306e-3, 1.0334960326e-3,
922  3.1899104283e-3
923  };
924  static const double base_lsf[5] = {
925  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
926  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
927  M_PI * -1.29816e-1
928  };
929  uint16_t v[5];
930 
931  v[0] = get_bits(gb, 8);
932  v[1] = get_bits(gb, 6);
933  v[2] = get_bits(gb, 7);
934  v[3] = get_bits(gb, 6);
935  v[4] = get_bits(gb, 7);
936 
937  dequant_lsps( lsps, 5, v, vec_sizes, 2,
938  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
939  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
940  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
941  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
942  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
943 }
944 
949 static void dequant_lsp16r(GetBitContext *gb,
950  double *i_lsps, const double *old,
951  double *a1, double *a2, int q_mode)
952 {
953  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
954  static const double mul_lsf[3] = {
955  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
956  };
957  static const double base_lsf[3] = {
958  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
959  };
960  const float (*ipol_tab)[2][16] = q_mode ?
961  wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
962  uint16_t interpol, v[3];
963  int n;
964 
965  dequant_lsp16i(gb, i_lsps);
966 
967  interpol = get_bits(gb, 5);
968  v[0] = get_bits(gb, 7);
969  v[1] = get_bits(gb, 7);
970  v[2] = get_bits(gb, 7);
971 
972  for (n = 0; n < 16; n++) {
973  double delta = old[n] - i_lsps[n];
974  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
975  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
976  }
977 
978  dequant_lsps( a2, 10, v, vec_sizes, 1,
979  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
980  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
981  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
982  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
983  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
984 }
985 
999 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1000  const int *pitch)
1001 {
1002  static const int16_t start_offset[94] = {
1003  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1004  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1005  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1006  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1007  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1008  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1009  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1010  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1011  };
1012  int bits, offset;
1013 
1014  /* position of pulse */
1015  s->aw_idx_is_ext = 0;
1016  if ((bits = get_bits(gb, 6)) >= 54) {
1017  s->aw_idx_is_ext = 1;
1018  bits += (bits - 54) * 3 + get_bits(gb, 2);
1019  }
1020 
1021  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1022  * the distribution of the pulses in each block contained in this frame. */
1023  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1024  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1025  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1026  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1027  offset += s->aw_n_pulses[0] * pitch[0];
1028  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1029  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1030 
1031  /* if continuing from a position before the block, reset position to
1032  * start of block (when corrected for the range over which it can be
1033  * spread in aw_pulse_set1()). */
1034  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1035  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1036  s->aw_first_pulse_off[1] -= pitch[1];
1037  if (start_offset[bits] < 0)
1038  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1039  s->aw_first_pulse_off[0] -= pitch[0];
1040  }
1041 }
1042 
1051 static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1052  int block_idx, AMRFixed *fcb)
1053 {
1054  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1055  uint16_t *use_mask = use_mask_mem + 2;
1056  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1057  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1058  * of idx are the position of the bit within a particular item in the
1059  * array (0 being the most significant bit, and 15 being the least
1060  * significant bit), and the remainder (>> 4) is the index in the
1061  * use_mask[]-array. This is faster and uses less memory than using an
1062  * 80-byte/80-int array. */
1063  int pulse_off = s->aw_first_pulse_off[block_idx],
1064  pulse_start, n, idx, range, aidx, start_off = 0;
1065 
1066  /* set offset of first pulse to within this block */
1067  if (s->aw_n_pulses[block_idx] > 0)
1068  while (pulse_off + s->aw_pulse_range < 1)
1069  pulse_off += fcb->pitch_lag;
1070 
1071  /* find range per pulse */
1072  if (s->aw_n_pulses[0] > 0) {
1073  if (block_idx == 0) {
1074  range = 32;
1075  } else /* block_idx = 1 */ {
1076  range = 8;
1077  if (s->aw_n_pulses[block_idx] > 0)
1078  pulse_off = s->aw_next_pulse_off_cache;
1079  }
1080  } else
1081  range = 16;
1082  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1083 
1084  /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1085  * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1086  * we exclude that range from being pulsed again in this function. */
1087  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1088  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1089  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1090  if (s->aw_n_pulses[block_idx] > 0)
1091  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1092  int excl_range = s->aw_pulse_range; // always 16 or 24
1093  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1094  int first_sh = 16 - (idx & 15);
1095  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1096  excl_range -= first_sh;
1097  if (excl_range >= 16) {
1098  *use_mask_ptr++ = 0;
1099  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1100  } else
1101  *use_mask_ptr &= 0xFFFF >> excl_range;
1102  }
1103 
1104  /* find the 'aidx'th offset that is not excluded */
1105  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1106  for (n = 0; n <= aidx; pulse_start++) {
1107  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1108  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1109  if (use_mask[0]) idx = 0x0F;
1110  else if (use_mask[1]) idx = 0x1F;
1111  else if (use_mask[2]) idx = 0x2F;
1112  else if (use_mask[3]) idx = 0x3F;
1113  else if (use_mask[4]) idx = 0x4F;
1114  else return -1;
1115  idx -= av_log2_16bit(use_mask[idx >> 4]);
1116  }
1117  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1118  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1119  n++;
1120  start_off = idx;
1121  }
1122  }
1123 
1124  fcb->x[fcb->n] = start_off;
1125  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1126  fcb->n++;
1127 
1128  /* set offset for next block, relative to start of that block */
1129  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1130  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1131  return 0;
1132 }
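/* Standalone sketch (illustration only) of the use_mask[] addressing used
 * above: position idx (0..79) lives in 16-bit word idx >> 4, at bit
 * 0x8000 >> (idx & 15), i.e. bit position 0 of a word is its MSB. */
static int example_mask_test(const uint16_t *mask, int idx)
{
    return (mask[idx >> 4] >> (15 - (idx & 15))) & 1;
}

static void example_mask_clear(uint16_t *mask, int idx)
{
    mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
}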
1133 
1141 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1142  int block_idx, AMRFixed *fcb)
1143 {
1144  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1145  float v;
1146 
1147  if (s->aw_n_pulses[block_idx] > 0) {
1148  int n, v_mask, i_mask, sh, n_pulses;
1149 
1150  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1151  n_pulses = 3;
1152  v_mask = 8;
1153  i_mask = 7;
1154  sh = 4;
1155  } else { // 4 pulses, 1:sign + 2:index each
1156  n_pulses = 4;
1157  v_mask = 4;
1158  i_mask = 3;
1159  sh = 3;
1160  }
1161 
1162  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1163  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1164  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1165  s->aw_first_pulse_off[block_idx];
1166  while (fcb->x[fcb->n] < 0)
1167  fcb->x[fcb->n] += fcb->pitch_lag;
1168  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1169  fcb->n++;
1170  }
1171  } else {
1172  int num2 = (val & 0x1FF) >> 1, delta, idx;
1173 
1174  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1175  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1176  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1177  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1178  v = (val & 0x200) ? -1.0 : 1.0;
1179 
1180  fcb->no_repeat_mask |= 3 << fcb->n;
1181  fcb->x[fcb->n] = idx - delta;
1182  fcb->y[fcb->n] = v;
1183  fcb->x[fcb->n + 1] = idx;
1184  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1185  fcb->n += 2;
1186  }
1187 }
1188 
1202 static int pRNG(int frame_cntr, int block_num, int block_size)
1203 {
1204  /* array to simplify the calculation of z:
1205  * y = (x % 9) * 5 + 6;
1206  * z = (49995 * x) / y;
1207  * Since y only has 9 values, we can remove the division by using a
1208  * LUT and using FASTDIV-style divisions. For each of the 9 values
1209  * of y, we can rewrite z as:
1210  * z = x * (49995 / y) + x * ((49995 % y) / y)
1211  * In this table, each col represents one possible value of y, the
1212  * first number is 49995 / y, and the second is the FASTDIV variant
1213  * of 49995 % y / y. */
1214  static const unsigned int div_tbl[9][2] = {
1215  { 8332, 3 * 715827883U }, // y = 6
1216  { 4545, 0 * 390451573U }, // y = 11
1217  { 3124, 11 * 268435456U }, // y = 16
1218  { 2380, 15 * 204522253U }, // y = 21
1219  { 1922, 23 * 165191050U }, // y = 26
1220  { 1612, 23 * 138547333U }, // y = 31
1221  { 1388, 27 * 119304648U }, // y = 36
1222  { 1219, 16 * 104755300U }, // y = 41
1223  { 1086, 39 * 93368855U } // y = 46
1224  };
1225  unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1226  if (x >= 0xFFFF) x -= 0xFFFF; // max value of x is 8*1877+0xFFFE=0x13AA6,
1227  // so this is effectively a modulo (%)
1228  y = x - 9 * MULH(477218589, x); // x % 9
1229  z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1230  // z = x * 49995 / (y * 5 + 6)
1231  return z % (1000 - block_size);
1232 }
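/* Straightforward reference for the LUT/FASTDIV version above (illustration
 * only; the table-driven code is intended to compute the same value while
 * avoiding the division). The low 16 bits of the quotient are kept, matching
 * the uint16_t cast in pRNG(). */
static int example_prng_reference(int frame_cntr, int block_num, int block_size)
{
    unsigned x = block_num * 1877 + frame_cntr;
    unsigned y, z;

    if (x >= 0xFFFF)
        x -= 0xFFFF;                  /* wrap into [0, 0xFFFE] */
    y = (x % 9) * 5 + 6;              /* divisor: one of 6, 11, ..., 46 */
    z = (49995u * x / y) & 0xFFFF;    /* fits in 32-bit unsigned arithmetic */
    return z % (1000 - block_size);
}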
1233 
1238 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1239  int block_idx, int size,
1240  const struct frame_type_desc *frame_desc,
1241  float *excitation)
1242 {
1243  float gain;
1244  int n, r_idx;
1245 
1246  assert(size <= MAX_FRAMESIZE);
1247 
1248  /* Set the offset from which we start reading wmavoice_std_codebook */
1249  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1250  r_idx = pRNG(s->frame_cntr, block_idx, size);
1251  gain = s->silence_gain;
1252  } else /* FCB_TYPE_HARDCODED */ {
1253  r_idx = get_bits(gb, 8);
1254  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1255  }
1256 
1257  /* Clear gain prediction parameters */
1258  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1259 
1260  /* Apply gain to hardcoded codebook and use that as excitation signal */
1261  for (n = 0; n < size; n++)
1262  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1263 }
1264 
1269 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1270  int block_idx, int size,
1271  int block_pitch_sh2,
1272  const struct frame_type_desc *frame_desc,
1273  float *excitation)
1274 {
1275  static const float gain_coeff[6] = {
1276  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1277  };
1278  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1279  int n, idx, gain_weight;
1280  AMRFixed fcb;
1281 
1282  assert(size <= MAX_FRAMESIZE / 2);
1283  memset(pulses, 0, sizeof(*pulses) * size);
1284 
1285  fcb.pitch_lag = block_pitch_sh2 >> 2;
1286  fcb.pitch_fac = 1.0;
1287  fcb.no_repeat_mask = 0;
1288  fcb.n = 0;
1289 
1290  /* For the other frame types, this is where we apply the innovation
1291  * (fixed) codebook pulses of the speech signal. */
1292  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1293  aw_pulse_set1(s, gb, block_idx, &fcb);
1294  if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1295  /* Conceal the block with silence and return.
1296  * Skip the correct amount of bits to read the next
1297  * block from the correct offset. */
1298  int r_idx = pRNG(s->frame_cntr, block_idx, size);
1299 
1300  for (n = 0; n < size; n++)
1301  excitation[n] =
1302  wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1303  skip_bits(gb, 7 + 1);
1304  return;
1305  }
1306  } else /* FCB_TYPE_EXC_PULSES */ {
1307  int offset_nbits = 5 - frame_desc->log_n_blocks;
1308 
1309  fcb.no_repeat_mask = -1;
1310  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1311  * (instead of double) for a subset of pulses */
1312  for (n = 0; n < 5; n++) {
1313  float sign;
1314  int pos1, pos2;
1315 
1316  sign = get_bits1(gb) ? 1.0 : -1.0;
1317  pos1 = get_bits(gb, offset_nbits);
1318  fcb.x[fcb.n] = n + 5 * pos1;
1319  fcb.y[fcb.n++] = sign;
1320  if (n < frame_desc->dbl_pulses) {
1321  pos2 = get_bits(gb, offset_nbits);
1322  fcb.x[fcb.n] = n + 5 * pos2;
1323  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1324  }
1325  }
1326  }
1327  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1328 
1329  /* Calculate gain for adaptive & fixed codebook signal.
1330  * see ff_amr_set_fixed_gain(). */
1331  idx = get_bits(gb, 7);
1332  fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
1333  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1334  acb_gain = wmavoice_gain_codebook_acb[idx];
1335  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1336  -2.9957322736 /* log(0.05) */,
1337  1.6094379124 /* log(5.0) */);
1338 
1339  gain_weight = 8 >> frame_desc->log_n_blocks;
1340  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1341  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1342  for (n = 0; n < gain_weight; n++)
1343  s->gain_pred_err[n] = pred_err;
1344 
1345  /* Calculation of adaptive codebook */
1346  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1347  int len;
1348  for (n = 0; n < size; n += len) {
1349  int next_idx_sh16;
1350  int abs_idx = block_idx * size + n;
1351  int pitch_sh16 = (s->last_pitch_val << 16) +
1352  s->pitch_diff_sh16 * abs_idx;
1353  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1354  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1355  idx = idx_sh16 >> 16;
1356  if (s->pitch_diff_sh16) {
1357  if (s->pitch_diff_sh16 > 0) {
1358  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1359  } else
1360  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1361  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1362  1, size - n);
1363  } else
1364  len = size;
1365 
1366  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1367  wmavoice_ipol1_coeffs, 17,
1368  idx, 9, len);
1369  }
1370  } else /* ACB_TYPE_HAMMING */ {
1371  int block_pitch = block_pitch_sh2 >> 2;
1372  idx = block_pitch_sh2 & 3;
1373  if (idx) {
1374  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1375  wmavoice_ipol2_coeffs, 4,
1376  idx, 8, size);
1377  } else
1378  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1379  sizeof(float) * size);
1380  }
1381 
1382  /* Interpolate ACB/FCB and use as excitation signal */
1383  ff_weighted_vector_sumf(excitation, excitation, pulses,
1384  acb_gain, fcb_gain, size);
1385 }
1386 
1403 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1404  int block_idx, int size,
1405  int block_pitch_sh2,
1406  const double *lsps, const double *prev_lsps,
1407  const struct frame_type_desc *frame_desc,
1408  float *excitation, float *synth)
1409 {
1410  double i_lsps[MAX_LSPS];
1411  float lpcs[MAX_LSPS];
1412  float fac;
1413  int n;
1414 
1415  if (frame_desc->acb_type == ACB_TYPE_NONE)
1416  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1417  else
1418  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1419  frame_desc, excitation);
1420 
1421  /* convert interpolated LSPs to LPCs */
1422  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1423  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1424  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1425  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1426 
1427  /* Speech synthesis */
1428  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1429 }
1430 
1446 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1447  float *samples,
1448  const double *lsps, const double *prev_lsps,
1449  float *excitation, float *synth)
1450 {
1451  WMAVoiceContext *s = ctx->priv_data;
1452  int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
1453  int pitch[MAX_BLOCKS], last_block_pitch;
1454 
1455  /* Parse frame type ("frame header"), see frame_descs */
1456  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1457 
1458  if (bd_idx < 0) {
1459  av_log(ctx, AV_LOG_ERROR,
1460  "Invalid frame type VLC code, skipping\n");
1461  return -1;
1462  }
1463 
1464  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1465 
1466  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1467  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1468  /* Pitch is provided per frame, which is interpreted as the pitch of
1469  * the last sample of the last block of this frame. We can interpolate
1470  * the pitch of other blocks (and even pitch-per-sample) by gradually
1471  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1472  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1473  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1474  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1475  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1476  if (s->last_acb_type == ACB_TYPE_NONE ||
1477  20 * abs(cur_pitch_val - s->last_pitch_val) >
1478  (cur_pitch_val + s->last_pitch_val))
1479  s->last_pitch_val = cur_pitch_val;
1480 
1481  /* pitch per block */
1482  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1483  int fac = n * 2 + 1;
1484 
1485  pitch[n] = (MUL16(fac, cur_pitch_val) +
1486  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1487  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1488  }
1489 
1490  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1491  s->pitch_diff_sh16 =
1492  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1493  }
1494 
1495  /* Global gain (if silence) and pitch-adaptive window coordinates */
1496  switch (frame_descs[bd_idx].fcb_type) {
1497  case FCB_TYPE_SILENCE:
1498  s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1499  break;
1500  case FCB_TYPE_AW_PULSES:
1501  aw_parse_coords(s, gb, pitch);
1502  break;
1503  }
1504 
1505  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1506  int bl_pitch_sh2;
1507 
1508  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1509  switch (frame_descs[bd_idx].acb_type) {
1510  case ACB_TYPE_HAMMING: {
1511  /* Pitch is given per block. Per-block pitches are encoded as an
1512  * absolute value for the first block, and then delta values
1513  * (relative to this value) for all subsequent blocks. The scale of
1514  * this pitch value is semi-logarithmic compared to its use in the
1515  * decoder, so we also convert it back to the normal scale here. */
1516  int block_pitch,
1517  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1518  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1519  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1520 
1521  if (n == 0) {
1522  block_pitch = get_bits(gb, s->block_pitch_nbits);
1523  } else
1524  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1525  get_bits(gb, s->block_delta_pitch_nbits);
1526  /* Convert last_ so that any next delta is within _range */
1527  last_block_pitch = av_clip(block_pitch,
1528  0,
1529  s->block_pitch_range -
1530  s->block_delta_pitch_hrange);
1531 
1532  /* Convert semi-log-style scale back to normal scale */
1533  if (block_pitch < t1) {
1534  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1535  } else {
1536  block_pitch -= t1;
1537  if (block_pitch < t2) {
1538  bl_pitch_sh2 =
1539  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1540  } else {
1541  block_pitch -= t2;
1542  if (block_pitch < t3) {
1543  bl_pitch_sh2 =
1544  (s->block_conv_table[2] + block_pitch) << 2;
1545  } else
1546  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1547  }
1548  }
1549  pitch[n] = bl_pitch_sh2 >> 2;
1550  break;
1551  }
1552 
1553  case ACB_TYPE_ASYMMETRIC: {
1554  bl_pitch_sh2 = pitch[n] << 2;
1555  break;
1556  }
1557 
1558  default: // ACB_TYPE_NONE has no pitch
1559  bl_pitch_sh2 = 0;
1560  break;
1561  }
1562 
1563  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1564  lsps, prev_lsps, &frame_descs[bd_idx],
1565  &excitation[n * block_nsamples],
1566  &synth[n * block_nsamples]);
1567  }
1568 
1569  /* Averaging projection filter, if applicable. Else, just copy samples
1570  * from synthesis buffer */
1571  if (s->do_apf) {
1572  double i_lsps[MAX_LSPS];
1573  float lpcs[MAX_LSPS];
1574 
1575  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1576  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1577  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1578  postfilter(s, synth, samples, 80, lpcs,
1579  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1580  frame_descs[bd_idx].fcb_type, pitch[0]);
1581 
1582  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1583  i_lsps[n] = cos(lsps[n]);
1584  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1585  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1586  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1587  frame_descs[bd_idx].fcb_type, pitch[0]);
1588  } else
1589  memcpy(samples, synth, 160 * sizeof(synth[0]));
1590 
1591  /* Cache values for next frame */
1592  s->frame_cntr++;
1593  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1594  s->last_acb_type = frame_descs[bd_idx].acb_type;
1595  switch (frame_descs[bd_idx].acb_type) {
1596  case ACB_TYPE_NONE:
1597  s->last_pitch_val = 0;
1598  break;
1599  case ACB_TYPE_ASYMMETRIC:
1600  s->last_pitch_val = cur_pitch_val;
1601  break;
1602  case ACB_TYPE_HAMMING:
1603  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1604  break;
1605  }
1606 
1607  return 0;
1608 }
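/* Worked example (illustration only) of the per-block pitch interpolation
 * used in synth_frame() for ACB_TYPE_ASYMMETRIC frames:
 *   pitch[n] = (fac * cur + (2N - fac) * last + N) >> log2(2N),  fac = 2n+1,
 * i.e. linear interpolation from last_pitch_val to cur_pitch_val sampled at
 * block midpoints, with rounding. For N = 4, last = 40, cur = 48 this yields
 * 41, 43, 45, 47. */
static void example_block_pitch(int last, int cur, int n_blocks,
                                int log_n_blocks, int *pitch_out)
{
    int n, n_blocks_x2 = n_blocks << 1, log_n_blocks_x2 = log_n_blocks + 1;

    for (n = 0; n < n_blocks; n++) {
        int fac = n * 2 + 1;
        pitch_out[n] = (fac * cur + (n_blocks_x2 - fac) * last + n_blocks)
                       >> log_n_blocks_x2;
    }
}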
1609 
1622 static void stabilize_lsps(double *lsps, int num)
1623 {
1624  int n, m, l;
1625 
1626  /* set minimum value for first, maximum value for last and minimum
1627  * spacing between LSF values.
1628  * Very similar to ff_set_min_dist_lsf(), but in double. */
1629  lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1630  for (n = 1; n < num; n++)
1631  lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1632  lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1633 
1634  /* reorder (looks like one-time / non-recursed bubblesort).
1635  * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1636  for (n = 1; n < num; n++) {
1637  if (lsps[n] < lsps[n - 1]) {
1638  for (m = 1; m < num; m++) {
1639  double tmp = lsps[m];
1640  for (l = m - 1; l >= 0; l--) {
1641  if (lsps[l] <= tmp) break;
1642  lsps[l + 1] = lsps[l];
1643  }
1644  lsps[l + 1] = tmp;
1645  }
1646  break;
1647  }
1648  }
1649 }
1650 
1660 static int check_bits_for_superframe(GetBitContext *orig_gb,
1661  WMAVoiceContext *s)
1662 {
1663  GetBitContext s_gb, *gb = &s_gb;
1664  int n, need_bits, bd_idx;
1665  const struct frame_type_desc *frame_desc;
1666 
1667  /* initialize a copy */
1668  init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1669  skip_bits_long(gb, get_bits_count(orig_gb));
1670  assert(get_bits_left(gb) == get_bits_left(orig_gb));
1671 
1672  /* superframe header */
1673  if (get_bits_left(gb) < 14)
1674  return 1;
1675  if (!get_bits1(gb))
1676  return -1; // WMAPro-in-WMAVoice superframe
1677  if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
1678  if (s->has_residual_lsps) { // residual LSPs (for all frames)
1679  if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1680  return 1;
1681  skip_bits_long(gb, s->sframe_lsp_bitsize);
1682  }
1683 
1684  /* frames */
1685  for (n = 0; n < MAX_FRAMES; n++) {
1686  int aw_idx_is_ext = 0;
1687 
1688  if (!s->has_residual_lsps) { // independent LSPs (per-frame)
1689  if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1690  skip_bits_long(gb, s->frame_lsp_bitsize);
1691  }
1692  bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1693  if (bd_idx < 0)
1694  return -1; // invalid frame type VLC code
1695  frame_desc = &frame_descs[bd_idx];
1696  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1697  if (get_bits_left(gb) < s->pitch_nbits)
1698  return 1;
1699  skip_bits_long(gb, s->pitch_nbits);
1700  }
1701  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1702  skip_bits(gb, 8);
1703  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1704  int tmp = get_bits(gb, 6);
1705  if (tmp >= 0x36) {
1706  skip_bits(gb, 2);
1707  aw_idx_is_ext = 1;
1708  }
1709  }
1710 
1711  /* blocks */
1712  if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1713  need_bits = s->block_pitch_nbits +
1714  (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1715  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1716  need_bits = 2 * !aw_idx_is_ext;
1717  } else
1718  need_bits = 0;
1719  need_bits += frame_desc->frame_size;
1720  if (get_bits_left(gb) < need_bits)
1721  return 1;
1722  skip_bits_long(gb, need_bits);
1723  }
1724 
1725  return 0;
1726 }
1727 
1748 static int synth_superframe(AVCodecContext *ctx, int *got_frame_ptr)
1749 {
1750  WMAVoiceContext *s = ctx->priv_data;
1751  GetBitContext *gb = &s->gb, s_gb;
1752  int n, res, n_samples = 480;
1753  double lsps[MAX_FRAMES][MAX_LSPS];
1754  const double *mean_lsf = s->lsps == 16 ?
1755  wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1756  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1757  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1758  float *samples;
1759 
1760  memcpy(synth, s->synth_history,
1761  s->lsps * sizeof(*synth));
1762  memcpy(excitation, s->excitation_history,
1763  s->history_nsamples * sizeof(*excitation));
1764 
1765  if (s->sframe_cache_size > 0) {
1766  gb = &s_gb;
1767  init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1768  s->sframe_cache_size = 0;
1769  }
1770 
1771  if ((res = check_bits_for_superframe(gb, s)) == 1) {
1772  *got_frame_ptr = 0;
1773  return 1;
1774  }
1775 
1776  /* First bit is speech/music bit, it differentiates between WMAVoice
1777  * speech samples (the actual codec) and WMAVoice music samples, which
1778  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1779  * the wild yet. */
1780  if (!get_bits1(gb)) {
1781  av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
1782  return -1;
1783  }
1784 
1785  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1786  if (get_bits1(gb)) {
1787  if ((n_samples = get_bits(gb, 12)) > 480) {
1788  av_log(ctx, AV_LOG_ERROR,
1789  "Superframe encodes >480 samples (%d), not allowed\n",
1790  n_samples);
1791  return -1;
1792  }
1793  }
1794  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1795  if (s->has_residual_lsps) {
1796  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1797 
1798  for (n = 0; n < s->lsps; n++)
1799  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1800 
1801  if (s->lsps == 10) {
1802  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1803  } else /* s->lsps == 16 */
1804  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1805 
1806  for (n = 0; n < s->lsps; n++) {
1807  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1808  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1809  lsps[2][n] += mean_lsf[n];
1810  }
1811  for (n = 0; n < 3; n++)
1812  stabilize_lsps(lsps[n], s->lsps);
1813  }
1814 
1815  /* get output buffer */
1816  s->frame.nb_samples = 480;
1817  if ((res = ctx->get_buffer(ctx, &s->frame)) < 0) {
1818  av_log(ctx, AV_LOG_ERROR, "get_buffer() failed\n");
1819  return res;
1820  }
1821  s->frame.nb_samples = n_samples;
1822  samples = (float *)s->frame.data[0];
1823 
1824  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1825  for (n = 0; n < 3; n++) {
1826  if (!s->has_residual_lsps) {
1827  int m;
1828 
1829  if (s->lsps == 10) {
1830  dequant_lsp10i(gb, lsps[n]);
1831  } else /* s->lsps == 16 */
1832  dequant_lsp16i(gb, lsps[n]);
1833 
1834  for (m = 0; m < s->lsps; m++)
1835  lsps[n][m] += mean_lsf[m];
1836  stabilize_lsps(lsps[n], s->lsps);
1837  }
1838 
1839  if ((res = synth_frame(ctx, gb, n,
1840  &samples[n * MAX_FRAMESIZE],
1841  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1842  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1843  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1844  *got_frame_ptr = 0;
1845  return res;
1846  }
1847  }
1848 
1849  /* Statistics? FIXME - we don't check for length, a slight overrun
1850  * will be caught by internal buffer padding, and anything else
1851  * will be skipped, not read. */
1852  if (get_bits1(gb)) {
1853  res = get_bits(gb, 4);
1854  skip_bits(gb, 10 * (res + 1));
1855  }
1856 
1857  *got_frame_ptr = 1;
1858 
1859  /* Update history */
1860  memcpy(s->prev_lsps, lsps[2],
1861  s->lsps * sizeof(*s->prev_lsps));
1862  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1863  s->lsps * sizeof(*synth));
1864  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1865  s->history_nsamples * sizeof(*excitation));
1866  if (s->do_apf)
1867  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1868  s->history_nsamples * sizeof(*s->zero_exc_pf));
1869 
1870  return 0;
1871 }
1872 
1880 static int parse_packet_header(WMAVoiceContext *s)
1881 {
1882  GetBitContext *gb = &s->gb;
1883  unsigned int res;
1884 
1885  if (get_bits_left(gb) < 11)
1886  return 1;
1887  skip_bits(gb, 4); // packet sequence number
1888  s->has_residual_lsps = get_bits1(gb);
1889  do {
1890  res = get_bits(gb, 6); // number of superframes per packet
1891  // (minus first one if there is spillover)
1892  if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1893  return 1;
1894  } while (res == 0x3F);
1895  s->spillover_nbits = get_bits(gb, s->spillover_bitsize);
1896 
1897  return 0;
1898 }
1899 
1915 static void copy_bits(PutBitContext *pb,
1916  const uint8_t *data, int size,
1917  GetBitContext *gb, int nbits)
1918 {
1919  int rmn_bytes, rmn_bits;
1920 
1921  rmn_bits = rmn_bytes = get_bits_left(gb);
1922  if (rmn_bits < nbits)
1923  return;
1924  if (nbits > pb->size_in_bits - put_bits_count(pb))
1925  return;
1926  rmn_bits &= 7; rmn_bytes >>= 3;
1927  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1928  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1929  avpriv_copy_bits(pb, data + size - rmn_bytes,
1930  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1931 }
1932 
1944 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1945  int *got_frame_ptr, AVPacket *avpkt)
1946 {
1947  WMAVoiceContext *s = ctx->priv_data;
1948  GetBitContext *gb = &s->gb;
1949  int size, res, pos;
1950 
1951  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1952  * header at each ctx->block_align bytes. However, Libav's ASF demuxer
1953  * feeds us ASF packets, which may concatenate multiple "codec" packets
1954  * in a single "muxer" packet, so we artificially emulate that by
1955  * capping the packet size at ctx->block_align. */
1956  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1957  if (!size) {
1958  *got_frame_ptr = 0;
1959  return 0;
1960  }
1961  init_get_bits(&s->gb, avpkt->data, size << 3);
1962 
1963  /* size == ctx->block_align is used to indicate whether we are dealing with
1964  * a new packet or a packet of which we already read the packet header
1965  * previously. */
1966  if (size == ctx->block_align) { // new packet header
1967  if ((res = parse_packet_header(s)) < 0)
1968  return res;
1969 
1970  /* If the packet header specifies a s->spillover_nbits, then we want
1971  * to push out all data of the previous packet (+ spillover) before
1972  * continuing to parse new superframes in the current packet. */
1973  if (s->spillover_nbits > 0) {
1974  if (s->sframe_cache_size > 0) {
1975  int cnt = get_bits_count(gb);
1976  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1977  flush_put_bits(&s->pb);
1978  s->sframe_cache_size += s->spillover_nbits;
1979  if ((res = synth_superframe(ctx, got_frame_ptr)) == 0 &&
1980  *got_frame_ptr) {
1981  cnt += s->spillover_nbits;
1982  s->skip_bits_next = cnt & 7;
1983  *(AVFrame *)data = s->frame;
1984  return cnt >> 3;
1985  } else
1986  skip_bits_long (gb, s->spillover_nbits - cnt +
1987  get_bits_count(gb)); // resync
1988  } else
1989  skip_bits_long(gb, s->spillover_nbits); // resync
1990  }
1991  } else if (s->skip_bits_next)
1992  skip_bits(gb, s->skip_bits_next);
1993 
1994  /* Try parsing superframes in current packet */
1995  s->sframe_cache_size = 0;
1996  s->skip_bits_next = 0;
1997  pos = get_bits_left(gb);
1998  if ((res = synth_superframe(ctx, got_frame_ptr)) < 0) {
1999  return res;
2000  } else if (*got_frame_ptr) {
2001  int cnt = get_bits_count(gb);
2002  s->skip_bits_next = cnt & 7;
2003  *(AVFrame *)data = s->frame;
2004  return cnt >> 3;
2005  } else if ((s->sframe_cache_size = pos) > 0) {
2006  /* rewind bit reader to start of last (incomplete) superframe... */
2007  init_get_bits(gb, avpkt->data, size << 3);
2008  skip_bits_long(gb, (size << 3) - pos);
2009  assert(get_bits_left(gb) == pos);
2010 
2011  /* ...and cache it for spillover in next packet */
2012  init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
2013  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
2014  // FIXME bad - just copy the bytes as a whole and use the
2015  // skip_bits_next field
2016  }
2017 
2018  return size;
2019 }
2020 
2021 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
2022 {
2023  WMAVoiceContext *s = ctx->priv_data;
2024 
2025  if (s->do_apf) {
2026  ff_rdft_end(&s->rdft);
2027  ff_rdft_end(&s->irdft);
2028  ff_dct_end(&s->dct);
2029  ff_dct_end(&s->dst);
2030  }
2031 
2032  return 0;
2033 }
2034 
2035 static av_cold void wmavoice_flush(AVCodecContext *ctx)
2036 {
2037  WMAVoiceContext *s = ctx->priv_data;
2038  int n;
2039 
2040  s->postfilter_agc = 0;
2041  s->sframe_cache_size = 0;
2042  s->skip_bits_next = 0;
2043  for (n = 0; n < s->lsps; n++)
2044  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
2045  memset(s->excitation_history, 0,
2046  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2047  memset(s->synth_history, 0,
2048  sizeof(*s->synth_history) * MAX_LSPS);
2049  memset(s->gain_pred_err, 0,
2050  sizeof(s->gain_pred_err));
2051 
2052  if (s->do_apf) {
2053  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2054  sizeof(*s->synth_filter_out_buf) * s->lsps);
2055  memset(s->dcf_mem, 0,
2056  sizeof(*s->dcf_mem) * 2);
2057  memset(s->zero_exc_pf, 0,
2058  sizeof(*s->zero_exc_pf) * s->history_nsamples);
2059  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2060  }
2061 }
2062 
2063 AVCodec ff_wmavoice_decoder = {
2064  .name = "wmavoice",
2065  .type = AVMEDIA_TYPE_AUDIO,
2066  .id = CODEC_ID_WMAVOICE,
2067  .priv_data_size = sizeof(WMAVoiceContext),
2068  .init = wmavoice_decode_init,
2069  .close = wmavoice_decode_end,
2070  .decode = wmavoice_decode_packet,
2071  .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DR1,
2072  .flush = wmavoice_flush,
2073  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2074 };