wmavoice.c
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
28 #define UNCHECKED_BITSTREAM_READER 1
29 
30 #include <math.h>
31 #include "avcodec.h"
32 #include "get_bits.h"
33 #include "put_bits.h"
34 #include "wmavoice_data.h"
35 #include "celp_math.h"
36 #include "celp_filters.h"
37 #include "acelp_vectors.h"
38 #include "acelp_filters.h"
39 #include "lsp.h"
40 #include "libavutil/lzo.h"
41 #include "dct.h"
42 #include "rdft.h"
43 #include "sinewin.h"
44 
45 #define MAX_BLOCKS 8
46 #define MAX_LSPS 16
47 #define MAX_LSPS_ALIGN16 16
48 #define MAX_FRAMES 3
50 #define MAX_FRAMESIZE 160
51 #define MAX_SIGNAL_HISTORY 416
52 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
53 #define SFRAME_CACHE_MAXSIZE 256
55 #define VLC_NBITS 6
57 
58 
62 
66 enum {
67  ACB_TYPE_NONE = 0,
68  ACB_TYPE_ASYMMETRIC = 1,
69  ACB_TYPE_HAMMING = 2
74 };
77 
81 enum {
82  FCB_TYPE_SILENCE = 0,
83  FCB_TYPE_HARDCODED = 1,
86  FCB_TYPE_AW_PULSES = 2,
    FCB_TYPE_EXC_PULSES = 3
90 };
93 
97 static const struct frame_type_desc {
98  uint8_t n_blocks;
99  uint8_t log_n_blocks;
101  uint8_t acb_type;
102  uint8_t fcb_type;
103  uint8_t dbl_pulses;
104  uint16_t frame_size;
107 } frame_descs[17] = {
109  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
110  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
111  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
112  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
113  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
114  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
115  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
116  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
117  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
118  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
119  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
120  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
121  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
122  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
123  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
124  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
125  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
126 };
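/* Reading frame_descs (a rough guide): each of the 17 frame types gives the
 * number of blocks per 160-sample frame (and its log2), the adaptive and
 * fixed codebook types used for the excitation, how many of the five
 * fixed-codebook pulses are coded as pulse pairs (dbl_pulses), and the number
 * of bits the frame's block data occupies (frame_size), as counted in
 * check_bits_for_superframe(). For example, entry 3 describes a frame split
 * into 2 blocks of 80 samples, using an asymmetric-window adaptive codebook
 * plus 2 double / 3 single excitation pulses per block, in 80 bits. */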
127 
131 typedef struct {
    GetBitContext gb;              ///< packet bitreader; also used for the extradata VBM tree
138  int8_t vbm_tree[25];
    int spillover_bitsize;         ///< number of bits used to specify spillover_nbits in the packet header
142 
144  int history_nsamples;
147 
149  /* postfilter specific values */
150  int do_apf;
151  int denoise_strength;
153  int denoise_tilt_corr;
155  int dc_level;
157 
159  int lsps;
    int lsp_q_mode;                ///< LSP quantizer mode [0, 1]
    int lsp_def_mode;              ///< selects the set of LSP default (mean) values [0, 1]
162  int frame_lsp_bitsize;
164  int sframe_lsp_bitsize;
166 
    int min_pitch_val;             ///< base value for pitch parsing code
    int max_pitch_val;             ///< max value + 1 for pitch parsing code
    int pitch_nbits;               ///< number of bits used to specify the pitch value in the frame header
171  int block_pitch_nbits;
173  int block_pitch_range;
    int block_delta_pitch_nbits;   ///< number of bits used to specify a block's delta pitch, relative to the previous block
176  int block_delta_pitch_hrange;
180  uint16_t block_conv_table[4];
182 
    int spillover_nbits;           ///< number of bits of the previous packet's last superframe preceding this packet's first full superframe
194  int has_residual_lsps;
198  int skip_bits_next;
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
                                   ///< cache for superframe data split over multiple packets
    int sframe_cache_size;         ///< set to >0 if we have data from an (incomplete) superframe from a previous packet
203 
210  PutBitContext pb;
    AVFrame frame;                 ///< output frame handed back to the caller
215 
225  double prev_lsps[MAX_LSPS];
226  int last_pitch_val;
    int last_acb_type;             ///< adaptive codebook type of the previous frame
    int pitch_diff_sh16;           ///< ((cur_pitch_val - last_pitch_val) << 16) / MAX_FRAMESIZE
230  float silence_gain;
232 
    int aw_idx_is_ext;             ///< whether the AW index was encoded in 8 bits (instead of 6)
234  int aw_pulse_range;
236  int aw_n_pulses[2];
242  int aw_first_pulse_off[2];
245  int aw_next_pulse_off_cache;
247 
    int frame_cntr;                ///< current frame index [0 - 0xFFFE]
253  float gain_pred_err[6];
255  float excitation_history[MAX_SIGNAL_HISTORY];
259  float synth_history[MAX_LSPS];
260 
269  RDFTContext rdft, irdft;
270  DCTContext dct, dst;
272  float sin[511], cos[511];
274  float postfilter_agc;
276  float dcf_mem[2];
278  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
281  float denoise_filter_cache[MAX_FRAMESIZE];
282  int denoise_filter_cache_size;
283  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
285  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
287  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
290 
} WMAVoiceContext;
294 
static VLC frame_type_vlc;         ///< frame type VLC coding
304 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
305 {
306  static const uint8_t bits[] = {
307  2, 2, 2, 4, 4, 4,
308  6, 6, 6, 8, 8, 8,
309  10, 10, 10, 12, 12, 12,
310  14, 14, 14, 14
311  };
312  static const uint16_t codes[] = {
313  0x0000, 0x0001, 0x0002, // 00/01/10
314  0x000c, 0x000d, 0x000e, // 11+00/01/10
315  0x003c, 0x003d, 0x003e, // 1111+00/01/10
316  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
317  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
318  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
319  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
320  };
321  int cntr[8], n, res;
322 
323  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
324  memset(cntr, 0, sizeof(cntr));
325  for (n = 0; n < 17; n++) {
326  res = get_bits(gb, 3);
327  if (cntr[res] > 3) // should be >= 3 + (res == 7))
328  return -1;
329  vbm_tree[res * 3 + cntr[res]++] = n;
330  }
331  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
332  bits, 1, 1, codes, 2, 2, 132);
333  return 0;
334 }
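/* In other words: get_vlc2() later returns the index of the matched entry in
 * codes[] above (0-21), and vbm_tree[] (filled from the 17 3-bit fields read
 * here from the extradata) maps that index back to one of the 17 frame_descs
 * entries; slots left at 0xff (-1) mark codes with no assigned frame type,
 * which synth_frame() rejects as invalid. */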
335 
339 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
340 {
341  int n, flags, pitch_range, lsp16_flag;
342  WMAVoiceContext *s = ctx->priv_data;
343 
352  if (ctx->extradata_size != 46) {
353  av_log(ctx, AV_LOG_ERROR,
354  "Invalid extradata size %d (should be 46)\n",
355  ctx->extradata_size);
356  return -1;
357  }
358  flags = AV_RL32(ctx->extradata + 18);
359  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
360  s->do_apf = flags & 0x1;
361  if (s->do_apf) {
362  ff_rdft_init(&s->rdft, 7, DFT_R2C);
363  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
364  ff_dct_init(&s->dct, 6, DCT_I);
365  ff_dct_init(&s->dst, 6, DST_I);
366 
367  ff_sine_window_init(s->cos, 256);
368  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
369  for (n = 0; n < 255; n++) {
370  s->sin[n] = -s->sin[510 - n];
371  s->cos[510 - n] = s->cos[n];
372  }
373  }
374  s->denoise_strength = (flags >> 2) & 0xF;
375  if (s->denoise_strength >= 12) {
376  av_log(ctx, AV_LOG_ERROR,
377  "Invalid denoise filter strength %d (max=11)\n",
378  s->denoise_strength);
379  return -1;
380  }
381  s->denoise_tilt_corr = !!(flags & 0x40);
382  s->dc_level = (flags >> 7) & 0xF;
383  s->lsp_q_mode = !!(flags & 0x2000);
384  s->lsp_def_mode = !!(flags & 0x4000);
385  lsp16_flag = flags & 0x1000;
386  if (lsp16_flag) {
387  s->lsps = 16;
388  s->frame_lsp_bitsize = 34;
389  s->sframe_lsp_bitsize = 60;
390  } else {
391  s->lsps = 10;
392  s->frame_lsp_bitsize = 24;
393  s->sframe_lsp_bitsize = 48;
394  }
395  for (n = 0; n < s->lsps; n++)
396  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
397 
398  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
399  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
400  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
401  return -1;
402  }
403 
404  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
405  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
406  pitch_range = s->max_pitch_val - s->min_pitch_val;
407  if (pitch_range <= 0) {
408  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
409  return -1;
410  }
411  s->pitch_nbits = av_ceil_log2(pitch_range);
412  s->last_pitch_val = 40;
414  s->history_nsamples = s->max_pitch_val + 8;
415 
416  if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
417  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
418  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
419 
420  av_log(ctx, AV_LOG_ERROR,
421  "Unsupported samplerate %d (min=%d, max=%d)\n",
422  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
423 
424  return -1;
425  }
426 
427  s->block_conv_table[0] = s->min_pitch_val;
428  s->block_conv_table[1] = (pitch_range * 25) >> 6;
429  s->block_conv_table[2] = (pitch_range * 44) >> 6;
430  s->block_conv_table[3] = s->max_pitch_val - 1;
431  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
432  if (s->block_delta_pitch_hrange <= 0) {
433  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
434  return -1;
435  }
436  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
437  s->block_pitch_range = s->block_conv_table[2] +
438  s->block_conv_table[3] + 1 +
439  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
440  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
441 
442  ctx->sample_fmt = AV_SAMPLE_FMT_FLT;
443 
444  avcodec_get_frame_defaults(&s->frame);
445  ctx->coded_frame = &s->frame;
446 
447  return 0;
448 }
449 
471 static void adaptive_gain_control(float *out, const float *in,
472  const float *speech_synth,
473  int size, float alpha, float *gain_mem)
474 {
475  int i;
476  float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
477  float mem = *gain_mem;
478 
479  for (i = 0; i < size; i++) {
480  speech_energy += fabsf(speech_synth[i]);
481  postfilter_energy += fabsf(in[i]);
482  }
483  gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;
484 
485  for (i = 0; i < size; i++) {
486  mem = alpha * mem + gain_scale_factor;
487  out[i] = in[i] * mem;
488  }
489 
490  *gain_mem = mem;
491 }
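/* In effect, the per-sample gain follows a one-pole smoother:
 *   mem    = alpha * mem + (1 - alpha) * (E_speech / E_postfilter)
 *   out[i] = in[i] * mem
 * where E_speech and E_postfilter are the summed absolute amplitudes of the
 * original synthesis and of the postfiltered signal over this chunk, so mem
 * converges towards the energy ratio between the two signals. */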
492 
511 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
512  const float *in, float *out, int size)
513 {
514  int n;
515  float optimal_gain = 0, dot;
516  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
517  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
518  *best_hist_ptr;
519 
520  /* find best fitting point in history */
521  do {
522  dot = ff_dot_productf(in, ptr, size);
523  if (dot > optimal_gain) {
524  optimal_gain = dot;
525  best_hist_ptr = ptr;
526  }
527  } while (--ptr >= end);
528 
529  if (optimal_gain <= 0)
530  return -1;
531  dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
532  if (dot <= 0) // would be 1.0
533  return -1;
534 
535  if (optimal_gain <= dot) {
536  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
537  } else
538  dot = 0.625;
539 
540  /* actual smoothing */
541  for (n = 0; n < size; n++)
542  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
543 
544  return 0;
545 }
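/* The smoothing factor 'dot' computed above lies in [0.625, 1.0]:
 *   out[n] = dot * in[n] + (1 - dot) * best_hist_ptr[n]
 * so at most 37.5% of the best-matching pitch period from the excitation
 * history is mixed into the current excitation. */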
546 
557 static float tilt_factor(const float *lpcs, int n_lpcs)
558 {
559  float rh0, rh1;
560 
561  rh0 = 1.0 + ff_dot_productf(lpcs, lpcs, n_lpcs);
562  rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);
563 
564  return rh1 / rh0;
565 }
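/* rh0 and rh1 are the lag-0 and lag-1 autocorrelation terms of the sequence
 * {1.0, lpcs[0], ..., lpcs[n_lpcs-1]}, so the returned ratio is roughly a
 * normalized spectral-tilt estimate in (-1, 1); it feeds the
 * ff_tilt_compensation() calls in calc_input_response() and wiener_denoise(). */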
566 
570 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
571  int fcb_type, float *coeffs, int remainder)
572 {
573  float last_coeff, min = 15.0, max = -15.0;
574  float irange, angle_mul, gain_mul, range, sq;
575  int n, idx;
576 
577  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
578  s->rdft.rdft_calc(&s->rdft, lpcs);
579 #define log_range(var, assign) do { \
580  float tmp = log10f(assign); var = tmp; \
581  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
582  } while (0)
583  log_range(last_coeff, lpcs[1] * lpcs[1]);
584  for (n = 1; n < 64; n++)
585  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
586  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
587  log_range(lpcs[0], lpcs[0] * lpcs[0]);
588 #undef log_range
589  range = max - min;
590  lpcs[64] = last_coeff;
591 
592  /* Now, use this spectrum to pick out these frequencies with higher
593  * (relative) power/energy (which we then take to be "not noise"),
594  * and set up a table (still in lpc[]) of (relative) gains per frequency.
595  * These frequencies will be maintained, while others ("noise") will be
596  * decreased in the filter output. */
597  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
598  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
599  (5.0 / 14.7));
600  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
601  for (n = 0; n <= 64; n++) {
602  float pwr;
603 
604  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
605  pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
606  lpcs[n] = angle_mul * pwr;
607 
608  /* 70.57 =~ 1/log10(1.0331663) */
609  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
610  if (idx > 127) { // fallback if index falls outside table range
611  coeffs[n] = wmavoice_energy_table[127] *
612  powf(1.0331663, idx - 127);
613  } else
614  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
615  }
616 
617  /* calculate the Hilbert transform of the gains, which we do (since this
618  * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
619  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
620  * "moment" of the LPCs in this filter. */
621  s->dct.dct_calc(&s->dct, lpcs);
622  s->dst.dct_calc(&s->dst, lpcs);
623 
624  /* Split out the coefficient indexes into phase/magnitude pairs */
625  idx = 255 + av_clip(lpcs[64], -255, 255);
626  coeffs[0] = coeffs[0] * s->cos[idx];
627  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
628  last_coeff = coeffs[64] * s->cos[idx];
629  for (n = 63;; n--) {
630  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
631  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
632  coeffs[n * 2] = coeffs[n] * s->cos[idx];
633 
634  if (!--n) break;
635 
636  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
637  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
638  coeffs[n * 2] = coeffs[n] * s->cos[idx];
639  }
640  coeffs[1] = last_coeff;
641 
642  /* move into real domain */
643  s->irdft.rdft_calc(&s->irdft, coeffs);
644 
645  /* tilt correction and normalize scale */
646  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
647  if (s->denoise_tilt_corr) {
648  float tilt_mem = 0;
649 
650  coeffs[remainder - 1] = 0;
651  ff_tilt_compensation(&tilt_mem,
652  -1.8 * tilt_factor(coeffs, remainder - 1),
653  coeffs, remainder);
654  }
655  sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
656  for (n = 0; n < remainder; n++)
657  coeffs[n] *= sq;
658 }
659 
686 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
687  float *synth_pf, int size,
688  const float *lpcs)
689 {
690  int remainder, lim, n;
691 
692  if (fcb_type != FCB_TYPE_SILENCE) {
693  float *tilted_lpcs = s->tilted_lpcs_pf,
694  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
695 
696  tilted_lpcs[0] = 1.0;
697  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
698  memset(&tilted_lpcs[s->lsps + 1], 0,
699  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
700  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
701  tilted_lpcs, s->lsps + 2);
702 
703  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
704  * size is applied to the next frame. All input beyond this is zero,
705  * and thus all output beyond this will go towards zero, hence we can
706  * limit to min(size-1, 127-size) as a performance consideration. */
707  remainder = FFMIN(127 - size, size - 1);
708  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
709 
710  /* apply coefficients (in frequency spectrum domain), i.e. complex
711  * number multiplication */
712  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
713  s->rdft.rdft_calc(&s->rdft, synth_pf);
714  s->rdft.rdft_calc(&s->rdft, coeffs);
715  synth_pf[0] *= coeffs[0];
716  synth_pf[1] *= coeffs[1];
717  for (n = 1; n < 64; n++) {
718  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
719  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
720  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
721  }
722  s->irdft.rdft_calc(&s->irdft, synth_pf);
723  }
724 
725  /* merge filter output with the history of previous runs */
726  if (s->denoise_filter_cache_size) {
727  lim = FFMIN(s->denoise_filter_cache_size, size);
728  for (n = 0; n < lim; n++)
729  synth_pf[n] += s->denoise_filter_cache[n];
730  s->denoise_filter_cache_size -= lim;
731  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
732  sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
733  }
734 
735  /* move remainder of filter output into a cache for future runs */
736  if (fcb_type != FCB_TYPE_SILENCE) {
737  lim = FFMIN(remainder, s->denoise_filter_cache_size);
738  for (n = 0; n < lim; n++)
739  s->denoise_filter_cache[n] += synth_pf[size + n];
740  if (lim < remainder) {
741  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
742  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
743  s->denoise_filter_cache_size = remainder;
744  }
745  }
746 }
747 
768 static void postfilter(WMAVoiceContext *s, const float *synth,
769  float *samples, int size,
770  const float *lpcs, float *zero_exc_pf,
771  int fcb_type, int pitch)
772 {
773  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
774  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
775  *synth_filter_in = zero_exc_pf;
776 
777  assert(size <= MAX_FRAMESIZE / 2);
778 
779  /* generate excitation from input signal */
780  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
781 
782  if (fcb_type >= FCB_TYPE_AW_PULSES &&
783  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
784  synth_filter_in = synth_filter_in_buf;
785 
786  /* re-synthesize speech after smoothening, and keep history */
787  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
788  synth_filter_in, size, s->lsps);
789  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
790  sizeof(synth_pf[0]) * s->lsps);
791 
792  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
793 
794  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
795  &s->postfilter_agc);
796 
797  if (s->dc_level > 8) {
798  /* remove ultra-low frequency DC noise / highpass filter;
799  * coefficients are identical to those used in SIPR decoding,
800  * and very closely resemble those used in AMR-NB decoding. */
801  ff_acelp_apply_order_2_transfer_function(samples, samples,
802  (const float[2]) { -1.99997, 1.0 },
803  (const float[2]) { -1.9330735188, 0.93589198496 },
804  0.93980580475, s->dcf_mem, size);
805  }
806 }
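/* Summary of the chain above: the LPC zero filter turns the decoded speech
 * back into an excitation-like residual, which (for pulsed/voiced frame
 * types) is Kalman-smoothed against the pitch history, re-synthesized through
 * the LPC filter, Wiener-denoised in the frequency domain, level-matched to
 * the original synthesis by the adaptive gain control, and finally (if
 * dc_level > 8) high-pass filtered to remove low-frequency DC noise. */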
822 static void dequant_lsps(double *lsps, int num,
823  const uint16_t *values,
824  const uint16_t *sizes,
825  int n_stages, const uint8_t *table,
826  const double *mul_q,
827  const double *base_q)
828 {
829  int n, m;
830 
831  memset(lsps, 0, num * sizeof(*lsps));
832  for (n = 0; n < n_stages; n++) {
833  const uint8_t *t_off = &table[values[n] * num];
834  double base = base_q[n], mul = mul_q[n];
835 
836  for (m = 0; m < num; m++)
837  lsps[m] += base + mul * t_off[m];
838 
839  table += sizes[n] * num;
840  }
841 }
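/* i.e. a multi-stage (residual) vector dequantization:
 *   lsps[m] = sum over stages n of (base_q[n] + mul_q[n] * table_n[values[n] * num + m])
 * where table_n is the n-th stage's codebook of sizes[n] vectors of length num. */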
842 
854 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
855 {
856  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
857  static const double mul_lsf[4] = {
858  5.2187144800e-3, 1.4626986422e-3,
859  9.6179549166e-4, 1.1325736225e-3
860  };
861  static const double base_lsf[4] = {
862  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
863  M_PI * -3.3486e-2, M_PI * -5.7408e-2
864  };
865  uint16_t v[4];
866 
867  v[0] = get_bits(gb, 8);
868  v[1] = get_bits(gb, 6);
869  v[2] = get_bits(gb, 5);
870  v[3] = get_bits(gb, 5);
871 
872  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
873  mul_lsf, base_lsf);
874 }
875 
880 static void dequant_lsp10r(GetBitContext *gb,
881  double *i_lsps, const double *old,
882  double *a1, double *a2, int q_mode)
883 {
884  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
885  static const double mul_lsf[3] = {
886  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
887  };
888  static const double base_lsf[3] = {
889  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
890  };
891  const float (*ipol_tab)[2][10] = q_mode ?
892  wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
893  uint16_t interpol, v[3];
894  int n;
895 
896  dequant_lsp10i(gb, i_lsps);
897 
898  interpol = get_bits(gb, 5);
899  v[0] = get_bits(gb, 7);
900  v[1] = get_bits(gb, 6);
901  v[2] = get_bits(gb, 6);
902 
903  for (n = 0; n < 10; n++) {
904  double delta = old[n] - i_lsps[n];
905  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
906  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
907  }
908 
909  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
910  mul_lsf, base_lsf);
911 }
912 
916 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
917 {
918  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
919  static const double mul_lsf[5] = {
920  3.3439586280e-3, 6.9908173703e-4,
921  3.3216608306e-3, 1.0334960326e-3,
922  3.1899104283e-3
923  };
924  static const double base_lsf[5] = {
925  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
926  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
927  M_PI * -1.29816e-1
928  };
929  uint16_t v[5];
930 
931  v[0] = get_bits(gb, 8);
932  v[1] = get_bits(gb, 6);
933  v[2] = get_bits(gb, 7);
934  v[3] = get_bits(gb, 6);
935  v[4] = get_bits(gb, 7);
936 
937  dequant_lsps( lsps, 5, v, vec_sizes, 2,
938  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
939  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
940  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
941  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
942  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
943 }
944 
949 static void dequant_lsp16r(GetBitContext *gb,
950  double *i_lsps, const double *old,
951  double *a1, double *a2, int q_mode)
952 {
953  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
954  static const double mul_lsf[3] = {
955  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
956  };
957  static const double base_lsf[3] = {
958  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
959  };
960  const float (*ipol_tab)[2][16] = q_mode ?
961  wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
962  uint16_t interpol, v[3];
963  int n;
964 
965  dequant_lsp16i(gb, i_lsps);
966 
967  interpol = get_bits(gb, 5);
968  v[0] = get_bits(gb, 7);
969  v[1] = get_bits(gb, 7);
970  v[2] = get_bits(gb, 7);
971 
972  for (n = 0; n < 16; n++) {
973  double delta = old[n] - i_lsps[n];
974  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
975  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
976  }
977 
978  dequant_lsps( a2, 10, v, vec_sizes, 1,
979  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
980  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
981  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
982  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
983  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
984 }
985 
999 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1000  const int *pitch)
1001 {
1002  static const int16_t start_offset[94] = {
1003  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1004  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1005  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1006  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1007  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1008  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1009  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1010  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1011  };
1012  int bits, offset;
1013 
1014  /* position of pulse */
1015  s->aw_idx_is_ext = 0;
1016  if ((bits = get_bits(gb, 6)) >= 54) {
1017  s->aw_idx_is_ext = 1;
1018  bits += (bits - 54) * 3 + get_bits(gb, 2);
1019  }
1020 
1021  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1022  * the distribution of the pulses in each block contained in this frame. */
1023  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1024  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1025  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1026  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1027  offset += s->aw_n_pulses[0] * pitch[0];
1028  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1029  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1030 
1031  /* if continuing from a position before the block, reset position to
1032  * start of block (when corrected for the range over which it can be
1033  * spread in aw_pulse_set1()). */
1034  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1035  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1036  s->aw_first_pulse_off[1] -= pitch[1];
1037  if (start_offset[bits] < 0)
1038  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1039  s->aw_first_pulse_off[0] -= pitch[0];
1040  }
1041 }
1042 
1050 static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1051  int block_idx, AMRFixed *fcb)
1052 {
1053  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1054  uint16_t *use_mask = use_mask_mem + 2;
1055  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1056  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1057  * of idx are the position of the bit within a particular item in the
1058  * array (0 being the most significant bit, and 15 being the least
1059  * significant bit), and the remainder (>> 4) is the index in the
1060  * use_mask[]-array. This is faster and uses less memory than using an
1061  * 80-byte/80-int array. */
1062  int pulse_off = s->aw_first_pulse_off[block_idx],
1063  pulse_start, n, idx, range, aidx, start_off = 0;
1064 
1065  /* set offset of first pulse to within this block */
1066  if (s->aw_n_pulses[block_idx] > 0)
1067  while (pulse_off + s->aw_pulse_range < 1)
1068  pulse_off += fcb->pitch_lag;
1069 
1070  /* find range per pulse */
1071  if (s->aw_n_pulses[0] > 0) {
1072  if (block_idx == 0) {
1073  range = 32;
1074  } else /* block_idx = 1 */ {
1075  range = 8;
1076  if (s->aw_n_pulses[block_idx] > 0)
1077  pulse_off = s->aw_next_pulse_off_cache;
1078  }
1079  } else
1080  range = 16;
1081  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1082 
1083  /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1084  * in the range [pulse_off, pulse_off + s->aw_pulse_range]), and thus
1085  * we exclude that range from being pulsed again in this function. */
1086  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1087  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1088  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1089  if (s->aw_n_pulses[block_idx] > 0)
1090  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1091  int excl_range = s->aw_pulse_range; // always 16 or 24
1092  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1093  int first_sh = 16 - (idx & 15);
1094  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1095  excl_range -= first_sh;
1096  if (excl_range >= 16) {
1097  *use_mask_ptr++ = 0;
1098  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1099  } else
1100  *use_mask_ptr &= 0xFFFF >> excl_range;
1101  }
1102 
1103  /* find the 'aidx'th offset that is not excluded */
1104  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1105  for (n = 0; n <= aidx; pulse_start++) {
1106  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1107  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1108  if (use_mask[0]) idx = 0x0F;
1109  else if (use_mask[1]) idx = 0x1F;
1110  else if (use_mask[2]) idx = 0x2F;
1111  else if (use_mask[3]) idx = 0x3F;
1112  else if (use_mask[4]) idx = 0x4F;
1113  else return;
1114  idx -= av_log2_16bit(use_mask[idx >> 4]);
1115  }
1116  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1117  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1118  n++;
1119  start_off = idx;
1120  }
1121  }
1122 
1123  fcb->x[fcb->n] = start_off;
1124  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1125  fcb->n++;
1126 
1127  /* set offset for next block, relative to start of that block */
1128  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1129  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1130 }
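/* Example of the use_mask indexing used above: idx == 37 addresses
 * use_mask[37 >> 4] == use_mask[2], bit (0x8000 >> (37 & 15)), i.e. the 6th
 * most significant bit of that 16-bit word, which covers sample 37 of the
 * 80-sample half-frame. */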
1131 
1139 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1140  int block_idx, AMRFixed *fcb)
1141 {
1142  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1143  float v;
1144 
1145  if (s->aw_n_pulses[block_idx] > 0) {
1146  int n, v_mask, i_mask, sh, n_pulses;
1147 
1148  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1149  n_pulses = 3;
1150  v_mask = 8;
1151  i_mask = 7;
1152  sh = 4;
1153  } else { // 4 pulses, 1:sign + 2:index each
1154  n_pulses = 4;
1155  v_mask = 4;
1156  i_mask = 3;
1157  sh = 3;
1158  }
1159 
1160  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1161  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1162  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1163  s->aw_first_pulse_off[block_idx];
1164  while (fcb->x[fcb->n] < 0)
1165  fcb->x[fcb->n] += fcb->pitch_lag;
1166  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1167  fcb->n++;
1168  }
1169  } else {
1170  int num2 = (val & 0x1FF) >> 1, delta, idx;
1171 
1172  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1173  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1174  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1175  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1176  v = (val & 0x200) ? -1.0 : 1.0;
1177 
1178  fcb->no_repeat_mask |= 3 << fcb->n;
1179  fcb->x[fcb->n] = idx - delta;
1180  fcb->y[fcb->n] = v;
1181  fcb->x[fcb->n + 1] = idx;
1182  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1183  fcb->n += 2;
1184  }
1185 }
1186 
1200 static int pRNG(int frame_cntr, int block_num, int block_size)
1201 {
1202  /* array to simplify the calculation of z:
1203  * y = (x % 9) * 5 + 6;
1204  * z = (49995 * x) / y;
1205  * Since y only has 9 values, we can remove the division by using a
1206  * LUT and using FASTDIV-style divisions. For each of the 9 values
1207  * of y, we can rewrite z as:
1208  * z = x * (49995 / y) + x * ((49995 % y) / y)
1209  * In this table, each col represents one possible value of y, the
1210  * first number is 49995 / y, and the second is the FASTDIV variant
1211  * of 49995 % y / y. */
1212  static const unsigned int div_tbl[9][2] = {
1213  { 8332, 3 * 715827883U }, // y = 6
1214  { 4545, 0 * 390451573U }, // y = 11
1215  { 3124, 11 * 268435456U }, // y = 16
1216  { 2380, 15 * 204522253U }, // y = 21
1217  { 1922, 23 * 165191050U }, // y = 26
1218  { 1612, 23 * 138547333U }, // y = 31
1219  { 1388, 27 * 119304648U }, // y = 36
1220  { 1219, 16 * 104755300U }, // y = 41
1221  { 1086, 39 * 93368855U } // y = 46
1222  };
1223  unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1224  if (x >= 0xFFFF) x -= 0xFFFF; // max value of x is 8*1877+0xFFFE=0x13AA6,
1225  // so this is effectively a modulo (%)
1226  y = x - 9 * MULH(477218589, x); // x % 9
1227  z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1228  // z = x * 49995 / (y * 5 + 6)
1229  return z % (1000 - block_size);
1230 }
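/* Put differently, with x = (frame_cntr + 1877 * block_num) mod 0xFFFF, pRNG()
 * evaluates z = low 16 bits of (x * 49995 / ((x % 9) * 5 + 6)) and returns
 * z % (1000 - block_size), presumably so that reading block_size consecutive
 * samples starting at the returned index stays inside the (apparently
 * 1000-entry) wmavoice_std_codebook table. */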
1231 
1236 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1237  int block_idx, int size,
1238  const struct frame_type_desc *frame_desc,
1239  float *excitation)
1240 {
1241  float gain;
1242  int n, r_idx;
1243 
1244  assert(size <= MAX_FRAMESIZE);
1245 
1246  /* Set the offset from which we start reading wmavoice_std_codebook */
1247  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1248  r_idx = pRNG(s->frame_cntr, block_idx, size);
1249  gain = s->silence_gain;
1250  } else /* FCB_TYPE_HARDCODED */ {
1251  r_idx = get_bits(gb, 8);
1252  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1253  }
1254 
1255  /* Clear gain prediction parameters */
1256  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1257 
1258  /* Apply gain to hardcoded codebook and use that as excitation signal */
1259  for (n = 0; n < size; n++)
1260  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1261 }
1262 
1267 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1268  int block_idx, int size,
1269  int block_pitch_sh2,
1270  const struct frame_type_desc *frame_desc,
1271  float *excitation)
1272 {
1273  static const float gain_coeff[6] = {
1274  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1275  };
1276  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1277  int n, idx, gain_weight;
1278  AMRFixed fcb;
1279 
1280  assert(size <= MAX_FRAMESIZE / 2);
1281  memset(pulses, 0, sizeof(*pulses) * size);
1282 
1283  fcb.pitch_lag = block_pitch_sh2 >> 2;
1284  fcb.pitch_fac = 1.0;
1285  fcb.no_repeat_mask = 0;
1286  fcb.n = 0;
1287 
1288  /* For the other frame types, this is where we apply the innovation
1289  * (fixed) codebook pulses of the speech signal. */
1290  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1291  aw_pulse_set1(s, gb, block_idx, &fcb);
1292  aw_pulse_set2(s, gb, block_idx, &fcb);
1293  } else /* FCB_TYPE_EXC_PULSES */ {
1294  int offset_nbits = 5 - frame_desc->log_n_blocks;
1295 
1296  fcb.no_repeat_mask = -1;
1297  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1298  * (instead of double) for a subset of pulses */
1299  for (n = 0; n < 5; n++) {
1300  float sign;
1301  int pos1, pos2;
1302 
1303  sign = get_bits1(gb) ? 1.0 : -1.0;
1304  pos1 = get_bits(gb, offset_nbits);
1305  fcb.x[fcb.n] = n + 5 * pos1;
1306  fcb.y[fcb.n++] = sign;
1307  if (n < frame_desc->dbl_pulses) {
1308  pos2 = get_bits(gb, offset_nbits);
1309  fcb.x[fcb.n] = n + 5 * pos2;
1310  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1311  }
1312  }
1313  }
1314  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1315 
1316  /* Calculate gain for adaptive & fixed codebook signal.
1317  * see ff_amr_set_fixed_gain(). */
1318  idx = get_bits(gb, 7);
1319  fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
1320  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1321  acb_gain = wmavoice_gain_codebook_acb[idx];
1322  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1323  -2.9957322736 /* log(0.05) */,
1324  1.6094379124 /* log(5.0) */);
1325 
1326  gain_weight = 8 >> frame_desc->log_n_blocks;
1327  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1328  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1329  for (n = 0; n < gain_weight; n++)
1330  s->gain_pred_err[n] = pred_err;
1331 
1332  /* Calculation of adaptive codebook */
1333  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1334  int len;
1335  for (n = 0; n < size; n += len) {
1336  int next_idx_sh16;
1337  int abs_idx = block_idx * size + n;
1338  int pitch_sh16 = (s->last_pitch_val << 16) +
1339  s->pitch_diff_sh16 * abs_idx;
1340  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1341  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1342  idx = idx_sh16 >> 16;
1343  if (s->pitch_diff_sh16) {
1344  if (s->pitch_diff_sh16 > 0) {
1345  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1346  } else
1347  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1348  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1349  1, size - n);
1350  } else
1351  len = size;
1352 
1353  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1354  wmavoice_ipol1_coeffs, 17,
1355  idx, 9, len);
1356  }
1357  } else /* ACB_TYPE_HAMMING */ {
1358  int block_pitch = block_pitch_sh2 >> 2;
1359  idx = block_pitch_sh2 & 3;
1360  if (idx) {
1361  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1362  wmavoice_ipol2_coeffs, 4,
1363  idx, 8, size);
1364  } else
1365  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1366  sizeof(float) * size);
1367  }
1368 
1369  /* Interpolate ACB/FCB and use as excitation signal */
1370  ff_weighted_vector_sumf(excitation, excitation, pulses,
1371  acb_gain, fcb_gain, size);
1372 }
1373 
1390 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1391  int block_idx, int size,
1392  int block_pitch_sh2,
1393  const double *lsps, const double *prev_lsps,
1394  const struct frame_type_desc *frame_desc,
1395  float *excitation, float *synth)
1396 {
1397  double i_lsps[MAX_LSPS];
1398  float lpcs[MAX_LSPS];
1399  float fac;
1400  int n;
1401 
1402  if (frame_desc->acb_type == ACB_TYPE_NONE)
1403  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1404  else
1405  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1406  frame_desc, excitation);
1407 
1408  /* convert interpolated LSPs to LPCs */
1409  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1410  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1411  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1412  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1413 
1414  /* Speech synthesis */
1415  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1416 }
1417 
1433 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1434  float *samples,
1435  const double *lsps, const double *prev_lsps,
1436  float *excitation, float *synth)
1437 {
1438  WMAVoiceContext *s = ctx->priv_data;
1439  int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
1440  int pitch[MAX_BLOCKS], last_block_pitch;
1441 
1442  /* Parse frame type ("frame header"), see frame_descs */
1443  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1444 
1445  if (bd_idx < 0) {
1446  av_log(ctx, AV_LOG_ERROR,
1447  "Invalid frame type VLC code, skipping\n");
1448  return -1;
1449  }
1450 
1451  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1452 
1453  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1454  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1455  /* Pitch is provided per frame, which is interpreted as the pitch of
1456  * the last sample of the last block of this frame. We can interpolate
1457  * the pitch of other blocks (and even pitch-per-sample) by gradually
1458  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1459  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1460  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1461  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1462  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1463  if (s->last_acb_type == ACB_TYPE_NONE ||
1464  20 * abs(cur_pitch_val - s->last_pitch_val) >
1465  (cur_pitch_val + s->last_pitch_val))
1466  s->last_pitch_val = cur_pitch_val;
1467 
1468  /* pitch per block */
1469  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1470  int fac = n * 2 + 1;
1471 
1472  pitch[n] = (MUL16(fac, cur_pitch_val) +
1473  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1474  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1475  }
1476 
1477  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1478  s->pitch_diff_sh16 =
1479  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1480  }
1481 
1482  /* Global gain (if silence) and pitch-adaptive window coordinates */
1483  switch (frame_descs[bd_idx].fcb_type) {
1484  case FCB_TYPE_SILENCE:
1485  s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1486  break;
1487  case FCB_TYPE_AW_PULSES:
1488  aw_parse_coords(s, gb, pitch);
1489  break;
1490  }
1491 
1492  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1493  int bl_pitch_sh2;
1494 
1495  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1496  switch (frame_descs[bd_idx].acb_type) {
1497  case ACB_TYPE_HAMMING: {
1498  /* Pitch is given per block. Per-block pitches are encoded as an
1499  * absolute value for the first block, and then delta values
1500  * (relative to this value) for all subsequent blocks. The scale of
1501  * this pitch value is semi-logarithmic compared to its use in the
1502  * decoder, so we convert it to normal scale also. */
1503  int block_pitch,
1504  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1505  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1506  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1507 
1508  if (n == 0) {
1509  block_pitch = get_bits(gb, s->block_pitch_nbits);
1510  } else
1511  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1512  get_bits(gb, s->block_delta_pitch_nbits);
1513  /* Convert last_ so that any next delta is within _range */
1514  last_block_pitch = av_clip(block_pitch,
1515  s->block_delta_pitch_hrange,
1516  s->block_pitch_range -
1517  s->block_delta_pitch_hrange);
1518 
1519  /* Convert semi-log-style scale back to normal scale */
1520  if (block_pitch < t1) {
1521  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1522  } else {
1523  block_pitch -= t1;
1524  if (block_pitch < t2) {
1525  bl_pitch_sh2 =
1526  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1527  } else {
1528  block_pitch -= t2;
1529  if (block_pitch < t3) {
1530  bl_pitch_sh2 =
1531  (s->block_conv_table[2] + block_pitch) << 2;
1532  } else
1533  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1534  }
1535  }
1536  pitch[n] = bl_pitch_sh2 >> 2;
1537  break;
1538  }
1539 
1540  case ACB_TYPE_ASYMMETRIC: {
1541  bl_pitch_sh2 = pitch[n] << 2;
1542  break;
1543  }
1544 
1545  default: // ACB_TYPE_NONE has no pitch
1546  bl_pitch_sh2 = 0;
1547  break;
1548  }
1549 
1550  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1551  lsps, prev_lsps, &frame_descs[bd_idx],
1552  &excitation[n * block_nsamples],
1553  &synth[n * block_nsamples]);
1554  }
1555 
1556  /* Averaging projection filter, if applicable. Else, just copy samples
1557  * from synthesis buffer */
1558  if (s->do_apf) {
1559  double i_lsps[MAX_LSPS];
1560  float lpcs[MAX_LSPS];
1561 
1562  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1563  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1564  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1565  postfilter(s, synth, samples, 80, lpcs,
1566  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1567  frame_descs[bd_idx].fcb_type, pitch[0]);
1568 
1569  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1570  i_lsps[n] = cos(lsps[n]);
1571  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1572  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1573  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1574  frame_descs[bd_idx].fcb_type, pitch[0]);
1575  } else
1576  memcpy(samples, synth, 160 * sizeof(synth[0]));
1577 
1578  /* Cache values for next frame */
1579  s->frame_cntr++;
1580  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1581  s->last_acb_type = frame_descs[bd_idx].acb_type;
1582  switch (frame_descs[bd_idx].acb_type) {
1583  case ACB_TYPE_NONE:
1584  s->last_pitch_val = 0;
1585  break;
1586  case ACB_TYPE_ASYMMETRIC:
1587  s->last_pitch_val = cur_pitch_val;
1588  break;
1589  case ACB_TYPE_HAMMING:
1590  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1591  break;
1592  }
1593 
1594  return 0;
1595 }
1596 
1609 static void stabilize_lsps(double *lsps, int num)
1610 {
1611  int n, m, l;
1612 
1613  /* set minimum value for first, maximum value for last and minimum
1614  * spacing between LSF values.
1615  * Very similar to ff_set_min_dist_lsf(), but in double. */
1616  lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1617  for (n = 1; n < num; n++)
1618  lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1619  lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1620 
1621  /* reorder (looks like one-time / non-recursed bubblesort).
1622  * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1623  for (n = 1; n < num; n++) {
1624  if (lsps[n] < lsps[n - 1]) {
1625  for (m = 1; m < num; m++) {
1626  double tmp = lsps[m];
1627  for (l = m - 1; l >= 0; l--) {
1628  if (lsps[l] <= tmp) break;
1629  lsps[l + 1] = lsps[l];
1630  }
1631  lsps[l + 1] = tmp;
1632  }
1633  break;
1634  }
1635  }
1636 }
1637 
1647 static int check_bits_for_superframe(GetBitContext *orig_gb,
1648  WMAVoiceContext *s)
1649 {
1650  GetBitContext s_gb, *gb = &s_gb;
1651  int n, need_bits, bd_idx;
1652  const struct frame_type_desc *frame_desc;
1653 
1654  /* initialize a copy */
1655  init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1656  skip_bits_long(gb, get_bits_count(orig_gb));
1657  assert(get_bits_left(gb) == get_bits_left(orig_gb));
1658 
1659  /* superframe header */
1660  if (get_bits_left(gb) < 14)
1661  return 1;
1662  if (!get_bits1(gb))
1663  return -1; // WMAPro-in-WMAVoice superframe
1664  if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
1665  if (s->has_residual_lsps) { // residual LSPs (for all frames)
1666  if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1667  return 1;
1668  skip_bits_long(gb, s->sframe_lsp_bitsize);
1669  }
1670 
1671  /* frames */
1672  for (n = 0; n < MAX_FRAMES; n++) {
1673  int aw_idx_is_ext = 0;
1674 
1675  if (!s->has_residual_lsps) { // independent LSPs (per-frame)
1676  if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1677  skip_bits_long(gb, s->frame_lsp_bitsize);
1678  }
1679  bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1680  if (bd_idx < 0)
1681  return -1; // invalid frame type VLC code
1682  frame_desc = &frame_descs[bd_idx];
1683  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1684  if (get_bits_left(gb) < s->pitch_nbits)
1685  return 1;
1686  skip_bits_long(gb, s->pitch_nbits);
1687  }
1688  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1689  skip_bits(gb, 8);
1690  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1691  int tmp = get_bits(gb, 6);
1692  if (tmp >= 0x36) {
1693  skip_bits(gb, 2);
1694  aw_idx_is_ext = 1;
1695  }
1696  }
1697 
1698  /* blocks */
1699  if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1700  need_bits = s->block_pitch_nbits +
1701  (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1702  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1703  need_bits = 2 * !aw_idx_is_ext;
1704  } else
1705  need_bits = 0;
1706  need_bits += frame_desc->frame_size;
1707  if (get_bits_left(gb) < need_bits)
1708  return 1;
1709  skip_bits_long(gb, need_bits);
1710  }
1711 
1712  return 0;
1713 }
1714 
1735 static int synth_superframe(AVCodecContext *ctx, int *got_frame_ptr)
1736 {
1737  WMAVoiceContext *s = ctx->priv_data;
1738  GetBitContext *gb = &s->gb, s_gb;
1739  int n, res, n_samples = 480;
1740  double lsps[MAX_FRAMES][MAX_LSPS];
1741  const double *mean_lsf = s->lsps == 16 ?
1742  wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1743  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1744  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1745  float *samples;
1746 
1747  memcpy(synth, s->synth_history,
1748  s->lsps * sizeof(*synth));
1749  memcpy(excitation, s->excitation_history,
1750  s->history_nsamples * sizeof(*excitation));
1751 
1752  if (s->sframe_cache_size > 0) {
1753  gb = &s_gb;
1754  init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1755  s->sframe_cache_size = 0;
1756  }
1757 
1758  if ((res = check_bits_for_superframe(gb, s)) == 1) {
1759  *got_frame_ptr = 0;
1760  return 1;
1761  }
1762 
1763  /* First bit is speech/music bit, it differentiates between WMAVoice
1764  * speech samples (the actual codec) and WMAVoice music samples, which
1765  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1766  * the wild yet. */
1767  if (!get_bits1(gb)) {
1768  av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
1769  return -1;
1770  }
1771 
1772  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1773  if (get_bits1(gb)) {
1774  if ((n_samples = get_bits(gb, 12)) > 480) {
1775  av_log(ctx, AV_LOG_ERROR,
1776  "Superframe encodes >480 samples (%d), not allowed\n",
1777  n_samples);
1778  return -1;
1779  }
1780  }
1781  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1782  if (s->has_residual_lsps) {
1783  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1784 
1785  for (n = 0; n < s->lsps; n++)
1786  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1787 
1788  if (s->lsps == 10) {
1789  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1790  } else /* s->lsps == 16 */
1791  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1792 
1793  for (n = 0; n < s->lsps; n++) {
1794  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1795  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1796  lsps[2][n] += mean_lsf[n];
1797  }
1798  for (n = 0; n < 3; n++)
1799  stabilize_lsps(lsps[n], s->lsps);
1800  }
1801 
1802  /* get output buffer */
1803  s->frame.nb_samples = 480;
1804  if ((res = ctx->get_buffer(ctx, &s->frame)) < 0) {
1805  av_log(ctx, AV_LOG_ERROR, "get_buffer() failed\n");
1806  return res;
1807  }
1808  s->frame.nb_samples = n_samples;
1809  samples = (float *)s->frame.data[0];
1810 
1811  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1812  for (n = 0; n < 3; n++) {
1813  if (!s->has_residual_lsps) {
1814  int m;
1815 
1816  if (s->lsps == 10) {
1817  dequant_lsp10i(gb, lsps[n]);
1818  } else /* s->lsps == 16 */
1819  dequant_lsp16i(gb, lsps[n]);
1820 
1821  for (m = 0; m < s->lsps; m++)
1822  lsps[n][m] += mean_lsf[m];
1823  stabilize_lsps(lsps[n], s->lsps);
1824  }
1825 
1826  if ((res = synth_frame(ctx, gb, n,
1827  &samples[n * MAX_FRAMESIZE],
1828  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1829  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1830  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1831  *got_frame_ptr = 0;
1832  return res;
1833  }
1834  }
1835 
1836  /* Statistics? FIXME - we don't check for length, a slight overrun
1837  * will be caught by internal buffer padding, and anything else
1838  * will be skipped, not read. */
1839  if (get_bits1(gb)) {
1840  res = get_bits(gb, 4);
1841  skip_bits(gb, 10 * (res + 1));
1842  }
1843 
1844  *got_frame_ptr = 1;
1845 
1846  /* Update history */
1847  memcpy(s->prev_lsps, lsps[2],
1848  s->lsps * sizeof(*s->prev_lsps));
1849  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1850  s->lsps * sizeof(*synth));
1851  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1852  s->history_nsamples * sizeof(*excitation));
1853  if (s->do_apf)
1854  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1855  s->history_nsamples * sizeof(*s->zero_exc_pf));
1856 
1857  return 0;
1858 }
1859 
1867 static int parse_packet_header(WMAVoiceContext *s)
1868 {
1869  GetBitContext *gb = &s->gb;
1870  unsigned int res;
1871 
1872  if (get_bits_left(gb) < 11)
1873  return 1;
1874  skip_bits(gb, 4); // packet sequence number
1875  s->has_residual_lsps = get_bits1(gb);
1876  do {
1877  res = get_bits(gb, 6); // number of superframes per packet
1878  // (minus first one if there is spillover)
1879  if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1880  return 1;
1881  } while (res == 0x3F);
1882  s->spillover_nbits = get_bits(gb, s->spillover_bitsize);
1883 
1884  return 0;
1885 }
1886 
1902 static void copy_bits(PutBitContext *pb,
1903  const uint8_t *data, int size,
1904  GetBitContext *gb, int nbits)
1905 {
1906  int rmn_bytes, rmn_bits;
1907 
1908  rmn_bits = rmn_bytes = get_bits_left(gb);
1909  if (rmn_bits < nbits)
1910  return;
1911  if (nbits > pb->size_in_bits - put_bits_count(pb))
1912  return;
1913  rmn_bits &= 7; rmn_bytes >>= 3;
1914  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1915  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1916  avpriv_copy_bits(pb, data + size - rmn_bytes,
1917  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1918 }
1919 
1931 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1932  int *got_frame_ptr, AVPacket *avpkt)
1933 {
1934  WMAVoiceContext *s = ctx->priv_data;
1935  GetBitContext *gb = &s->gb;
1936  int size, res, pos;
1937 
1938  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1939  * header at each ctx->block_align bytes. However, Libav's ASF demuxer
1940  * feeds us ASF packets, which may concatenate multiple "codec" packets
1941  * in a single "muxer" packet, so we artificially emulate that by
1942  * capping the packet size at ctx->block_align. */
1943  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1944  if (!size) {
1945  *got_frame_ptr = 0;
1946  return 0;
1947  }
1948  init_get_bits(&s->gb, avpkt->data, size << 3);
1949 
1950  /* size == ctx->block_align is used to indicate whether we are dealing with
1951  * a new packet or a packet of which we already read the packet header
1952  * previously. */
1953  if (size == ctx->block_align) { // new packet header
1954  if ((res = parse_packet_header(s)) < 0)
1955  return res;
1956 
1957  /* If the packet header specifies a s->spillover_nbits, then we want
1958  * to push out all data of the previous packet (+ spillover) before
1959  * continuing to parse new superframes in the current packet. */
1960  if (s->spillover_nbits > 0) {
1961  if (s->sframe_cache_size > 0) {
1962  int cnt = get_bits_count(gb);
1963  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1964  flush_put_bits(&s->pb);
1965  s->sframe_cache_size += s->spillover_nbits;
1966  if ((res = synth_superframe(ctx, got_frame_ptr)) == 0 &&
1967  *got_frame_ptr) {
1968  cnt += s->spillover_nbits;
1969  s->skip_bits_next = cnt & 7;
1970  *(AVFrame *)data = s->frame;
1971  return cnt >> 3;
1972  } else
1973  skip_bits_long (gb, s->spillover_nbits - cnt +
1974  get_bits_count(gb)); // resync
1975  } else
1976  skip_bits_long(gb, s->spillover_nbits); // resync
1977  }
1978  } else if (s->skip_bits_next)
1979  skip_bits(gb, s->skip_bits_next);
1980 
1981  /* Try parsing superframes in current packet */
1982  s->sframe_cache_size = 0;
1983  s->skip_bits_next = 0;
1984  pos = get_bits_left(gb);
1985  if ((res = synth_superframe(ctx, got_frame_ptr)) < 0) {
1986  return res;
1987  } else if (*got_frame_ptr) {
1988  int cnt = get_bits_count(gb);
1989  s->skip_bits_next = cnt & 7;
1990  *(AVFrame *)data = s->frame;
1991  return cnt >> 3;
1992  } else if ((s->sframe_cache_size = pos) > 0) {
1993  /* rewind bit reader to start of last (incomplete) superframe... */
1994  init_get_bits(gb, avpkt->data, size << 3);
1995  skip_bits_long(gb, (size << 3) - pos);
1996  assert(get_bits_left(gb) == pos);
1997 
1998  /* ...and cache it for spillover in next packet */
1999  init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
2000  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
2001  // FIXME bad - just copy bytes as whole and add use the
2002  // skip_bits_next field
2003  }
2004 
2005  return size;
2006 }
2007 
2008 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
2009 {
2010  WMAVoiceContext *s = ctx->priv_data;
2011 
2012  if (s->do_apf) {
2013  ff_rdft_end(&s->rdft);
2014  ff_rdft_end(&s->irdft);
2015  ff_dct_end(&s->dct);
2016  ff_dct_end(&s->dst);
2017  }
2018 
2019  return 0;
2020 }
2021 
2022 static av_cold void wmavoice_flush(AVCodecContext *ctx)
2023 {
2024  WMAVoiceContext *s = ctx->priv_data;
2025  int n;
2026 
2027  s->postfilter_agc = 0;
2028  s->sframe_cache_size = 0;
2029  s->skip_bits_next = 0;
2030  for (n = 0; n < s->lsps; n++)
2031  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
2032  memset(s->excitation_history, 0,
2033  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2034  memset(s->synth_history, 0,
2035  sizeof(*s->synth_history) * MAX_LSPS);
2036  memset(s->gain_pred_err, 0,
2037  sizeof(s->gain_pred_err));
2038 
2039  if (s->do_apf) {
2040  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2041  sizeof(*s->synth_filter_out_buf) * s->lsps);
2042  memset(s->dcf_mem, 0,
2043  sizeof(*s->dcf_mem) * 2);
2044  memset(s->zero_exc_pf, 0,
2045  sizeof(*s->zero_exc_pf) * s->history_nsamples);
2046  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2047  }
2048 }
2049 
2050 AVCodec ff_wmavoice_decoder = {
2051  .name = "wmavoice",
2052  .type = AVMEDIA_TYPE_AUDIO,
2053  .id = CODEC_ID_WMAVOICE,
2054  .priv_data_size = sizeof(WMAVoiceContext),
2055  .init = wmavoice_decode_init,
2056  .close = wmavoice_decode_end,
2057  .decode = wmavoice_decode_packet,
2058  .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DR1,
2059  .flush = wmavoice_flush,
2060  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2061 };