From 17c5966045b463fde45418000b03c95eb5cd7e09 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Fri, 17 Feb 2012 16:09:21 -0500 Subject: Last updates for draft -11 - Draft updates - Updated code to produce and check test vectors - Making sure that the test vectors pass at all rates as well as for mono and stereo --- Makefile.draft | 2 +- celt/bands.c | 8 +- celt/celt.c | 2 +- configure.ac | 2 +- doc/build_draft.sh | 11 + doc/draft-ietf-codec-opus.xml | 1648 ++++++++++++++++++++++------------------- silk/dec_API.c | 20 +- silk/decoder_set_fs.c | 16 - src/opus_compare.c | 70 +- src/opus_decoder.c | 2 +- src/opus_demo.c | 164 +++- tests/run_vectors.sh | 43 +- 12 files changed, 1176 insertions(+), 812 deletions(-) diff --git a/Makefile.draft b/Makefile.draft index 0f084a5c..501f76eb 100644 --- a/Makefile.draft +++ b/Makefile.draft @@ -20,7 +20,7 @@ CFLAGS := -Drestrict= $(CFLAGS) ###################### END OF OPTIONS ###################### -CFLAGS += -DOPUS_VERSION='"0.9.8"' +CFLAGS += -DOPUS_VERSION='"0.9.9"' include silk_sources.mk include celt_sources.mk include opus_sources.mk diff --git a/celt/bands.c b/celt/bands.c index 1d49386c..68b36261 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -238,22 +238,22 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas celt_norm *X; opus_val16 prev1; opus_val16 prev2; - opus_val16 Ediff; + opus_val32 Ediff; opus_val16 r; int renormalize=0; prev1 = prev1logE[c*m->nbEBands+i]; prev2 = prev2logE[c*m->nbEBands+i]; - if (CnbEBands+i]); prev2 = MAX16(prev2,prev2logE[m->nbEBands+i]); } - Ediff = logE[c*m->nbEBands+i]-MIN16(prev1,prev2); + Ediff = EXTEND32(logE[c*m->nbEBands+i])-EXTEND32(MIN16(prev1,prev2)); Ediff = MAX16(0, Ediff); #ifdef FIXED_POINT if (Ediff < 16384) - r = 2*MIN16(16383,SHR32(celt_exp2(-Ediff),1)); + r = 2*MIN16(16383,SHR32(celt_exp2(-EXTRACT16(Ediff)),1)); else r = 0; if (LM==3) diff --git a/celt/celt.c b/celt/celt.c index 6c1eb6b8..ddf65fad 100644 --- a/celt/celt.c +++ 
b/celt/celt.c @@ -2392,7 +2392,7 @@ int celt_decode_with_ec(CELTDecoder * restrict st, const unsigned char *data, in dec = &_dec; } - if (Cmode->nbEBands;i++) oldBandE[i]=MAX16(oldBandE[i],oldBandE[st->mode->nbEBands+i]); diff --git a/configure.ac b/configure.ac index 112b99d2..c54bade9 100644 --- a/configure.ac +++ b/configure.ac @@ -9,7 +9,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) OPUS_MAJOR_VERSION=0 OPUS_MINOR_VERSION=9 -OPUS_MICRO_VERSION=8 +OPUS_MICRO_VERSION=9 OPUS_EXTRA_VERSION= OPUS_VERSION="$OPUS_MAJOR_VERSION.$OPUS_MINOR_VERSION.$OPUS_MICRO_VERSION$OPUS_EXTRA_VERSION" diff --git a/doc/build_draft.sh b/doc/build_draft.sh index 4d95574a..7809ee83 100755 --- a/doc/build_draft.sh +++ b/doc/build_draft.sh @@ -50,6 +50,17 @@ cat opus_source.tar.gz| base64 | tr -d '\n' | fold -w 64 | \ #echo '' >> opus_compare_escaped.c #echo '' >> opus_compare_escaped.c +echo '
' > testvectors_sha1 +echo '' >> testvectors_sha1 +echo '> testvectors_sha1 +(cd ../opus_testvectors; sha1sum *.bit *.dec) >> testvectors_sha1 +#cd opus_testvectors +#sha1sum *.bit *.dec >> ../testvectors_sha1 +#cd .. +echo ']]>' >> testvectors_sha1 +echo '' >> testvectors_sha1 +echo '
' >> testvectors_sha1 + echo running xml2rfc xml2rfc draft-ietf-codec-opus.xml draft-ietf-codec-opus.html & xml2rfc draft-ietf-codec-opus.xml diff --git a/doc/draft-ietf-codec-opus.xml b/doc/draft-ietf-codec-opus.xml index a6739a1b..448c2e93 100644 --- a/doc/draft-ietf-codec-opus.xml +++ b/doc/draft-ietf-codec-opus.xml @@ -2,7 +2,7 @@ - + Definition of the Opus Audio Codec @@ -53,7 +53,7 @@ - + General @@ -65,7 +65,7 @@ This document defines the Opus interactive speech and audio codec. Opus is designed to handle a wide range of interactive audio applications, including Voice over IP, videoconferencing, in-game chat, and even live, distributed music performances. -It scales from low bit-rate narrowband speech at 6 kb/s to very high quality +It scales from low bitrate narrowband speech at 6 kb/s to very high quality stereo music at 510 kb/s. Opus uses both linear prediction (LP) and the Modified Discrete Cosine Transform (MDCT) to achieve good compression of both speech and music. @@ -78,7 +78,7 @@ Opus uses both linear prediction (LP) and the Modified Discrete Cosine
The Opus codec is a real-time interactive audio codec designed to meet the requirements -described in . +described in . It is composed of a linear prediction (LP)-based layer and a Modified Discrete Cosine Transform (MDCT)-based layer. @@ -96,11 +96,11 @@ The primary normative part of this specification is provided by the source code in . Only the decoder portion of this software is normative, though a significant amount of code is shared by both the encoder and decoder. - -The decoder contains significant amounts of integer and fixed-point arithmetic - which must be performed exactly, including all rounding considerations, so any - useful specification must make extensive use of domain-specific symbolic - language to adequately define these operations. + provides a decoder conformance test. +The decoder contains a great deal of integer and fixed-point arithmetic which + must be performed exactly, including all rounding considerations, so any + useful specification requires domain-specific symbolic language to adequately + define these operations. Additionally, any conflict between the symbolic representation and the included reference implementation must be resolved. For the practical reasons of compatibility and @@ -112,7 +112,6 @@ For these reasons this RFC uses the reference implementation as the sole symbolic representation of the codec. - While the symbolic representation is unambiguous and complete it is not always the easiest way to understand the codec's operation. For this reason this document also describes significant parts of the codec in English and @@ -150,8 +149,8 @@ E.g., the text will explicitly indicate any shifts required after a Expressions, where included in the text, follow C operator rules and - precedence, with the exception that the syntax "x**y" is used to indicate x - raised to the power y. + precedence, with the exception that the syntax "x**y" indicates x raised to + the power y. 
The text also makes use of the following functions: @@ -279,7 +278,8 @@ The LP layer is based on the . It supports NB, MB, or WB audio and frame sizes from 10 ms to 60 ms, and requires an additional 5 ms look-ahead for noise shaping estimation. - A small additional delay (up to 1.2 ms) may be required for sampling rate conversion. +A small additional delay (up to 1.5 ms) may be required for sampling rate + conversion. Like Vorbis and many other modern codecs, SILK is inherently designed for variable-bitrate (VBR) coding, though the encoder can also produce constant-bitrate (CBR) streams. @@ -360,70 +360,75 @@ Although the LP layer is VBR, the bit allocation of the MDCT layer can produce The Opus codec includes a number of control parameters which can be changed dynamically during regular operation of the codec, without interrupting the audio stream from the encoder to the decoder. -These parameters only affect the encoder since any impact they have on the bit-stream is signalled -in-band such that a decoder can decode any Opus stream without any out-of-band signalling. Any Opus +These parameters only affect the encoder since any impact they have on the bit-stream is signaled +in-band such that a decoder can decode any Opus stream without any out-of-band signaling. Any Opus implementation can add or modify these control parameters without affecting interoperability. The most important encoder control parameters in the reference encoder are listed below. -
+
-Opus supports all bitrates from 6 kb/s to 510 kb/s. All other parameters being -equal, higher bit-rate results in higher quality. For a frame size of 20 ms, these +Opus supports all bitrates from 6 kb/s to 510 kb/s. All other parameters being +equal, higher bitrate results in higher quality. For a frame size of 20 ms, these are the bitrate "sweet spots" for Opus in various configurations: -8-12 kb/s for narrowband speech -16-20 kb/s for wideband speech -28-40 kb/s for fullband speech -48-64 kb/s for fullband mono music -64-128 kb/s for fullband stereo music +8-12 kb/s for NB speech, +16-20 kb/s for WB speech, +28-40 kb/s for FB speech, +48-64 kb/s for FB mono music, and +64-128 kb/s for FB stereo music.
-
+
-Opus can transmit either mono or stereo audio within one stream. When -decoding a mono stream in stereo, the left and right channels will be -identical and when decoding a stereo channel in mono, the mono output -will be the average of the encoded left and right channels. In some cases -it is desirable to encode a stereo input stream in mono (e.g. because the -bit-rate is insufficient for good quality stereo). The number of channels -encoded can be selected in real-time, but by default the reference encoder -attempts to make the best decision possible given the current bitrate. +Opus can transmit either mono or stereo frames within a single stream. +When decoding a mono frame in a stereo decoder, the left and right channels are + identical, and when decoding a stereo frame in a mono decoder, the mono output + is the average of the left and right channels. +In some cases, it is desirable to encode a stereo input stream in mono (e.g., + because the bitrate is too low to encode stereo with sufficient quality). +The number of channels encoded can be selected in real-time, but by default the + reference encoder attempts to make the best decision possible given the + current bitrate.
-
+
-The audio bandwidths supported by Opus are listed in -. Just like for the number of channels, -any decoder can decode audio encoded at any bandwidth. For example, any Opus -decoder operating at 8 kHz can decode a fullband Opus stream and any Opus decoder -operating at 48 kHz can decode a narrowband stream. Similarly, the reference encoder -can take a 48 kHz input signal and encode it in narrowband. The higher the audio -bandwidth, the higher the required bitrate to achieve acceptable quality. +The audio bandwidths supported by Opus are listed in + . +Just like for the number of channels, any decoder can decode audio encoded at + any bandwidth. +For example, any Opus decoder operating at 8 kHz can decode a FB Opus + frame, and any Opus decoder operating at 48 kHz can decode a NB frame. +Similarly, the reference encoder can take a 48 kHz input signal and + encode it as NB. +The higher the audio bandwidth, the higher the required bitrate to achieve + acceptable quality. The audio bandwidth can be explicitly specified in real-time, but by default -the reference encoder attempts to make the best bandwidth decision possible given -the current bitrate. + the reference encoder attempts to make the best bandwidth decision possible + given the current bitrate.
-
+
-Opus can encode frames of 2.5, 5, 10, 20, 40 or 60 ms. It can also combine -multiple frames into packets of up to 120 ms. Because of the overhead from -IP/UDP/RTP headers, sending fewer packets per second reduces the -bitrate, but increases latency and sensitivity to packet losses as -losing one packet constitutes a loss of a bigger chunk of audio -signal. Increasing the frame duration also slightly improves coding -efficiency, but the gain becomes small for frame sizes above 20 ms. For -this reason, 20 ms frames tend to be a good choice for most applications. +Opus can encode frames of 2.5, 5, 10, 20, 40 or 60 ms. +It can also combine multiple frames into packets of up to 120 ms. +For real-time applications, sending fewer packets per second reduces the + bitrate, since it reduces the overhead from IP, UDP, and RTP headers. +However, it increases latency and sensitivity to packet losses, as losing one + packet constitutes a loss of a bigger chunk of audio. +Increasing the frame duration also slightly improves coding efficiency, but the + gain becomes small for frame sizes above 20 ms. +For this reason, 20 ms frames are a good choice for most applications.
-
+
There are various aspects of the Opus encoding process where trade-offs can be made between CPU complexity and quality/bitrate. In the reference @@ -431,16 +436,17 @@ encoder, the complexity is selected using an integer from 0 to 10, where 0 is the lowest complexity and 10 is the highest. Examples of computations for which such trade-offs may occur are: -the filter order of the pitch analysis whitening filter the short-term noise shaping filter; +The order of the pitch analysis whitening filter, +The order of the short-term noise shaping filter, The number of states in delayed decision quantization of the -residual signal; +residual signal, and The use of certain bit-stream features such as variable time-frequency -resolution and pitch post-filter. +resolution and the pitch post-filter.
-
+
Audio codecs often exploit inter-frame correlations to reduce the bitrate at a cost in error propagation: after losing one packet @@ -451,21 +457,21 @@ choose a trade-off between bitrate and amount of error propagation.
-
+
- Another mechanism providing robustness against packet loss is the in- - band Forward Error Correction (FEC). Packets that are determined to + Another mechanism providing robustness against packet loss is the in-band + Forward Error Correction (FEC). Packets that are determined to contain perceptually important speech information, such as onsets or transients, are encoded again at a lower bitrate and this re-encoded information is added to a subsequent packet.
-
+
Opus is more efficient when operating with variable bitrate (VBR), which is -the default. However, in some (rare) applications, constant bit-rate (CBR) -is required. There are two main reasons to operate in CBR mode: +the default. However, in some (rare) applications, constant bitrate (CBR) +is required. There are two main reasons to operate in CBR mode: When the transport only supports a fixed size for each compressed frame When security is important and the input audio @@ -480,7 +486,7 @@ CBR due to the bit reservoir).
-
+
Discontinuous Transmission (DTX) reduces the bitrate during silence or background noise. When DTX is enabled, only one frame is encoded @@ -573,8 +579,8 @@ For example, configuration 0 has a 10 ms frame size and configuration 3 -One additional bit, labeled "s", is used to signal mono vs. stereo, with 0 - indicating mono and 1 indicating stereo. +One additional bit, labeled "s", signals mono vs. stereo, with 0 indicating + mono and 1 indicating stereo. @@ -606,19 +612,22 @@ This section describes how frames are packed according to each possible value
When a packet contains multiple VBR frames (i.e., code 2 or 3), the compressed - length of one or more of these frames is indicated with a one or two byte + length of one or more of these frames is indicated with a one- or two-byte sequence, with the meaning of the first byte as follows: 0: No frame (discontinuous transmission (DTX) or lost packet) - 1...251: Length of the frame in bytes 252...255: A second byte is needed. The total length is (len[1]*4)+len[0] + +The special length 0 indicates that no frame is available, either because it + was dropped during transmission by some intermediary or because the encoder + chose not to transmit it. +A length of 0 is valid for any Opus frame in any mode. + + The maximum representable length is 255*4+255=1275 bytes. For 20 ms frames, this represents a bitrate of 510 kb/s, which is @@ -691,7 +700,7 @@ The number of payload bytes available for compressed data, N-1, MUST be even
-For code 2 packets, the TOC byte is followed by a one or two byte sequence +For code 2 packets, the TOC byte is followed by a one- or two-byte sequence indicating the length of the first frame (marked N1 in the figure below), followed by N1 bytes of compressed data for the first frame. The remaining N-N1-2 or N-N1-3 bytes are the compressed data for the @@ -703,7 +712,7 @@ For example, a 1-byte code 2 packet is always invalid, and a 2-byte code 2 The length of the first frame, N1, MUST also be no larger than the size of the payload remaining after decoding that length for all code 2 packets. This makes, for example, a 2-byte code 2 packet with a second byte in the range - 1...250 invalid as well (the only valid 2-byte code 2 packet is one where the + 1...251 invalid as well (the only valid 2-byte code 2 packet is one where the length of both frames is zero).
@@ -773,7 +782,7 @@ Then P MUST be no more than N-2. In the CBR case, the compressed length of each frame in bytes is equal to the number of remaining bytes in the packet after subtracting the (optional) padding, (N-2-P), divided by M. -This number MUST be an integer multiple of M. +This number MUST be a non-negative integer multiple of M. The compressed data for all M frames then follows, each of size (N-2-P)/M bytes, as illustrated in . @@ -809,7 +818,7 @@ The compressed data for all M frames then follows, each of size In the VBR case, the (optional) padding length is followed by M-1 frame lengths (indicated by "N1" to "N[M-1]" in the figure below), each encoded in a - one or two byte sequence as described above. + one- or two-byte sequence as described above. The packet MUST contain enough data for the M-1 lengths after removing the (optional) padding, and the sum of these lengths MUST be no larger than the number of bytes remaining in the packet after decoding them. @@ -933,7 +942,7 @@ These constraints are summarized here for reference: The length of a CBR code 3 packet, N, is at least two bytes, the size of the padding, P (including both the padding length bytes in the header and the trailing padding bytes) is no more than N-2, and the frame count, M, satisfies - the constraint that (N-2-P) is an integer multiple of M. + the constraint that (N-2-P) is a non-negative integer multiple of M. VBR code 3 packets are large enough to contain all the header bytes (TOC byte, frame count byte, any padding length bytes, and any frame length bytes), plus the length of the first M-1 frames, plus any trailing padding bytes. @@ -1020,27 +1029,27 @@ The parameters needed to encode or decode symbol k in this context are represented by a three-tuple (fl[k], fh[k], ft), with 0 <= fl[k] < fh[k] <= ft <= 65535. The values of this tuple are derived from the probability model for the - symbol, represented by traditional "frequency counts". 
Because Opus - uses static contexts these are not updated as symbols are decoded. + symbol, represented by traditional "frequency counts". +Because Opus uses static contexts these are not updated as symbols are decoded. Let f[i] be the frequency of symbol i. Then the three-tuple corresponding to symbol k is given by
The range decoder extracts the symbols and integers encoded using the range encoder in . The range decoder maintains an internal state vector composed of the two-tuple - (val,rng), representing the difference between the high end of the current - range and the actual coded value, minus one, and the size of the current - range, respectively. + (val, rng), representing the difference between the high end of the + current range and the actual coded value, minus one, and the size of the + current range, respectively. Both val and rng are 32-bit unsigned integer values. The decoder initializes rng to 128 and initializes val to 127 minus the top 7 bits of the first input octet. @@ -1062,7 +1071,9 @@ The second step updates the range decoder state with the three-tuple The first step is implemented by ec_decode() (entdec.c), which computes
The divisions here are exact integer division. @@ -1074,19 +1085,25 @@ The decoder then identifies the symbol in the current context corresponding to It uses this tuple to update val according to
If fl[k] is greater than zero, then the decoder updates rng using
Otherwise, it updates rng using
@@ -1169,15 +1186,15 @@ The reference implementation uses three additional decoding methods that are exactly equivalent to the above, but make assumptions and simplifications that allow for a more efficient implementation. -
+
The first is ec_decode_bin() (entdec.c), defined using the parameter ftb instead of ft. It is mathematically equivalent to calling ec_decode() with - ft = (1<<ftb), but avoids one of the divisions. + ft = (1<<ftb), but avoids one of the divisions.
-
+
The next is ec_dec_bit_logp() (entdec.c), which decodes a single binary symbol, replacing both the ec_decode() and ec_dec_update() steps. @@ -1185,16 +1202,17 @@ The context is described by a single parameter, logp, which is the absolute value of the base-2 logarithm of the probability of a "1". It is mathematically equivalent to calling ec_decode() with ft = (1<<logp), followed by ec_dec_update() with - the 3-tuple (fl[k] = 0, fh[k] = (1<<logp)-1, + the 3-tuple (fl[k] = 0, + fh[k] = (1<<logp) - 1, ft = (1<<logp)) if the returned value - of fs is less than (1<<logp)-1 (a "0" was decoded), and with - (fl[k] = (1<<logp)-1, + of fs is less than (1<<logp) - 1 (a "0" was decoded), and with + (fl[k] = (1<<logp) - 1, fh[k] = ft = (1<<logp)) otherwise (a "1" was decoded). The implementation requires no multiplications or divisions.
-
+
The last is ec_dec_icdf() (entdec.c), which decodes a single symbol with a table-based context of up to 8 bits, also replacing both the ec_decode() and @@ -1203,7 +1221,7 @@ The context is described by two parameters, an icdf ("inverse" cumulative distribution function) table and ftb. As with ec_decode_bin(), (1<<ftb) is equivalent to ft. idcf[k], on the other hand, stores (1<<ftb)-fh[k], which is equal to - (1<<ftb)-fl[k+1]. + (1<<ftb) - fl[k+1]. fl[0] is assumed to be 0, and the table is terminated by a value of 0 (where fh[k] == ft). @@ -1211,9 +1229,10 @@ fl[0] is assumed to be 0, and the table is terminated by a value of 0 (where The function is mathematically equivalent to calling ec_decode() with ft = (1<<ftb), using the returned value fs to search the table for the first entry where fs < (1<<ftb)-icdf[k], and - calling ec_dec_update() with fl[k] = (1<<ftb)-icdf[k-1] (or 0 - if k == 0), fh[k] = (1<<ftb)-idcf[k], and - ft = (1<<ftb). + calling ec_dec_update() with + fl[k] = (1<<ftb) - icdf[k-1] (or 0 + if k == 0), fh[k] = (1<<ftb) - idcf[k], + and ft = (1<<ftb). Combining the search with the update allows the division to be replaced by a series of multiplications (which are usually much cheaper), and using an inverse CDF allows the use of an ftb as large as 8 in an 8-bit table without @@ -1227,7 +1246,7 @@ Although icdf[k] is more convenient for the code, the frequency counts, f[k], (PDF) for a given symbol. Therefore this draft lists the latter, not the former, when describing the context in which a symbol is coded as a list, e.g., {4, 4, 4, 4}/16 for a - uniform context with four possible values and ft=16. + uniform context with four possible values and ft = 16. The value of ft after the slash is always the sum of the entries in the PDF, but is included for convenience. Contexts with identical probabilities, f[k]/ft, but different values of ft @@ -1262,40 +1281,52 @@ The format should render it impossible to attempt to read more raw bits than
-The ec_dec_uint() (entdec.c) function decodes one of ft equiprobable values in - the range 0 to ft-1, inclusive, each with a frequency of 1, where ft may be as - large as 2**32-1. -Because ec_decode() is limited to a total frequency of 2**16-1, this is split - up into a range coded symbol representing up to 8 of the high bits of the - value, and, if necessary, raw bits representing the remaining bits. +The function ec_dec_uint() (entdec.c) decodes one of ft equiprobable values in + the range 0 to (ft - 1), inclusive, each with a frequency of 1, + where ft may be as large as (2**32 - 1). +Because ec_decode() is limited to a total frequency of (2**16 - 1), + it splits up the value into a range coded symbol representing up to 8 of the + high bits, and, if necessary, raw bits representing the remainder of the + value. The limit of 8 bits in the range coded symbol is a trade-off between implementation complexity, modeling error (since the symbols no longer truly have equal coding cost), and rounding error introduced by the range coder itself (which gets larger as more bits are included). Using raw bits reduces the maximum number of divisions required in the worst case, but means that it may be possible to decode a value outside the range - 0 to ft-1, inclusive. + 0 to (ft - 1), inclusive. ec_dec_uint() takes a single, positive parameter, ft, which is not necessarily a power of two, and returns an integer, t, whose value lies between 0 and - ft-1, inclusive. -Let ftb = ilog(ft-1), i.e., the number of bits required to store ft-1 in two's - complement notation. -If ftb is 8 or less, then t is decoded with t = ec_decode(ft), and the range - coder state is updated using the three-tuple (t,t+1,ft). + (ft - 1), inclusive. +Let ftb = ilog(ft - 1), i.e., the number of bits required + to store (ft - 1) in two's complement notation. +If ftb is 8 or less, then t is decoded with t = ec_decode(ft), and + the range coder state is updated using the three-tuple (t, t + 1, + ft). 
If ftb is greater than 8, then the top 8 bits of t are decoded using - t = ec_decode((ft-1>>ftb-8)+1), +
+<![CDATA[ +t = ec_decode(((ft - 1) >> (ftb - 8)) + 1) , +]]> +
the decoder state is updated using the three-tuple - (t,t+1,(ft-1>>ftb-8)+1), and the remaining bits are decoded as raw bits, - setting t = t<<ftb-8|ec_dec_bits(ftb-8). + (t, t + 1, + ((ft - 1) >> (ftb - 8)) + 1), + and the remaining bits are decoded as raw bits, setting +
+<![CDATA[ +t = (t << (ftb - 8)) | ec_dec_bits(ftb - 8) . +]]> +
If, at this point, t >= ft, then the current frame is corrupt. In that case, the decoder should assume there has been an error in the coding, decoding, or transmission and SHOULD take measures to conceal the - error and/or report to the application that a problem has occurred. + error and/or report to the application that the error has occurred.
@@ -1329,8 +1360,8 @@ However, this error is bounded, and periodic calls to ec_tell() or ec_tell_frac() at precisely defined points in the decoding process prevent it from accumulating. For a range coder symbol that requires a whole number of bits (i.e., - for which ft/(fh[k]-fl[k]) is a power of two), where there are at least p - 1/8th bits available, decoding the symbol will never cause ec_tell() or + for which ft/(fh[k] - fl[k]) is a power of two), where there are at + least p 1/8th bits available, decoding the symbol will never cause ec_tell() or ec_tell_frac() to exceed the size of the frame ("bust the budget"). In this case the return value of ec_tell_frac() will only advance by more than p 1/8th bits if there was an additional, fractional number of bits remaining, @@ -1429,9 +1460,9 @@ When used in a SWB or FB Hybrid frame, the LP layer itself still only runs in
-An overview of the decoder is given in . +An overview of the decoder is given in . -
+
. | 6 | +------------+ +-------------+ +-->| Stereo |-->| Sample Rate |--> - 8 | Unmixing | 7 | Conversion | 8 + | Unmixing | 7 | Conversion | 8 +------------+ +-------------+ 1: Range encoded bitstream @@ -1463,7 +1494,6 @@ An overview of the decoder is given in . 8: Resampled signal ]]> -Decoder block diagram.
@@ -1540,7 +1570,8 @@ Figures  mono and stereo, respectively. - + Symbol(s) PDF(s) Condition @@ -1565,9 +1596,6 @@ Figures  - -Organization of the SILK layer of an Opus frame. -
) follows SILK frame. - + Symbol(s) PDF(s) Condition @@ -1803,7 +1832,7 @@ The quantized excitation signal (see ) follows Normalized LSF Interpolation Weight - +20 ms frame Primary Pitch Lag @@ -1847,11 +1876,8 @@ The quantized excitation signal (see ) follows Excitation Signs - + - -Order of the symbols in an individual SILK frame. -
). + (see ). @@ -2115,6 +2141,26 @@ The 3 least significant bits are decoded using a uniform PDF: {32, 32, 32, 32, 32, 32, 32, 32}/256 + +These 6 bits are combined to form a gain index between 0 and 63. +When the gain for the previous subframe is available, then the current gain is + limited as follows: +
+ +
+This may help some implementations limit the change in precision of their + internal LTP history. +The indices which this clamp applies to cannot simply be removed from the + codebook, because the previous gain index will not be available after packet + loss. +This step is skipped after a decoder reset, and in the side channel if the + previous frame in the side channel was not coded, since there is no previous + gain index. +It MAY also be skipped after packet loss. +
+ For subframes which do not have an independent gain (including the first subframe of frames not listed as using independent coding above), the @@ -2137,12 +2183,10 @@ The following formula translates this index into a quantization gain for the current subframe using the gain from the previous subframe:
-The value here is not clamped at 0, and may reach values as low as -16 over the - course of consecutive subframes within a single Opus frame.
silk_gains_dequant() (gain_quant.c) dequantizes log_gain for the k'th subframe @@ -2158,21 +2202,15 @@ The function silk_log2lin() (log2lin.c) computes an approximation of 2**(inLog_Q7/128.0), where inLog_Q7 is its Q7 input. Let i = inLog_Q7>>7 be the integer part of inLogQ7 and f = inLog_Q7&127 be the fractional part. -If i < 16, then +Then
 <![CDATA[ -(1<<i) + (((-174*f*(128-f)>>16)+f)>>7)*(1<<i) +(1<<i) + ((-174*f*(128-f)>>16)+f)*((1<<i)>>7) ]]>
yields the approximate exponential. -Otherwise, silk_log2lin uses -
->16)+f)*((1<>7) . -]]> -
-The final Q16 gain values lies between 4096 and 1686110208, inclusive - (representing scale factors of 0.0625 to 25728, respectively). +The final Q16 gain values lies between 81920 and 1686110208, inclusive + (representing scale factors of 1.25 to 25728, respectively).
@@ -2399,7 +2437,7 @@ Which PDF is used for which coefficient is driven by the index, I1, i  o  k  o  o  m  n  m  o  n  m  m  n  l  l  l 9 k  j  i  i  i  i  i  i  i  i  i  i  i  i  i  i -j0 +10 i  j  i  i  i  i  i  i  i  i  i  i  i  i  i  j 11 k  k  l  m  n  l  l  l  l  l  l  l  k  k  j  l @@ -2516,7 +2554,7 @@ Then, the stage-2 residual for each coefficient is computed via
 <![CDATA[ res_Q10[k] = (k+1 < d_LPC ? (res_Q10[k+1]*pred_Q8[k])>>8 : 0) - + ((((I2[k]<<10) + sign(I2[k])*102)*qstep)>>16) , + + ((((I2[k]<<10) - sign(I2[k])*102)*qstep)>>16) , ]]>
where qstep is the Q16 quantization step size, which is 11796 for NB and MB @@ -2589,7 +2627,7 @@ res_Q10[k] = (k+1 < d_LPC ? (res_Q10[k+1]*pred_Q8[k])>>8 : 0) 28 A A B A B B A B A 29 -A A A B A A A A A +B A A B A A A A A 30 A A A B B A B A B 31 @@ -2613,7 +2651,7 @@ res_Q10[k] = (k+1 < d_LPC ? (res_Q10[k+1]*pred_Q8[k])>>8 : 0) 4 C  D  D  C  D  C  D  D  C  D  D  D  D  D  C 5 -C  D  C  C  C  C  C  C  C  C  C  C  C  C  C +C  C  D  C  C  C  C  C  C  C  C  C  C  C  C 6 D  C  C  C  C  C  C  C  C  C  C  D  C  D  C 7 @@ -2867,7 +2905,8 @@ Given the stage-1 codebook entry cb1_Q8[], the stage-2 residual res_Q10[], and coefficients are
where the division is exact integer division. @@ -2883,7 +2922,6 @@ The next section describes a stabilization procedure used to make these
- The normalized LSF stabilization procedure is implemented in silk_NLSF_stabilize() (NLSF_stabilize.c). @@ -2994,9 +3032,13 @@ For 20 ms SILK frames, the first half of the frame (i.e., the first two A Q2 interpolation factor follows the LSF coefficient indices in the bitstream, which is decoded using the PDF in . This happens in silk_decode_indices() (decode_indices.c). -For the first frame after a decoder reset (see ), - when no prior LSF coefficients are available, the decoder still decodes this - factor, but ignores its value and always uses 4 instead. +After either + +An uncoded regular SILK frame in the side channel, or +A decoder reset (see ), + + the decoder still decodes this factor, but ignores its value and always uses + 4 instead. For 10 ms SILK frames, this factor is not stored at all. @@ -3114,88 +3156,88 @@ Let i = (n[k] >> 8) be the integer index and Then the re-ordered, approximated cosine, c_Q17[ordering[k]], is
> 4 , +c_Q17[ordering[k]] = (cos_Q12[i]*256 + + (cos_Q12[i+1]-cos_Q12[i])*f + 4) >> 3 , ]]>
where ordering[k] is the k'th entry of the column of corresponding to the current audio - bandwidth and cos_Q13[i] is the i'th entry of . + bandwidth and cos_Q12[i] is the i'th entry of . + title="Q12 Cosine Table for LSF Conversion"> i +0 +1 +2 +3 0 - 8192 8190 8182 8170 + 4096 4095 4091 4085 4 - 8152 8130 8104 8072 + 4076 4065 4052 4036 8 - 8034 7994 7946 7896 + 4017 3997 3973 3948 12 - 7840 7778 7714 7644 + 3920 3889 3857 3822 16 - 7568 7490 7406 7318 + 3784 3745 3703 3659 20 - 7226 7128 7026 6922 + 3613 3564 3513 3461 24 - 6812 6698 6580 6458 + 3406 3349 3290 3229 28 - 6332 6204 6070 5934 + 3166 3102 3035 2967 32 - 5792 5648 5502 5352 + 2896 2824 2751 2676 36 - 5198 5040 4880 4718 + 2599 2520 2440 2359 40 - 4552 4382 4212 4038 + 2276 2191 2106 2019 44 - 3862 3684 3502 3320 + 1931 1842 1751 1660 48 - 3136 2948 2760 2570 + 1568 1474 1380 1285 52 - 2378 2186 1990 1794 + 1189 1093 995 897 56 - 1598 1400 1202 1002 + 799 700 601 501 60 - 802 602 402 202 + 401 301 201 101 64 - 0 -202 -402 -602 + 0 -101 -201 -301 68 - -802-1002-1202-1400 + -401 -501 -601 -700 72 --1598-1794-1990-2186 + -799 -897 -995 -1093 76 --2378-2570-2760-2948 +-1189-1285-1380-1474 80 --3136-3320-3502-3684 +-1568-1660-1751-1842 84 --3862-4038-4212-4382 +-1931-2019-2106-2191 88 --4552-4718-4880-5040 +-2276-2359-2440-2520 92 --5198-5352-5502-5648 +-2599-2676-2751-2824 96 --5792-5934-6070-6204 +-2896-2967-3035-3102 100 --6332-6458-6580-6698 +-3166-3229-3290-3349 104 --6812-6922-7026-7128 +-3406-3461-3513-3564 108 --7226-7318-7406-7490 +-3613-3659-3703-3745 112 --7568-7644-7714-7778 +-3784-3822-3857-3889 116 --7840-7896-7946-7994 +-3920-3948-3973-3997 120 --8034-8072-8104-8130 +-4017-4036-4052-4065 124 --8152-8170-8182-8190 +-4076-4085-4091-4095 128 --8192 +-4096 @@ -3310,7 +3352,7 @@ After 10 rounds of bandwidth expansion are performed, they are simply saturated to 16 bits:
> 5, 32767) << 5 . +a32_Q17[k] = clamp(-32768, (a32_Q17[k] + 16) >> 5, 32767) << 5 . ]]>
Because this performs the actual saturation in the Q12 domain, but converts the @@ -3418,7 +3460,7 @@ a32_Q24[k-1][n] = (num_Q24[k-1][n]*gain_Qb1[k] + (1<<(b1[k]-1))) >> b1[k] , ]]>
- where 0 <= n < k-1. + where 0 <= n < k. Here, rc_Q30[k] are the reflection coefficients. div_Q30[k] is the denominator for each iteration, and gain_Qb1[k] is its multiplicative inverse (with b1[k] fractional bits, where b1[k] ranges from @@ -3551,11 +3593,11 @@ If the resulting value is zero, it falls back to the absolute coding procedure Otherwise, the final primary pitch lag is then
- where lag_prev is the primary pitch lag from the most recent frame in the same - channel and delta_lag_index is the value just decoded. + where previous_lag is the primary pitch lag from the most recent frame in the + same channel and delta_lag_index is the value just decoded. This allows a per-frame change in the pitch lag of -8 to +11 samples. The decoder does no clamping at this point, so this value can fall outside the range of 2 ms to 18 ms, and the decoder must use this unclamped @@ -3953,7 +3995,7 @@ Frames that do not code the scaling parameter use the default factor of 15565 As described in , SILK uses a linear congruential generator (LCG) to inject pseudorandom noise into the - quantized excitation + quantized excitation. To ensure synchronization of this process between the encoder and decoder, each SILK frame stores a 2-bit seed after the LTP parameters (if any). The encoder may consider the choice of seed during quantization, and the @@ -4238,8 +4280,10 @@ After the decoder reads the pulse locations for all blocks, it reads the LSBs (if any) for each block in turn. Inside each block, it reads all the LSBs for each coefficient in turn, even those where no pulses were allocated, before proceeding to the next one. -They are coded from most significant to least significant, and they all use the - PDF in . +For 10 ms MB frames, it reads LSBs even for the extra 8 samples in + the last block. +The LSBs are coded from most significant to least significant, and they all use + the PDF in . 
@@ -4348,13 +4392,13 @@ The constant quantization offset varies depending on the signal type and title="Excitation Quantization Offsets"> Signal Type Quantization Offset Type -Quantization Offset (Q25) -Inactive Low 100 -Inactive High 240 -Unvoiced Low 100 -Unvoiced High 240 -Voiced Low 32 -Voiced High 100 +Quantization Offset (Q23) +Inactive Low 25 +Inactive High 60 +Unvoiced Low 25 +Unvoiced High 60 +Voiced Low 8 +Voiced High 25 @@ -4367,23 +4411,22 @@ Additionally, let seed be the current pseudorandom seed, which is initialized to the value decoded from for the first sample in the current SILK frame, and updated for each subsequent sample according to the procedure below. -Finally, let offset_Q25 be the quantization offset from +Finally, let offset_Q23 be the quantization offset from . Then the following procedure produces the final reconstructed excitation value, - e_Q25[i]: + e_Q23[i]:
When e_raw[i] is zero, sign() returns 0 by the definition in - , so the 80 term does not get added. - offset does not get added. -The final e_Q25[i] value may require more than 16 bits per sample, but will not - require more than 25, including the sign. + , so the factor of 20 does not get added. +The final e_Q23[i] value may require more than 16 bits per sample, but will not + require more than 23, including the sign.
@@ -4439,31 +4482,24 @@ The LTP filter requires LPC residual values from before the current subframe as However, since the LPCs may have changed, it obtains this residual by "rewhitening" the corresponding output signal using the LPCs from the current subframe. -Let e_Q25[i] be the excitation, and out[i] be the fully reconstructed output - signal from previous subframes (see ), or - zeros in the first subframe for this channel after either +Let out[i] for + (j - pitch_lags[s] - d_LPC - 2) <= i < j + be the fully reconstructed output signal from the last + (pitch_lags[s] + d_LPC + 2) samples of previous subframes + (see ), where pitch_lags[s] is the pitch + lag for the current subframe from . +During reconstruction of the first subframe for this channel after either -An uncoded regular SILK frame in the side channel, or -A decoder reset (see ). +An uncoded regular SILK frame (if this is the side channel), or +A decoder reset (see ), -
- - -Let LTP_scale_Q14 be the LTP scaling parameter from - for the first two subframes in any SILK - frame, as well as the last two subframes in a 20 ms SILK frame where - w_Q2 == 4. -Otherwise let LTP_scale_Q14 be 16384 (corresponding to 1.0). -Then, for i such that - (j - pitch_lags[s] - d_LPC - 2) <= i < j, - where pitch_lags[s] is the pitch lag for the current subframe from - , out[i] is rewhitened into an LPC residual, + out[] is rewhitened into an LPC residual, res[i], via
This requires storage to buffer up to 306 values of out[i] from previous subframes. -This corresponds to WB with a maximum of 18 ms * 16 kHz - samples of pitch lag, plus 2 samples for the width of the LTP filter, plus 16 - samples for d_LPC. +This corresponds to WB with a maximum pitch lag of + 18 ms * 16 kHz samples, plus 16 samples for d_LPC, plus 2 + samples for the width of the LTP filter. -Let b_Q7[k] be the coefficients of the LTP filter taken from the - codebook entry in one of +Let e_Q23[i] for j <= i < (j + n) be the + excitation for the current subframe, and b_Q7[k] for + 0 <= k < 5 be the coefficients of the LTP filter + taken from the codebook entry in one of Tables  through  corresponding to the index decoded for the current subframe in @@ -4490,11 +4528,11 @@ Then for i such that j <= i < (j + n), the LPC residual is
@@ -4505,9 +4543,9 @@ For unvoiced frames, the LPC residual for copy of the excitation signal, i.e.,
@@ -4518,11 +4556,12 @@ res[i] = ---------- LPC synthesis uses the short-term LPC filter to predict the next output coefficient. For i such that (j - d_LPC) <= i < j, let - lpc[i] be the result of LPC synthesis from the previous subframe, or zeros in - the first subframe for this channel after either + lpc[i] be the result of LPC synthesis from the last d_LPC samples of the + previous subframe, or zeros in the first subframe for this channel after + either -An uncoded regular SILK frame in the side channel, or -A decoder reset (see ). +An uncoded regular SILK frame (if this is the side channel), or +A decoder reset (see ). Then for i such that j <= i < (j + n), the result of LPC synthesis for the current subframe is @@ -4616,7 +4655,7 @@ Then for i such that j <= i < (j + n2), right[i] = clamp(-1.0, (1 - w1)*mid[i-1] - side[i-1] - w0*p0, 1.0) . ]]>
-These formulas require twp samples prior to index j, the start of the +These formulas require two samples prior to index j, the start of the frame, for the mid channel, and one prior sample for the side channel. For the first frame after a decoder reset, zeros are used instead.
@@ -4641,7 +4680,7 @@ However, a minimum amount of delay is imposed to allow the resampler to operate, and this delay is normative, so that the corresponding delay can be applied to the MDCT layer in the encoder. A decoder is always free to use a resampler which requires more delay than - allowed for here (e.g., to improve quality), but then it most delay the output + allowed for here (e.g., to improve quality), but it must then delay the output of the MDCT layer by this extra amount. Keeping as much delay as possible on the encoder side allows an encoder which knows it will never use any of the SILK or Hybrid modes to skip this delay. @@ -4653,27 +4692,42 @@ By contrast, if it were all applied by the decoder, then a decoder which gives the maximum resampler delay in samples at 48 kHz for each SILK audio bandwidth. -The reference implementation is able to resample to any of the supported - output sampling rates (8, 12, 16, 24, or 48 kHz) within or near this - delay constraint. Because the actual output rate may not be 48 kHz, it may not be possible to achieve exactly these delays while using a whole number of input or output samples. +The reference implementation is able to resample to any of the supported + output sampling rates (8, 12, 16, 24, or 48 kHz) within or near this + delay constraint. Some resampling filters (including those used by the reference implementation) - may add a delay that is not itself an exact integer at either rate. -However, such deviations are unlikely to be perceptible. + may add a delay that is not an exact integer, or is not linear-phase, and so + cannot be represented by a single delay at all frequencies. +However, such deviations are unlikely to be perceptible, and the comparison + tool described in is designed to be relatively + insensitive to them. The delays listed here are the ones that should be targeted by the encoder. 
Audio Bandwidth -Delay in Samples at 48 kHz -NB 18 -MB 32 -WB 24 +Delay in milliseconds +NB 0.538 +MB 0.692 +WB 0.706 + +NB is given a smaller decoder delay allocation than MB and WB to allow a + higher-order filter when resampling to 8 kHz in both the encoder and + decoder. +This implies that the audio content of two SILK frames operating at different + bandwidths is not perfectly aligned in time. +This is not an issue for any transitions described in + , because they all involve a SILK decoder reset. +When the decoder is reset, any samples remaining in the resampling buffer + are discarded, and the resampler is re-initialized with silence. + +
@@ -4699,9 +4753,9 @@ An overview of the decoder is given in . | ^ | +---------+ | | | | Range | | +----------+ v -| Decoder |-+ | Bit | +-----+ -+---------+ | |Allocation| | 2^x | - | +----------+ +-----+ +| Decoder |-+ | Bit | +------+ ++---------+ | |Allocation| | 2**x | + | +----------+ +------+ | | | | v v +--------+ | +---------+ +---+ +-------+ | pitch | @@ -4717,7 +4771,8 @@ An overview of the decoder is given in . The decoder is based on the following symbols and sets of symbols: - + Symbol(s) PDF Condition @@ -4742,7 +4797,6 @@ The decoder is based on the following symbols and sets of symbols: residual anti-collapse{1, 1}/2 finalize -Order of the symbols in the CELT section of the bitstream. @@ -4860,7 +4914,7 @@ Intra-band masking is the strongest of the perceptual masking effects. This stru means that the ideal allocation is more consistent from frame to frame than it is for other codecs without an equivalent structure. -Because the bit allocation is used to drive the decoding of the range-coder +Because the bit allocation drives the decoding of the range-coder stream, it MUST be recovered exactly so that identical coding decisions are made in the encoder and decoder. Any deviation from the reference's resulting bit allocation will result in corrupted output, though implementers are @@ -5010,7 +5064,7 @@ decode the trim value using the inverse CDF {127, 126, 124, 119, 109, 87, 41, 19 the allocation process, then one anti-collapse bit is reserved in the allocation process so it can be decoded later. Following the the anti-collapse reservation, one bit is reserved for skip if available. -For stereo frames, bits are reserved for intensity stereo and for dual stereo. Intensity stereo +For stereo frames, bits are reserved for intensity stereo and for dual stereo. Intensity stereo requires ilog2(end-start) bits. Those bits are reserved if there is enough bits left. Following this, one bit is reserved for dual stereo if available. 
@@ -5092,7 +5146,7 @@ and the whole balance are applied, respectively. Decoding of PVQ vectors is implemented in decode_pulses() (cwrs.c). The unique codeword index is decoded as a uniformly-distributed integer value between 0 and -V(N,K)-1, where V(N,K) is the number of possible combinations of K pulses in +V(N,K)-1, where V(N,K) is the number of possible combinations of K pulses in N samples. The index is then converted to a vector in the same way specified in . The indexing is based on the calculation of V(N,K) (denoted N(L,K) in ). @@ -5167,7 +5221,7 @@ R(x_N-2, X_N-1), ..., R(x_1, x_2). If the decoded vector represents more than one time block, then the following process is applied separately on each time block. -Also, if each block represents 8 samples or more, then another N-D rotation, by +Also, if each block represents 8 samples or more, then another N-D rotation, by (pi/2-theta), is applied before the rotation described above. This extra rotation is applied in an interleaved manner with a stride equal to round(sqrt(N/nb_blocks)) @@ -5193,13 +5247,14 @@ of stereo audio. The time-frequency (TF) parameters are used to control the time-frequency resolution tradeoff in each coded band. For each band, there are two possible TF choices. For the first band coded, the PDF is {3, 1}/4 for frames marked as transient and {15, 1}/16 for -the other frames. For subsequent bands, the TF choice is coded relative to the +the other frames. For subsequent bands, the TF choice is coded relative to the previous TF choice with probability {15, 1}/15 for transient frames and {31, 1}/32 otherwise. The mapping between the decoded TF choices and the adjustment in TF resolution is shown in the tables below. - + Frame size (ms) 0 1 @@ -5207,10 +5262,10 @@ resolution is shown in the tables below. 5 0 -1 10 0 -2 20 0 -2 -TF adjustments for non-transient frames and tf_select=0 - + Frame size (ms) 0 1 @@ -5218,11 +5273,11 @@ resolution is shown in the tables below. 
5 0 -2 10 0 -3 20 0 -3 -TF adjustments for non-transient frames and tf_select=1 - + Frame size (ms) 0 1 @@ -5230,10 +5285,10 @@ resolution is shown in the tables below. 5 1 0 10 2 0 20 3 0 -TF adjustments for transient frames and tf_select=0 - + Frame size (ms) 0 1 @@ -5241,7 +5296,6 @@ resolution is shown in the tables below. 5 1 -1 10 1 -1 20 1 -1 -TF adjustments for transient frames and tf_select=1 @@ -5250,9 +5304,9 @@ while a positive TF adjustment means that the frequency resolution is increased. Changes in TF resolution are implemented using the Hadamard transform. To increase the time resolution by N, N "levels" of the Hadamard transform are applied to the decoded vector for each interleaved MDCT vector. To increase the frequency resolution -(assumes a transient frame), then N levels of the Hadamard transform are applied +(assumes a transient frame), then N levels of the Hadamard transform are applied across the interleaved MDCT vector. In the case of increased -time resolution the decoder uses the "sequency order" because the input vector +time resolution the decoder uses the "sequency order" because the input vector is sorted in time.
@@ -5286,18 +5340,18 @@ multiplied by the square root of the decoded energy. This is done by denormalise The inverse MDCT implementation has no special characteristics. The input is N frequency-domain samples and the output is 2*N time-domain -samples, while scaling by 1/2. A "low-overlap" window is used to reduce the algorithmic delay. +samples, while scaling by 1/2. A "low-overlap" window reduces the algorithmic delay. It is derived from a basic (full overlap) 240-sample version of the window used by the Vorbis codec:
-The low-overlap window is created by zero-padding the basic window and inserting ones in the -middle, such that the resulting window still satisfies power complementarity. The IMDCT and +The low-overlap window is created by zero-padding the basic window and inserting ones in the +middle, such that the resulting window still satisfies power complementarity. The IMDCT and windowing are performed by mdct_backward (mdct.c).
@@ -5419,8 +5473,6 @@ periodic, and if so what the period is, using the OPUS_GET_PITCH() request.
- - Switching between the Opus coding modes, audio bandwidths, and channel counts requires careful consideration to avoid audible glitches. @@ -5446,7 +5498,7 @@ However, other transitions between SILK-only packets or between NB or MB SILK new sample rate. These switches SHOULD be delayed by the encoder until quiet periods or transients, where the inevitable glitches will be less audible. Additionally, - the bit-stream MAY include redundant side information ("redundancy"), in the + the bit-stream MAY include redundant side information ("redundancy"), in the form of additional CELT frames embedded in each of the Opus frames around the transition. @@ -5468,7 +5520,7 @@ To avoid or reduces glitches during these problematic mode transitions, and A transition between coding the lower frequencies with the LP model and the - MDCT model or a transition that involves changing the SILK bandwidth + MDCT model or a transition that involves changing the SILK bandwidth is only normatively specified when it includes redundancy. For those without redundancy, it is RECOMMENDED that the decoder use a concealment technique (e.g., make use of a PLC algorithm) to "fill in" the @@ -5618,7 +5670,6 @@ If the redundancy belongs at the beginning (in a CELT-only to SILK-only or Hybrid transition), the final reconstructed output uses the first 2.5 ms of audio output by the decoder for the redundant frame is as-is, discarding the corresponding output from the SILK-only or Hybrid portion of the frame. - The remaining 2.5 ms is cross-lapped with the decoded SILK/Hybrid signal using the CELT's power-complementary MDCT window to ensure a smooth transition. @@ -5661,8 +5712,8 @@ When switching from CELT-only mode to SILK-only or Hybrid mode with redundancy, illustrates all of the normative transitions involving a mode change, an audio bandwidth change, or both. -Each one uses an S, H, or C to represent an Opus frames in the corresponding - modes. 
+Each one uses an S, H, or C to represent an Opus frame in the corresponding + mode. In addition, an R indicates the presence of redundancy in the Opus frame it is cross-lapped with. Its location in the first or last 5 ms is assumed to correspond to whether @@ -5673,9 +5724,11 @@ Finally, a c indicates the contents of the CELT overlap buffer after the
S -> S ;S -> S -> S - & & +SILK to SILK with Redundancy: S -> S -> S + & !R -> R + & + ;S -> S -> S NB or MB SILK to Hybrid with Redundancy: S -> S -> S & @@ -5687,9 +5740,11 @@ SILK to CELT with Redundancy: S -> S -> S & !R -> C -> C -> C -Hybrid to NB or MB SILK with Redundancy: H -> H -> H ;S -> S -> S - & & +Hybrid to NB or MB SILK with Redundancy: H -> H -> H + & !R -> R + & + ;S -> S -> S Hybrid to WB SILK: H -> H -> H -> c \ + @@ -5759,6 +5814,7 @@ Key: S SILK-only frame ; SILK decoder reset H Hybrid frame | CELT and SILK decoder resets C CELT-only frame ! CELT decoder reset +c CELT overlap + Direct mixing P Packet Loss Concealment & Windowed cross-lap ]]>
@@ -5782,25 +5838,25 @@ Encoders SHOULD NOT use other transitions, e.g., those that involve redundancy Just like the decoder, the Opus encoder also normally consists of two main blocks: the SILK encoder and the CELT encoder. However, unlike the case of the decoder, a valid (though potentially suboptimal) Opus encoder is not required to support all modes and -may thus only include a SILK encoder module or a CELT encoder module. +may thus only include a SILK encoder module or a CELT encoder module. The output bit-stream of the Opus encoding contains bits from the SILK and CELT - encoders, though these are not separable due to the use of a range coder. + encoders, though these are not separable due to the use of a range coder. A block diagram of the encoder is illustrated below. -
+
| rate |--->|encoder|--+ - +-----------+ | |conversion| | | | - | Optional | | +----------+ +-------+ | +-------+ -->| high-pass |--+ +-->| Range | - + filter + | +------------+ +-------+ |encoder|----> - +-----------+ | | Delay | | CELT | +-->| | bit- - +->|compensation|->|encoder|--+ +-------+ stream - | | | | - +------------+ +-------+ + +------------+ +---------+ + | Sample | | SILK |------+ + +->| Rate |--->| Encoder | V + +-----------+ | | Conversion | | | +---------+ + | Optional | | +------------+ +---------+ | Range | +->| High-pass |--+ | Encoder |----> + + Filter + | +--------------+ +---------+ | | Bit- + +-----------+ | | Delay | | CELT | +---------+ stream + +->| Compensation |->| Encoder | ^ + | | | |------+ + +--------------+ +---------+ ]]>
@@ -5813,7 +5869,7 @@ In the reference implementation, the frame size is selected by the application, other configuration parameters (number of channels, bandwidth, mode) are automatically selected (unless explicitly overridden by the application) depend on the following: -Requested bit-rate +Requested bitrate Input sampling rate Type of signal (speech vs music) Frame size in use @@ -5822,150 +5878,277 @@ selected (unless explicitly overridden by the application) depend on the followi The type of signal currently needs to be provided by the application (though it can be changed in real-time). An Opus encoder implementation could also do automatic detection, but since Opus is an interactive codec, such an implementation would likely have to either -delay the signal (for non-interactive application) or delay the mode switching decisions (for +delay the signal (for non-interactive applications) or delay the mode switching decisions (for interactive applications). -When the encoder is configured for voice over IP applications, the input signal is +When the encoder is configured for voice over IP applications, the input signal is filtered by a high-pass filter to remove the lowest part of the spectrum that contains little speech energy and may contain background noise. This is a second order Auto Regressive Moving Average (ARMA) filter with a cut-off frequency around 50 Hz. -In the future, a music detector may also be used to lower the cut-off frequency when the +In the future, a music detector may also be used to lower the cut-off frequency when the input signal is detected to be music rather than speech. -
+
-The range coder also acts as the bit-packer for Opus. It is -used in three different ways, to encode: +The range coder acts as the bit-packer for Opus. +It is used in three different ways: to encode -entropy-coded symbols with a fixed probability model using ec_encode(), (entenc.c) -integers from 0 to 2**M-1 using ec_enc_uint() or ec_enc_bits(), (entenc.c) -integers from 0 to N-1 (where N is not a power of two) using ec_enc_uint(). (entenc.c) + +Entropy-coded symbols with a fixed probability model using ec_encode() + (entenc.c), + + +Integers from 0 to (2**M - 1) using ec_enc_uint() or ec_enc_bits() + (entenc.c), + +Integers from 0 to (ft - 1) (where ft is not a power of two) using + ec_enc_uint() (entenc.c). + -The range encoder maintains an internal state vector composed of the -four-tuple (low,rng,rem,ext) representing the low end of the current -range, the size of the current range, a single buffered output octet, -and a count of additional carry-propagating output octets. Both rng -and low are 32-bit unsigned integer values, rem is an octet value or -the special value -1, and ext is an integer with at least 16 bits. -This state vector is initialized at the start of each each frame to -the value (0,2**31,-1,0). The reference implementation re-uses the -'val' field of the entropy coder structure to hold low, in order to -allow the same structure to be used for encoding and decoding, but -we maintain the distinction here for clarity. +The range encoder maintains an internal state vector composed of the four-tuple + (val, rng, rem, ext) representing the low end of the current + range, the size of the current range, a single buffered output octet, and a + count of additional carry-propagating output octets. +Both val and rng are 32-bit unsigned integer values, rem is an octet value + less than 255 or the special value -1, and ext is an unsigned integer with at + least 11 bits.
+This state vector is initialized at the start of each frame to the value + (0, 2**31, -1, 0). +After encoding a sequence of symbols, the value of rng in the encoder should + exactly match the value of rng in the decoder after decoding the same sequence + of symbols. +This is a powerful tool for detecting errors in either an encoder or decoder + implementation. +The value of val, on the other hand, represents different things in the encoder + and decoder, and is not expected to match. + + +The decoder has no analog for rem and ext. +These are used to perform carry propagation in the renormalization loop below. +Each iteration of this loop produces 9 bits of output, consisting of 8 data + bits and a carry flag. +The encoder cannot determine the final value of the output octets until it + propagates these carry flags. +Therefore the reference implementation buffers a single non-propagating output + octet (i.e., one less than 255) in rem and keeps a count of additional + propagating (i.e., 255) output octets in ext. +An implementation may choose to use any mathematically equivalent scheme to + perform carry propagation.
- The main encoding function is ec_encode() (entenc.c), - which takes as an argument a three-tuple (fl,fh,ft) - describing the range of the symbol to be encoded in the current - context, with 0 <= fl < fh <= ft <= 65535. The values of this tuple - are derived from the probability model for the symbol. Let f(i) be - the frequency of the i'th symbol in the current context. Then the - three-tuple corresponding to the k'th symbol is given by - +The main encoding function is ec_encode() (entenc.c), which encodes symbol k in + the current context using the same three-tuple (fl[k], fh[k], ft) + as the decoder to describe the range of the symbol (see + ). - ec_encode() updates the state of the encoder as follows. If fl is - greater than zero, then low = low + rng - (rng/ft)*(ft-fl) and - rng = (rng/ft)*(fh-fl). Otherwise, low is unchanged and - rng = rng - (rng/ft)*(fh-fl). The divisions here are exact integer - division. After this update, the range is normalized. +ec_encode() updates the state of the encoder as follows. +If fl[k] is greater than zero, then +
+ +
+Otherwise, val is unchanged and +
+ +
+The divisions here are exact integer division. +
+ +
+ +After this update, the range is normalized using a procedure very similar to + that of , implemented by + ec_enc_normalize() (entenc.c). +The following process is repeated until rng > 2**23. +First, the top 9 bits of val, (val>>23), are sent to the carry buffer, + described in . +Then, the encoder sets +
+ +
+
+ +
- To normalize the range, the following process is repeated until - rng > 2**23. First, the top 9 bits of low, (low>>23), are placed into - a carry buffer. Then, low is set to . This process is carried out by - ec_enc_normalize() (entenc.c). +The function ec_enc_carry_out() (entenc.c) implements carry propagation and + output buffering. +It takes as input a 9-bit value, c, consisting of 8 data bits and an additional + carry bit. +If c is equal to the value 255, then ext is simply incremented, and no other + state updates are performed. +Otherwise, let b = (c>>8) be the carry bit. +Then, + + +If the buffered octet rem contains a value other than -1, the encoder outputs + the octet (rem + b). +Otherwise, if rem is -1, no octet is output. - The 9 bits produced in each iteration of the normalization loop - consist of 8 data bits and a carry flag. The final value of the - output bits is not determined until carry propagation is accounted - for. Therefore the reference implementation buffers a single - (non-propagating) output octet and keeps a count of additional - propagating (0xFF) output octets. An implementation may choose to use - any mathematically equivalent scheme to perform carry propagation. +If ext is non-zero, then the encoder outputs ext octets---all with a value of 0 + if b is set, or 255 if b is unset---and sets ext to 0. + + +rem is set to the 8 data bits: +
+ +
+
+
+
+ +
+ +
- The function ec_enc_carry_out() (entenc.c) performs - this buffering. It takes a 9-bit input value, c, from the normalization: - 8 bits of output and a carry bit. If c is 0xFF, then ext is incremented - and no octets are output. Otherwise, if rem is not the special value - -1, then the octet (rem+(c>>8)) is output. Then ext octets are output - with the value 0 if the carry bit is set, or 0xFF if it is not, and - rem is set to the lower 8 bits of c. After this, ext is set to zero. +The reference implementation uses three additional encoding methods that are + exactly equivalent to the above, but make assumptions and simplifications that + allow for a more efficient implementation. + +
- In the reference implementation, a special version of ec_encode() - called ec_encode_bin() (entenc.c) is defined to - take a two-tuple (fl,ftb), where , but avoids using division. +The first is ec_encode_bin() (entenc.c), defined using the parameter ftb + instead of ft. +It is mathematically equivalent to calling ec_encode() with + ft = (1<<ftb), but avoids using division. + +
+
+ +The next is ec_enc_bit_logp() (entenc.c), which encodes a single binary symbol. +The context is described by a single parameter, logp, which is the absolute + value of the base-2 logarithm of the probability of a "1". +It is mathematically equivalent to calling ec_encode() with the 3-tuple + (fl[k] = 0, fh[k] = (1<<logp) - 1, + ft = (1<<logp)) if k is 0 and with + (fl[k] = (1<<logp) - 1, + fh[k] = ft = (1<<logp)) if k is 1. +The implementation requires no multiplications or divisions.
+
+ +The last is ec_enc_icdf() (entenc.c), which encodes a single binary symbol with + a table-based context of up to 8 bits. +This uses the same icdf table as ec_dec_icdf() from + . +The function is mathematically equivalent to calling ec_encode() with + fl[k] = (1<<ftb) - icdf[k-1] (or 0 if + k == 0), fh[k] = (1<<ftb) - icdf[k], and + ft = (1<<ftb). +This only saves a few arithmetic operations over ec_encode_bin(), but allows + the encoder to use the same icdf tables as the decoder. + +
+ +
+
- The CELT layer also allows directly encoding a series of raw bits, outside - of the range coder, implemented in ec_enc_bits() (entenc.c). - The raw bits are packed at the end of the packet, starting by storing the - least significant bit of the value to be packed in the least significant bit - of the last byte, filling up to the most significant bit in - the last byte, and then continuing in the least significant bit of the - penultimate byte, and so on. - This packing may continue into the last byte output by the range coder, - though the format should render it impossible to overwrite any set bit - produced by the range coder when the procedure in - is followed to finalize the stream. +The raw bits used by the CELT layer are packed at the end of the buffer using + ec_enc_bits() (entenc.c). +Because the raw bits may continue into the last byte output by the range coder + if there is room in the low-order bits, the encoder must be prepared to merge + these values into a single octet. +The procedure in does this in a way that + ensures both the range coded data and the raw bits can be decoded + successfully.
- The function ec_enc_uint() is based on ec_encode() and encodes one of N - equiprobable symbols, each with a frequency of 1, where N may be as large as - 2**32-1. Because ec_encode() is limited to a total frequency of 2**16-1, this - is done by encoding a series of symbols in smaller contexts. +The function ec_enc_uint() (entenc.c) encodes one of ft equiprobable symbols in + the range 0 to (ft - 1), inclusive, each with a frequency of 1, + where ft may be as large as (2**32 - 1). +Like the decoder (see ), it splits up the + value into a range coded symbol representing up to 8 of the high bits, and, if + necessary, raw bits representing the remainder of the value. + + +ec_enc_uint() takes a two-tuple (t, ft), where t is the value to be + encoded, 0 <= t < ft, and ft is not necessarily a + power of two. +Let ftb = ilog(ft - 1), i.e., the number of bits required + to store (ft - 1) in two's complement notation. +If ftb is 8 or less, then t is encoded directly using ec_encode() with the + three-tuple (t, t + 1, ft). - ec_enc_uint() (entenc.c) takes a two-tuple (fl,ft), - where ft is not necessarily a power of two. Let ftb be the location - of the highest 1 bit in the two's-complement representation of - (ft-1), or -1 if no bits are set. If ftb>8, then the top 8 bits of fl - are encoded using ec_encode() with the three-tuple - (fl>>ftb-8,(fl>>ftb-8)+1,(ft-1>>ftb-8)+1), and the remaining bits - are encoded as raw bits. Otherwise, fl is encoded with ec_encode() directly - using the three-tuple (fl,fl+1,ft). +If ftb is greater than 8, then the top 8 bits of t are encoded using the + three-tuple (t>>(ftb - 8), + (t>>(ftb - 8)) + 1, + ((ft - 1)>>(ftb - 8)) + 1), and the + remaining bits, + (t & ((1<<(ftb - 8)) - 1)), + are encoded as raw bits with ec_enc_bits().
- After all symbols are encoded, the stream must be finalized by - outputting a value inside the current range. Let end be the integer - in the interval [low,low+rng) with the largest number of trailing - zero bits, b, such that end+(1<<b)-1 is also in the interval - [low,low+rng). Then while end is not zero, the top 9 bits of end, e.g., - >23), are sent to the carry buffer, and end is replaced by - (end<<8&0x7FFFFFFF). Finally, if the value in carry buffer, rem, is]]> - neither zero nor the special value -1, or the carry count, ext, is - greater than zero, then 9 zero bits are sent to the carry buffer. - After the carry buffer is finished outputting octets, the rest of the - output buffer (if any) is padded with zero bits, until it reaches the raw - bits. Finally, rem is set to the - special value -1. This process is implemented by ec_enc_done() - (entenc.c). +After all symbols are encoded, the stream must be finalized by outputting a + value inside the current range. +Let end be the integer in the interval [val, val + rng) with the + largest number of trailing zero bits, b, such that + (end + (1<<b) - 1) is also in the interval + [val, val + rng). +This choice of end allows the maximum number of trailing bits to be set to + arbitrary values while still ensuring the range coded part of the buffer can + be decoded correctly. +Then, while end is not zero, the top 9 bits of end, i.e., (end>>23), are + passed to the carry buffer in accordance with the procedure in + , and end is updated via +
+ +
+Finally, if the buffered output octet, rem, is neither zero nor the special + value -1, or the carry count, ext, is greater than zero, then 9 zero bits are + sent to the carry buffer to flush it to the output buffer. +When outputting the final byte from the range coder, if it would overlap any + raw bits already packed into the end of the output buffer, they should be ORed + into the same byte. +The bit allocation routines in the CELT layer should ensure that this can be + done without corrupting the range coder data so long as end is chosen as + described above. +If there is any space between the end of the range coder data and the end of + the raw bits, it is padded with zero bits. +This entire process is implemented by ec_enc_done() (entenc.c).
@@ -5989,30 +6172,29 @@ fl=sum(f(i),i - In many respects the SILK encoder mirrors the SILK decoder described - in . - Details such as the quantization and range coder tables can be found - there, while this section describes the high-level design choices that + In many respects the SILK encoder mirrors the SILK decoder described + in . + Details such as the quantization and range coder tables can be found + there, while this section describes the high-level design choices that were made. The diagram below shows the basic modules of the SILK encoder. -
+
| Rate |--->| Mixing |--->| Core |----------> - input |Conversion| | | | Encoder | bitstream - +----------+ +--------+ +---------+ + +----------+ +--------+ +---------+ + | Sample | | Stereo | | SILK | +------>| Rate |--->| Mixing |--->| Core |----------> +Input |Conversion| | | | Encoder | Bitstream + +----------+ +--------+ +---------+ ]]> -Silk Encoder.
The input signal's sampling rate is adjusted by a sample rate conversion -module so that it matches the SILK internal sampling rate. +module so that it matches the SILK internal sampling rate. The input to the sample rate converter is delayed by a number of samples depending on the sample rate ratio, such that the overall delay is constant for all input and output sample rates. @@ -6026,17 +6208,17 @@ It converts a stereo left/right signal into an adaptive mid/side representation. The first step is to compute non-adaptive mid/side signals as half the sum and difference between left and right signals. -The side signal is then minimized in energy by subtracting a +The side signal is then minimized in energy by subtracting a prediction of it based on the mid signal. This prediction works well when the left and right signals exhibit linear dependency, for instance for an amplitude-panned input signal. Like in the decoder, the prediction coefficients are linearly interpolated during the first 8 ms of the frame. - The mid signal is always encoded, whereas the residual + The mid signal is always encoded, whereas the residual side signal is only encoded if it has sufficient - energy compared to the mid signal's energy. - If it has not, + energy compared to the mid signal's energy. + If it has not, the "mid_only_flag" is set without encoding the side signal. @@ -6045,13 +6227,13 @@ the side signal is encoded. For each frame, two predictor coefficients are computed, one that predicts between low-passed mid and side channels, and one that predicts between high-passed mid and side channels. -The low-pass filter is a simple three-tap filter +The low-pass filter is a simple three-tap filter and creates a delay of one sample. The high-pass filtered signal is the difference between the mid signal delayed by one sample and the low-passed signal. 
Instead of explicitly computing the high-passed signal, it is computationally more efficient to transform -the prediction coefficients before applying them to the +the prediction coefficients before applying them to the filtered mid signal, as follows
@@ -6077,7 +6259,7 @@ For simplicity, the core encoder is referred to simply as the encoder in the remainder of this section. An overview of the encoder is given in . -
+
-Silk Core Encoder.
-The input signal is processed by a Voice Activity Detector (VAD) to produce -a measure of voice activity, spectral tilt, and signal-to-noise estimates for -each frame. The VAD uses a sequence of half-band filterbanks to split the -signal into four subbands: 0...Fs/16, Fs/16...Fs/8, Fs/8...Fs/4, and -Fs/4...Fs/2, where Fs is the sampling frequency (8, 12, 16, or 24 kHz). -The lowest subband, from 0 - Fs/16, is high-pass filtered with a first-order -moving average (MA) filter (with transfer function H(z) = 1-z**(-1)) to -reduce the energy at the lowest frequencies. For each frame, the signal -energy per subband is computed. -In each subband, a noise level estimator tracks the background noise level -and a Signal-to-Noise Ratio (SNR) value is computed as the logarithm of the -ratio of energy to noise level. -Using these intermediate variables, the following parameters are calculated +The input signal is processed by a Voice Activity Detector (VAD) to produce +a measure of voice activity, spectral tilt, and signal-to-noise estimates for +each frame. The VAD uses a sequence of half-band filterbanks to split the +signal into four subbands: 0...Fs/16, Fs/16...Fs/8, Fs/8...Fs/4, and +Fs/4...Fs/2, where Fs is the sampling frequency (8, 12, 16, or 24 kHz). +The lowest subband, from 0 - Fs/16, is high-pass filtered with a first-order +moving average (MA) filter (with transfer function H(z) = 1-z**(-1)) to +reduce the energy at the lowest frequencies. For each frame, the signal +energy per subband is computed. +In each subband, a noise level estimator tracks the background noise level +and a Signal-to-Noise Ratio (SNR) value is computed as the logarithm of the +ratio of energy to noise level. +Using these intermediate variables, the following parameters are calculated for use in other SILK modules: @@ -6165,12 +6346,12 @@ Smoothed subband SNRs. Temporally smoothed subband SNR values. -Speech activity level. 
Based on the average SNR and a weighted average of the +Speech activity level. Based on the average SNR and a weighted average of the subband energies. -Spectral tilt. A weighted average of the subband SNRs, with positive weights +Spectral tilt. A weighted average of the subband SNRs, with positive weights for the low subbands and negative weights for the high subbands. @@ -6179,9 +6360,10 @@ for the low subbands and negative weights for the high subbands.
-The input signal is processed by the open loop pitch estimator shown in +The input signal is processed by the open loop pitch estimator shown in . -
+
-Block diagram of the pitch estimator.
-The pitch analysis finds a binary voiced/unvoiced classification, and, for -frames classified as voiced, four pitch lags per frame - one for each -5 ms subframe - and a pitch correlation indicating the periodicity of -the signal. -The input is first whitened using a Linear Prediction (LP) whitening filter, -where the coefficients are computed through standard Linear Prediction Coding -(LPC) analysis. The order of the whitening filter is 16 for best results, but -is reduced to 12 for medium complexity and 8 for low complexity modes. -The whitened signal is analyzed to find pitch lags for which the time -correlation is high. +The pitch analysis finds a binary voiced/unvoiced classification, and, for +frames classified as voiced, four pitch lags per frame - one for each +5 ms subframe - and a pitch correlation indicating the periodicity of +the signal. +The input is first whitened using a Linear Prediction (LP) whitening filter, +where the coefficients are computed through standard Linear Prediction Coding +(LPC) analysis. The order of the whitening filter is 16 for best results, but +is reduced to 12 for medium complexity and 8 for low complexity modes. +The whitened signal is analyzed to find pitch lags for which the time +correlation is high. The analysis consists of three stages for reducing the complexity: -In the first stage, the whitened signal is downsampled to 4 kHz -(from 8 kHz) and the current frame is correlated to a signal delayed -by a range of lags, starting from a shortest lag corresponding to +In the first stage, the whitened signal is downsampled to 4 kHz +(from 8 kHz) and the current frame is correlated to a signal delayed +by a range of lags, starting from a shortest lag corresponding to 500 Hz, to a longest lag corresponding to 56 Hz. 
-The second stage operates on an 8 kHz signal (downsampled from 12, 16, -or 24 kHz) and measures time correlations only near the lags -corresponding to those that had sufficiently high correlations in the first -stage. The resulting correlations are adjusted for a small bias towards -short lags to avoid ending up with a multiple of the true pitch lag. +The second stage operates on an 8 kHz signal (downsampled from 12, 16, +or 24 kHz) and measures time correlations only near the lags +corresponding to those that had sufficiently high correlations in the first +stage. The resulting correlations are adjusted for a small bias towards +short lags to avoid ending up with a multiple of the true pitch lag. The highest adjusted correlation is compared to a threshold depending on: @@ -6250,13 +6431,13 @@ The speech activity level The spectral tilt. -If the threshold is exceeded, the current frame is classified as voiced and -the lag with the highest adjusted correlation is stored for a final pitch +If the threshold is exceeded, the current frame is classified as voiced and +the lag with the highest adjusted correlation is stored for a final pitch analysis of the highest precision in the third stage. -The last stage operates directly on the whitened input signal to compute time -correlations for each of the four subframes independently in a narrow range +The last stage operates directly on the whitened input signal to compute time +correlations for each of the four subframes independently in a narrow range around the lag with highest correlation from the second stage. @@ -6265,44 +6446,45 @@ around the lag with highest correlation from the second stage.
-The noise shaping analysis finds gains and filter coefficients used in the -prefilter and noise shaping quantizer. These parameters are chosen such that +The noise shaping analysis finds gains and filter coefficients used in the +prefilter and noise shaping quantizer. These parameters are chosen such that they will fulfill several requirements: -Balancing quantization noise and bitrate. -The quantization gains determine the step size between reconstruction levels -of the excitation signal. Therefore, increasing the quantization gain -amplifies quantization noise, but also reduces the bitrate by lowering +Balancing quantization noise and bitrate. +The quantization gains determine the step size between reconstruction levels +of the excitation signal. Therefore, increasing the quantization gain +amplifies quantization noise, but also reduces the bitrate by lowering the entropy of the quantization indices. -Spectral shaping of the quantization noise; the noise shaping quantizer is -capable of reducing quantization noise in some parts of the spectrum at the -cost of increased noise in other parts without substantially changing the -bitrate. -By shaping the noise such that it follows the signal spectrum, it becomes -less audible. In practice, best results are obtained by making the shape +Spectral shaping of the quantization noise; the noise shaping quantizer is +capable of reducing quantization noise in some parts of the spectrum at the +cost of increased noise in other parts without substantially changing the +bitrate. +By shaping the noise such that it follows the signal spectrum, it becomes +less audible. In practice, best results are obtained by making the shape of the noise spectrum slightly flatter than the signal spectrum. 
-De-emphasizing spectral valleys; by using different coefficients in the -analysis and synthesis part of the prefilter and noise shaping quantizer, -the levels of the spectral valleys can be decreased relative to the levels -of the spectral peaks such as speech formants and harmonics. -This reduces the entropy of the signal, which is the difference between the +De-emphasizing spectral valleys; by using different coefficients in the +analysis and synthesis part of the prefilter and noise shaping quantizer, +the levels of the spectral valleys can be decreased relative to the levels +of the spectral peaks such as speech formants and harmonics. +This reduces the entropy of the signal, which is the difference between the coded signal and the quantization noise, thus lowering the bitrate. -Matching the levels of the decoded speech formants to the levels of the -original speech formants; an adjustment gain and a first order tilt -coefficient are computed to compensate for the effect of the noise +Matching the levels of the decoded speech formants to the levels of the +original speech formants; an adjustment gain and a first order tilt +coefficient are computed to compensate for the effect of the noise shaping quantization on the level and spectral tilt. -
+
-Noise shaping and spectral de-emphasis illustration.
- shows an example of an -input signal spectrum (1). -After de-emphasis and level matching, the spectrum has deeper valleys (2). -The quantization noise spectrum (3) more or less follows the input signal -spectrum, while having slightly less pronounced peaks. -The entropy, which provides a lower bound on the bitrate for encoding the -excitation signal, is proportional to the area between the de-emphasized -spectrum (2) and the quantization noise spectrum (3). Without de-emphasis, -the entropy is proportional to the area between input spectrum (1) and + shows an example of an +input signal spectrum (1). +After de-emphasis and level matching, the spectrum has deeper valleys (2). +The quantization noise spectrum (3) more or less follows the input signal +spectrum, while having slightly less pronounced peaks. +The entropy, which provides a lower bound on the bitrate for encoding the +excitation signal, is proportional to the area between the de-emphasized +spectrum (2) and the quantization noise spectrum (3). Without de-emphasis, +the entropy is proportional to the area between input spectrum (1) and quantization noise (3) - clearly higher. -The transformation from input signal to de-emphasized signal can be +The transformation from input signal to de-emphasized signal can be described as a filtering operation with a filter
@@ -6365,9 +6546,9 @@ $W_{ana}(z) = \left(1 - \sum_{k=1}^{16} a_{ana}(k)\, z^{-k}\right) \left(1 - z^{-L} \sum_{k=-d}^{d} b_{ana}(k)\, z^{-k}\right)$, ]]>
-is the analysis part of the de-emphasis filter, consisting of the short-term -shaping filter with coefficients a_ana(k), and the long-term shaping filter -with coefficients b_ana(k) and pitch lag L. +is the analysis part of the de-emphasis filter, consisting of the short-term +shaping filter with coefficients a_ana(k), and the long-term shaping filter +with coefficients b_ana(k) and pitch lag L. The parameter d determines the number of long-term shaping filter taps.
@@ -6386,19 +6567,19 @@ $W_{syn}(z) = \left(1 - \sum_{k=1}^{16} a_{syn}(k)\, z^{-k}\right) \left(1 - z^{-L} \sum_{k=-d}^{d} b_{syn}(k)\, z^{-k}\right)$.
-All noise shaping parameters are computed and applied per subframe of 5 ms. -First, an LPC analysis is performed on a windowed signal block of 15 ms. -The signal block has a look-ahead of 5 ms relative to the current subframe, -and the window is an asymmetric sine window. The LPC analysis is done with the +All noise shaping parameters are computed and applied per subframe of 5 ms. +First, an LPC analysis is performed on a windowed signal block of 15 ms. +The signal block has a look-ahead of 5 ms relative to the current subframe, +and the window is an asymmetric sine window. The LPC analysis is done with the autocorrelation method, with an order of between 8, in lowest-complexity mode, -and 16, for best quality. +and 16, for best quality. Optionally the LPC analysis and noise shaping filters are warped by replacing the delay elements by first-order allpass filters. -This increases the frequency resolution at low frequencies and reduces it at +This increases the frequency resolution at low frequencies and reduces it at high ones, which better matches the human auditory system and improves -quality. +quality. The warped analysis and filtering comes at a cost in complexity and is therefore only done in higher complexity modes. @@ -6408,10 +6589,10 @@ from the LPC analysis and multiplying it by a value inversely proportional to the coding quality control parameter and the pitch correlation. -Next the two sets of short-term noise shaping coefficients a_ana(k) and -a_syn(k) are obtained by applying different amounts of bandwidth expansion to the -coefficients found in the LPC analysis. -This bandwidth expansion moves the roots of the LPC polynomial towards the +Next the two sets of short-term noise shaping coefficients a_ana(k) and +a_syn(k) are obtained by applying different amounts of bandwidth expansion to the +coefficients found in the LPC analysis. +This bandwidth expansion moves the roots of the LPC polynomial towards the origin, using the formulas
@@ -6424,7 +6605,7 @@ origin, using the formulas ]]>
-where a(k) is the k'th LPC coefficient, and the bandwidth expansion factors +where a(k) is the k'th LPC coefficient, and the bandwidth expansion factors g_ana and g_syn are calculated as
@@ -6435,13 +6616,13 @@ g_syn = 0.95 + 0.01*C, ]]>
-where C is the coding quality control parameter between 0 and 1. -Applying more bandwidth expansion to the analysis part than to the synthesis +where C is the coding quality control parameter between 0 and 1. +Applying more bandwidth expansion to the analysis part than to the synthesis part gives the desired de-emphasis of spectral valleys in between formants.
-The long-term shaping is applied only during voiced frames. +The long-term shaping is applied only during voiced frames. It uses three filter taps, described by
@@ -6452,11 +6633,11 @@ b_syn = F_syn * [0.25, 0.5, 0.25]. ]]>
-For unvoiced frames these coefficients are set to 0. The multiplication factors -F_ana and F_syn are chosen between 0 and 1, depending on the coding quality -control parameter, as well as the calculated pitch correlation and smoothed -subband SNR of the lowest subband. By having F_ana less than F_syn, -the pitch harmonics are emphasized relative to the valleys in between the +For unvoiced frames these coefficients are set to 0. The multiplication factors +F_ana and F_syn are chosen between 0 and 1, depending on the coding quality +control parameter, as well as the calculated pitch correlation and smoothed +subband SNR of the lowest subband. By having F_ana less than F_syn, +the pitch harmonics are emphasized relative to the valleys in between the harmonics.
@@ -6465,7 +6646,7 @@ The tilt coefficient c_tilt is for unvoiced frames chosen as
@@ -6480,15 +6661,15 @@ c_tilt = 0.25 + 0.2625 * V for voiced frames, where V is the voice activity level between 0 and 1. -The adjustment gain G serves to correct any level mismatch between the original -and decoded signals that might arise from the noise shaping and de-emphasis. -This gain is computed as the ratio of the prediction gain of the short-term -analysis and synthesis filter coefficients. The prediction gain of an LPC -synthesis filter is the square root of the output energy when the filter is -excited by a unit-energy impulse on the input. -An efficient way to compute the prediction gain is by first computing the -reflection coefficients from the LPC coefficients through the step-down -algorithm, and extracting the prediction gain from the reflection coefficients +The adjustment gain G serves to correct any level mismatch between the original +and decoded signals that might arise from the noise shaping and de-emphasis. +This gain is computed as the ratio of the prediction gain of the short-term +analysis and synthesis filter coefficients. The prediction gain of an LPC +synthesis filter is the square root of the output energy when the filter is +excited by a unit-energy impulse on the input. +An efficient way to compute the prediction gain is by first computing the +reflection coefficients from the LPC coefficients through the step-down +algorithm, and extracting the prediction gain from the reflection coefficients as
@@ -6504,22 +6685,22 @@ where r_k is the k'th reflection coefficient. -Initial values for the quantization gains are computed as the square-root of -the residual energy of the LPC analysis, adjusted by the coding quality control -parameter. -These quantization gains are later adjusted based on the results of the +Initial values for the quantization gains are computed as the square-root of +the residual energy of the LPC analysis, adjusted by the coding quality control +parameter. +These quantization gains are later adjusted based on the results of the prediction analysis.
-The prediction analysis is performed in one of two ways depending on how -the pitch estimator classified the frame. -The processing for voiced and unvoiced speech is described in - and - , respectively. - Inputs to this function include the pre-whitened signal from the +The prediction analysis is performed in one of two ways depending on how +the pitch estimator classified the frame. +The processing for voiced and unvoiced speech is described in + and + , respectively. + Inputs to this function include the pre-whitened signal from the pitch estimator (see ). @@ -6538,58 +6719,58 @@ The processing for voiced and unvoiced speech is described in This LTP residual signal is the input to an LPC analysis where the LPCs are estimated using Burg's method, such that the residual energy is minimized. The estimated LPCs are converted to a Line Spectral Frequency (LSF) vector - and quantized as described in . -After quantization, the quantized LSF vector is converted back to LPC -coefficients using the full procedure in . -By using quantized LTP coefficients and LPC coefficients derived from the -quantized LSF coefficients, the encoder remains fully synchronized with the -decoder. -The quantized LPC and LTP coefficients are also used to filter the input + and quantized as described in . +After quantization, the quantized LSF vector is converted back to LPC +coefficients using the full procedure in . +By using quantized LTP coefficients and LPC coefficients derived from the +quantized LSF coefficients, the encoder remains fully synchronized with the +decoder. +The quantized LPC and LTP coefficients are also used to filter the input signal and measure residual energy for each of the four subframes.
-For a speech signal that has been classified as unvoiced, there is no need -for LTP filtering, as it has already been determined that the pre-whitened -input signal is not periodic enough within the allowed pitch period range -for LTP analysis to be worth the cost in terms of complexity and bitrate. -The pre-whitened input signal is therefore discarded, and instead the input -signal is used for LPC analysis using Burg's method. -The resulting LPC coefficients are converted to an LSF vector and quantized -as described in the following section. -They are then transformed back to obtain quantized LPC coefficients, which -are then used to filter the input signal and measure residual energy for +For a speech signal that has been classified as unvoiced, there is no need +for LTP filtering, as it has already been determined that the pre-whitened +input signal is not periodic enough within the allowed pitch period range +for LTP analysis to be worth the cost in terms of complexity and bitrate. +The pre-whitened input signal is therefore discarded, and instead the input +signal is used for LPC analysis using Burg's method. +The resulting LPC coefficients are converted to an LSF vector and quantized +as described in the following section. +They are then transformed back to obtain quantized LPC coefficients, which +are then used to filter the input signal and measure residual energy for each of the four subframes.
The main purpose of LPC coding in SILK is to reduce the bitrate by minimizing the residual energy. -At least at high bitrates, perceptual aspects are handled +At least at high bitrates, perceptual aspects are handled independently by the noise shaping filter. Burg's method is used because it provides higher prediction gain than the autocorrelation method and, unlike the covariance method, produces stable filters (assuming numerical errors don't spoil -that). SILK's implementation of Burg's method is also computationally +that). SILK's implementation of Burg's method is also computationally faster than the autocovariance method. -The implementation of Burg's method differs from traditional +The implementation of Burg's method differs from traditional implementations in two aspects. -The first difference is that it -operates on autocorrelations, similar to the Schur algorithm, but +The first difference is that it +operates on autocorrelations, similar to the Schur algorithm, but with a simple update to the autocorrelations after finding each reflection coefficient to make the result identical to Burg's method. -This brings down the complexity of Burg's method to near that of +This brings down the complexity of Burg's method to near that of the autocorrelation method. The second difference is that the signal in each subframe is scaled -by the inverse of the residual quantization step size. Subframes with -a small quantization step size will on average spend more bits for a -given amount of residual energy than subframes with a large step size. -Without scaling, Burg's method minimizes the total residual energy in -all subframes, which doesn't necessarily minimize the total number of -bits needed for coding the quantized residual. The residual energy +by the inverse of the residual quantization step size. Subframes with +a small quantization step size will on average spend more bits for a +given amount of residual energy than subframes with a large step size. 
+Without scaling, Burg's method minimizes the total residual energy in +all subframes, which doesn't necessarily minimize the total number of +bits needed for coding the quantized residual. The residual energy of the scaled subframes is a better measure for that number of -bits. +bits.
@@ -6597,14 +6778,14 @@ bits.
-Unlike many other speech codecs, SILK uses variable bitrate coding +Unlike many other speech codecs, SILK uses variable bitrate coding for the LSFs. This improves the average rate-distortion tradeoff and reduces outliers. The variable bitrate coding minimizes a linear combination of the weighted quantization errors and the bitrate. The weights for the quantization errors are the Inverse Harmonic Mean Weighting (IHMW) function proposed by Laroia et al. -(see ). +(see ). These weights are referred to here as Laroia weights. @@ -6612,7 +6793,7 @@ The LSF quantizer consists of two stages. The first stage is an (unweighted) vector quantizer (VQ), with a codebook size of 32 vectors. The quantization errors for the codebook vector are sorted, and -for the N best vectors a second stage quantizer is run. +for the N best vectors a second stage quantizer is run. By varying the number N a tradeoff is made between R/D performance and computational efficiency. For each of the N codebook vectors the Laroia weights corresponding @@ -6622,7 +6803,7 @@ vector is scaled by the square roots of these Laroia weights. This scaling partially normalizes error sensitivity for the residual vector, so that a uniform quantizer with fixed step sizes can be used in the second stage without too much -performance loss. +performance loss. And by scaling with Laroia weights determined from the first-stage codebook vector, the process can be reversed in the decoder. @@ -6651,38 +6832,37 @@ better in the reverse direction. The quantization index of the first stage is entropy coded. The quantization sequence from the second stage is also entropy coded, where for each element the probability table is chosen -depending on the vector index from the first and the location +depending on the vector index from the first stage and the location of that element in the LSF vector. - +
-If the input is stable, finding the best candidate usually results in a -quantized vector that is also stable. Because of the two-stage approach, -however, it is possible that the best quantization candidate is unstable. -Therefore we apply an LSF stabilization method which ensures that the LSF -parameters are within their valid range, increasingly sorted, and have minimum -distances between each other and the border values that have been -predetermined as the 0.01 percentile distance values from a large -training set. +If the input is stable, finding the best candidate usually results in a +quantized vector that is also stable. Because of the two-stage approach, +however, it is possible that the best quantization candidate is unstable. +The encoder applies the same stabilization procedure applied by the decoder + (see the decoder's LSF stabilization section) to ensure the LSF parameters + are within their valid range, increasingly sorted, and have minimum + distances between each other and the border values.
-For voiced frames, the prediction analysis described in - resulted in four sets -(one set per subframe) of five LTP coefficients, plus four weighting matrices. -The LTP coefficients for each subframe are quantized using entropy constrained -vector quantization. -A total of three vector codebooks are available for quantization, with -different rate-distortion trade-offs. The three codebooks have 10, 20, and -40 vectors and average rates of about 3, 4, and 5 bits per vector, respectively. -Consequently, the first codebook has larger average quantization distortion at -a lower rate, whereas the last codebook has smaller average quantization -distortion at a higher rate. -Given the weighting matrix W_ltp and LTP vector b, the weighted rate-distortion +For voiced frames, the prediction analysis described in + resulted in four sets +(one set per subframe) of five LTP coefficients, plus four weighting matrices. +The LTP coefficients for each subframe are quantized using entropy constrained +vector quantization. +A total of three vector codebooks are available for quantization, with +different rate-distortion trade-offs. The three codebooks have 10, 20, and +40 vectors and average rates of about 3, 4, and 5 bits per vector, respectively. +Consequently, the first codebook has larger average quantization distortion at +a lower rate, whereas the last codebook has smaller average quantization +distortion at a higher rate. +Given the weighting matrix W_ltp and LTP vector b, the weighted rate-distortion measure for a codebook vector cb_i with rate r_i is given by
@@ -6691,35 +6871,35 @@ measure for a codebook vector cb_i with rate r_i is give by ]]>
-where u is a fixed, heuristically-determined parameter balancing the distortion -and rate. -Which codebook gives the best performance for a given LTP vector depends on the -weighting matrix for that LTP vector. -For example, for a low valued W_ltp, it is advantageous to use the codebook -with 10 vectors as it has a lower average rate. -For a large W_ltp, on the other hand, it is often better to use the codebook +where u is a fixed, heuristically-determined parameter balancing the distortion +and rate. +Which codebook gives the best performance for a given LTP vector depends on the +weighting matrix for that LTP vector. +For example, for a low valued W_ltp, it is advantageous to use the codebook +with 10 vectors as it has a lower average rate. +For a large W_ltp, on the other hand, it is often better to use the codebook with 40 vectors, as it is more likely to contain the best codebook vector. -The weighting matrix W_ltp depends mostly on two aspects of the input signal. -The first is the periodicity of the signal; the more periodic, the larger W_ltp. -The second is the change in signal energy in the current subframe, relative to -the signal one pitch lag earlier. -A decaying energy leads to a larger W_ltp than an increasing energy. -Both aspects fluctuate relatively slowly, which causes the W_ltp matrices for -different subframes of one frame often to be similar. -Because of this, one of the three codebooks typically gives good performance -for all subframes, and therefore the codebook search for the subframe LTP -vectors is constrained to only allow codebook vectors to be chosen from the +The weighting matrix W_ltp depends mostly on two aspects of the input signal. +The first is the periodicity of the signal; the more periodic, the larger W_ltp. +The second is the change in signal energy in the current subframe, relative to +the signal one pitch lag earlier. +A decaying energy leads to a larger W_ltp than an increasing energy. 
+Both aspects fluctuate relatively slowly, which causes the W_ltp matrices for +different subframes of one frame often to be similar. +Because of this, one of the three codebooks typically gives good performance +for all subframes, and therefore the codebook search for the subframe LTP +vectors is constrained to only allow codebook vectors to be chosen from the same codebook, resulting in a rate reduction.
-To find the best codebook, each of the three vector codebooks is -used to quantize all subframe LTP vectors and produce a combined -weighted rate-distortion measure for each vector codebook. -The vector codebook with the lowest combined rate-distortion -over all subframes is chosen. The quantized LTP vectors are used -in the noise shaping quantizer, and the index of the codebook -plus the four indices for the four subframe codebook vectors +To find the best codebook, each of the three vector codebooks is +used to quantize all subframe LTP vectors and produce a combined +weighted rate-distortion measure for each vector codebook. +The vector codebook with the lowest combined rate-distortion +over all subframes is chosen. The quantized LTP vectors are used +in the noise shaping quantizer, and the index of the codebook +plus the four indices for the four subframe codebook vectors are passed on to the range encoder.
@@ -6733,33 +6913,33 @@ By applying only the noise shaping analysis filter to the input signal, it provides the input to the noise shaping quantizer.
- +
-The noise shaping quantizer independently shapes the signal and coding noise +The noise shaping quantizer independently shapes the signal and coding noise spectra to obtain a perceptually higher quality at the same bitrate. -The prefilter output signal is multiplied with a compensation gain G computed -in the noise shaping analysis. Then the output of a synthesis shaping filter -is added, and the output of a prediction filter is subtracted to create a -residual signal. -The residual signal is multiplied by the inverse quantized quantization gain -from the noise shaping analysis, and input to a scalar quantizer. -The quantization indices of the scalar quantizer represent a signal of pulses -that is input to the pyramid range encoder. -The scalar quantizer also outputs a quantization signal, which is multiplied -by the quantized quantization gain from the noise shaping analysis to create -an excitation signal. -The output of the prediction filter is added to the excitation signal to form -the quantized output signal y(n). -The quantized output signal y(n) is input to the synthesis shaping and +The prefilter output signal is multiplied with a compensation gain G computed +in the noise shaping analysis. Then the output of a synthesis shaping filter +is added, and the output of a prediction filter is subtracted to create a +residual signal. +The residual signal is multiplied by the inverse quantized quantization gain +from the noise shaping analysis, and input to a scalar quantizer. +The quantization indices of the scalar quantizer represent a signal of pulses +that is input to the pyramid range encoder. +The scalar quantizer also outputs a quantization signal, which is multiplied +by the quantized quantization gain from the noise shaping analysis to create +an excitation signal. +The output of the prediction filter is added to the excitation signal to form +the quantized output signal y(n). 
+The quantized output signal y(n) is input to the synthesis shaping and prediction filters. Optionally the noise shaping quantizer operates in a delayed decision -mode. -In this mode it uses a Viterbi algorithm to keep track of +mode. +In this mode it uses a Viterbi algorithm to keep track of multiple rounding choices in the quantizer and select the best one after a delay of 32 samples. This improves the rate/distortion performance of the quantizer. @@ -6774,14 +6954,12 @@ performance of the quantizer. no more than the allowed number of bits. The Opus wrapper code then pads the bitstream if any unused bits are left in SILK mode, or encodes the high band with the remaining number of bits in Hybrid mode. - If SILK is unable to encode the packet with less than the allowed number - of bits, the Opus encoder temporarily codes the signal in CELT mode instead. The number of payload bits is adjusted by changing the quantization gains and the rate/distortion tradeoff in the noise - shaping quantizer, in an iterateve loop + shaping quantizer, in an iterative loop around the noise shaping quantizer and entropy coding. - Compared to the SILK VBR mode, the CBR mode has lower - audio quality at a given average bitrate, and also has higher + Compared to the SILK VBR mode, the CBR mode has lower + audio quality at a given average bitrate, and also has higher computational complexity.
@@ -6793,23 +6971,23 @@ performance of the quantizer.
-Most of the aspects of the CELT encoder can be directly derived from the description +Most of the aspects of the CELT encoder can be directly derived from the description of the decoder. For example, the filters and rotations in the encoder are simply the inverse of the operation performed by the decoder. Similarly, the quantizers generally optimize for the mean square error (because noise shaping is part of the bit-stream itself), -so no special search is required. For this reason, only the less straightforward aspects of the +so no special search is required. For this reason, only the less straightforward aspects of the encoder are described here.
-The pitch prefilter is applied after the pre-emphasis. It is applied +The pitch prefilter is applied after the pre-emphasis. It is applied in such a way as to be the inverse of the decoder's post-filter. The main non-obvious aspect of the -prefilter is the selection of the pitch period. The pitch search should be optimised for the +prefilter is the selection of the pitch period. The pitch search should be optimized for the following criteria: continuity: it is important that the pitch period does not change abruptly between frames; and -avoidance of pitch multiples: when the period used is a multiple of the real period +avoidance of pitch multiples: when the period used is a multiple of the real period (lower frequency fundamental), the post-filter loses most of its ability to reduce noise @@ -6831,41 +7009,41 @@ and normalise_bands() (bands.c), respectively. Energy quantization (both coarse and fine) can be easily understood from the decoding process. -For all useful bitrates, the coarse quantizer always chooses the quantized log energy value that +For all useful bitrates, the coarse quantizer always chooses the quantized log energy value that minimizes the error for each band. Only at very low rate does the encoder allow larger errors to minimize the rate and avoid using more bits than are available. When the available CPU requirements allow it, it is best to try encoding the coarse energy both with and without inter-frame prediction such that the best prediction mode can be selected. The optimal mode depends on -the coding rate, the available bit-rate, and the current rate of packet loss. +the coding rate, the available bitrate, and the current rate of packet loss. 
-The fine energy quantizer always chooses the quantized log energy value that +The fine energy quantizer always chooses the quantized log energy value that minimizes the error for each band because the rate of the fine quantization depends only -on the bit allocation and not on the values that are coded. +on the bit allocation and not on the values that are coded.
-
+
The encoder must use exactly the same bit allocation process as used by the decoder and described in . The three mechanisms that can be used by the -encoder to adjust the bit-rate on a frame-by-frame basis are band boost, allocation trim, +encoder to adjust the bitrate on a frame-by-frame basis are band boost, allocation trim, and band skipping. -
+
The reference encoder makes a decision to boost a band when the energy of that band is significantly higher than that of the neighboring bands. Let E_j be the log-energy of band j, we define D_j = 2*E_j - E_j-1 - E_j+1 -The allocation of band j is boosted once if D_j > t1 and twice if D_j > t2. For LM>=1, t1=2 and t2=4, +The allocation of band j is boosted once if D_j > t1 and twice if D_j > t2. For LM>=1, t1=2 and t2=4, while for LM<1, t1=3 and t2=5.
-
+
The allocation trim is a value between 0 and 10 (inclusively) that controls the allocation balance between the low and high frequencies. The encoder starts with a safe "default" of 5 and deviates from that default in two different ways. First the trim can deviate by +/- 2 @@ -6877,7 +7055,7 @@ be decreased by up to 4 when the inter-channel correlation at low frequency (fir is high.
-
+
The encoder uses band skipping to ensure that the shape of the bands is only coded if there is at least 1/2 bit per sample available for the PVQ. If not, then no bit is allocated and folding is used instead. To ensure continuity in the allocation, some amount of hysteresis is @@ -6888,7 +7066,7 @@ previous frames needs at least 9/16 bit/sample to be coded.
-
+
Because CELT applies mid-side stereo coupling in the normalized domain, it does not suffer from important stereo image problems even when the two channels are completely uncorrelated. For this reason it is always safe to use stereo coupling on any audio frame. That being said, there are some frames @@ -6913,7 +7091,8 @@ taking into account the frame size by subtracting 80 bits per frame for coarse e band using intensity coding is as follows: - + bitrate (kb/s) start band <35 8 @@ -6923,7 +7102,6 @@ band using intensity coding is as follows: 84-102 19 102-130 20 >130 disabled -Thresholds for intensity stereo @@ -6946,7 +7124,7 @@ See tf_analysis() in celt/celt.c. The choice of the spreading value in has an impact on the nature of the coding noise introduced by CELT. The larger the f_r value, the lower the impact of the rotation, and the more tonal the coding noise. The -more tonal the signal, the more tonal the noise should be, so the CELT encoder determines +more tonal the signal, the more tonal the noise should be, so the CELT encoder determines the optimal value for f_r by estimating how tonal the signal is. The tonality estimate is based on discrete pdf (4-bin histogram) of each band. Bands that have a large number of small values are considered more tonal and a decision is made by combining all bands with more than @@ -6964,7 +7142,7 @@ all integer codevectors y of N dimensions that satisfy sum(abs(y(j))) = K. -In bands where there are sufficient bits allocated the PVQ is used to encode +In bands where there are sufficient bits allocated PVQ is used to encode the unit vector that results from the normalization in directly. Given a PVQ codevector y, the unit vector X is obtained as X = y/||y||, where ||.|| denotes the @@ -7017,11 +7195,11 @@ codebook and the implementers MAY use any other search methods. See alg_quant()
-
+
-It is the intention to allow the greatest possible choice of freedom in -implementing the specification. For this reason, outside of a few exceptions +It is our intention to allow the greatest possible choice of freedom in +implementing the specification. For this reason, outside of the exceptions noted in this section, conformance is defined through the reference implementation of the decoder provided in . Although this document includes an English description of the codec, should @@ -7030,55 +7208,64 @@ the latter shall take precedence. -Compliance with this specification means that a decoder's output MUST be +Compliance with this specification means that in addition to following the normative keywords in this document, + a decoder's output MUST also be within the thresholds specified by the opus_compare.c tool (included - with the code) when compared to the reference implementation for each of the - test vectors provided (see ). Either the floating-point - implementation or the fixed-point implementation can be used as a reference and being - within the threshold for one of the two is sufficient. In addition, a compliant + with the code) when compared to the reference implementation for each of the + test vectors provided (see ) and for each output + sampling rate and channel count supported. In addition, a compliant decoder implementation MUST have the same final range decoder state as that of the - reference decoder. + reference decoder. It is therefore RECOMMENDED that the + decoder implement the same functional behavior as the reference. + + A decoder implementation is not required to support all output sampling + rates or all output channel counts.
Using the reference code provided in , -a mono test vector can be decoded with +a test vector can be decoded with -opus_demo -d 48000 1 test_mono.bit test_mono.out +opus_demo -d <rate> <channels> testvectorX.bit testX.out +where <rate> is the sampling rate and can be 8000, 12000, 16000, 24000, or 48000, and +<channels> is 1 for mono or 2 for stereo. + + If the range decoder state is incorrect for one of the frames, the decoder will exit with "Error: Range coder state mismatch between encoder and decoder". If the decoder succeeds, then the output can be compared with the "reference" output with -opus_compare test_mono.float test_mono.out +opus_compare -s -r <rate> testvectorX.dec testX.out -or +for stereo or -opus_compare test_mono.fixed test_mono.out - - -For a stereo test vector, the command line for decoding is - -opus_demo -d 48000 2 test_stereo.bin test_stereo.out +opus_compare -r <rate> testvectorX.dec testX.out +for mono. + -and the output can be compared with the reference output with - -opus_compare -s test_stereo.float test_stereo.out - -or - -opus_compare -s test_stereo.fixed test_stereo.out - +In addition to indicating whether the test vector comparison passes, the opus_compare tool +outputs an "Opus quality metric" that indicates how well the tested decoder matches the +reference implementation. A quality of 0 corresponds to the passing threshold, while +a quality of 100 means that the output of the tested decoder is identical to the reference +implementation. The passing threshold was calibrated in such a way that it corresponds to +additive white noise with a 48 dB SNR (similar to what can be obtained on a cassette deck). +It is still possible for an implementation to sound very good with such a low quality measure +(e.g. if the deviation is due to inaudible phase distortion), but unless this is verified by +listening tests, it is RECOMMENDED that implementations achieve a quality above 90 for 48 kHz +decoding. 
For other sampling rates, it is normal for the quality metric to be lower +(typically as low as 50 even for a good implementation) because of harmless mismatch with +the delay and phase of the internal sampling rate conversion. On POSIX environments, the run_vectors.sh script can be used to verify all test vectors. This can be done with -run_vectors.sh <exec path> <vector path> +run_vectors.sh <exec path> <vector path> <rate> where <exec path> is the directory where the opus_demo and opus_compare executables are built and <vector path> is the directory containing the test vectors. @@ -7120,8 +7307,8 @@ The reference implementation contains no known buffer overflow or cases where in CPU load. However, on certain CPU architectures where denormalized floating-point operations are much slower than normal floating-point operations, it is - possible for some audio content (e.g., silence or near-silence) to cause a certain - an increase in CPU load. + possible for some audio content (e.g., silence or near-silence) to cause an + increase in CPU load. Denormals can be introduced by reordering operations in the compiler and depend on the target architecture, so it is difficult to guarantee that an implementation avoids them. @@ -7195,7 +7382,7 @@ name of work, or endorsement information. - + @@ -7213,7 +7400,7 @@ name of work, or endorsement information. This document provides specific requirements for an Internet audio - codec. These requirements address quality, sample rate, bit-rate, + codec. These requirements address quality, sample rate, bitrate, and packet-loss robustness, as well as other desirable properties. @@ -7354,7 +7541,7 @@ Robust and Efficient Quantization of Speech LSP Parameters Using Structured Vect
This appendix contains the complete source code for the -reference implementation of the Opus codec written in C. By default, +reference implementation of the Opus codec written in C. By default, this implementation relies on floating-point arithmetic, but it can be compiled to use only fixed-point arithmetic by defining the FIXED_POINT macro. Information on building and using the reference implementation is @@ -7369,15 +7556,15 @@ but it is easy to substitute any other FFT library. -While the reference implementation does not rely on any +While the reference implementation does not rely on any undefined behavior as defined by C89 or C99, it relies on common implementation-defined behavior for two's complement architectures: -Right shifts of negative values are consistent with two's complement arithmetic, so that a>>b is equivalent to floor(a/(2^b)) -For conversion to a signed integer of N bits, the value is reduced modulo 2^N to be within range of the type -The result of integer division of a negative values is truncated towards zero -The compiler provides a 64-bit integer type (a C99 requirement which is supported by most C89 compilers) +Right shifts of negative values are consistent with two's complement arithmetic, so that a>>b is equivalent to floor(a/(2**b)), +For conversion to a signed integer of N bits, the value is reduced modulo 2**N to be within range of the type, +The result of integer division of a negative value is truncated towards zero, and +The compiler provides a 64-bit integer type (a C99 requirement which is supported by most C89 compilers). 
@@ -7385,9 +7572,9 @@ for two's complement architectures: In its current form, the reference implementation also requires the following architectural characteristics to obtain acceptable performance: -two's complement arithmetic -at least a 16 bit by 16 bit integer multiplier (32-bit result) -at least a 32-bit adder/accumulator +Two's complement arithmetic, +At least a 16 bit by 16 bit integer multiplier (32-bit result), and +At least a 32-bit adder/accumulator. @@ -7428,60 +7615,31 @@ Development snapshots are provided at
-
+
-
+
Because of size constraints, the Opus test vectors are not distributed in this -draft. They are available from the Opus codec website at +draft. They are available from the Opus codec website at and will also be made available in IETF meeting proceedings. These test vectors were created specifically to exercise all aspects of the decoder and therefore the audio quality of the decoded output is -significantly lower than what Opus can achieve in normal operation. +significantly lower than what Opus can achieve in normal operation. The SHA1 hash of the files in the test vector package are -
- -
+
+
- +
To use the internal framing described in , the decoder diff --git a/silk/dec_API.c b/silk/dec_API.c index a0b841ce..8c9ed24a 100644 --- a/silk/dec_API.c +++ b/silk/dec_API.c @@ -92,6 +92,7 @@ opus_int silk_Decode( /* O Returns error co silk_decoder *psDec = ( silk_decoder * )decState; silk_decoder_state *channel_state = psDec->channel_state; opus_int has_side; + opus_int stereo_to_mono; /**********************************/ /* Test if first frame in payload */ @@ -107,6 +108,9 @@ opus_int silk_Decode( /* O Returns error co ret += silk_init_decoder( &channel_state[ 1 ] ); } + stereo_to_mono = decControl->nChannelsInternal == 1 && psDec->nChannelsInternal == 2 && + ( decControl->internalSampleRate == 1000*channel_state[ 0 ].fs_kHz ); + if( channel_state[ 0 ].nFramesDecoded == 0 ) { for( n = 0; n < decControl->nChannelsInternal; n++ ) { opus_int fs_kHz_dec; @@ -293,7 +297,7 @@ opus_int silk_Decode( /* O Returns error co ret += silk_resampler( &channel_state[ n ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ n ][ 1 ], nSamplesOutDec ); /* Interleave if stereo output and stereo stream */ - if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) { + if( decControl->nChannelsAPI == 2 ) { for( i = 0; i < *nSamplesOut; i++ ) { samplesOut[ n + 2 * i ] = resample_out_ptr[ i ]; } @@ -302,8 +306,18 @@ opus_int silk_Decode( /* O Returns error co /* Create two channel output from mono stream */ if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 1 ) { - for( i = 0; i < *nSamplesOut; i++ ) { - samplesOut[ 0 + 2 * i ] = samplesOut[ 1 + 2 * i ] = resample_out_ptr[ i ]; + if ( stereo_to_mono ){ + /* Resample right channel for newly collapsed stereo just in case + we weren't doing collapsing when switching to mono */ + ret += silk_resampler( &channel_state[ 1 ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ 0 ][ 1 ], nSamplesOutDec ); + + for( i = 0; i < *nSamplesOut; i++ ) { + samplesOut[ 1 + 2 * i ] = resample_out_ptr[ i ]; + } + } 
else { + for( i = 0; i < *nSamplesOut; i++ ) { + samplesOut[ 1 + 2 * i ] = samplesOut[ 0 + 2 * i ]; + } } } diff --git a/silk/decoder_set_fs.c b/silk/decoder_set_fs.c index e0a343f8..c0bf352b 100644 --- a/silk/decoder_set_fs.c +++ b/silk/decoder_set_fs.c @@ -49,25 +49,9 @@ opus_int silk_decoder_set_fs( /* Initialize resampler when switching internal or external sampling frequency */ if( psDec->fs_kHz != fs_kHz || psDec->fs_API_hz != fs_API_Hz ) { - /* Allocate worst case space for temporary upsampling, 8 to 48 kHz, so a factor 6 */ - opus_int16 temp_buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ ]; - silk_resampler_state_struct temp_resampler_state; - - if( psDec->fs_kHz != fs_kHz && psDec->fs_kHz > 0 ) { - /* Initialize resampler for temporary resampling of outBuf data to the new internal sampling rate */ - ret += silk_resampler_init( &temp_resampler_state, silk_SMULBB( psDec->fs_kHz, 1000 ), silk_SMULBB( fs_kHz, 1000 ), 0 ); - - /* Temporary resampling of outBuf data to the new internal sampling rate */ - silk_memcpy( temp_buf, psDec->outBuf, psDec->frame_length * sizeof( opus_int16 ) ); - ret += silk_resampler( &temp_resampler_state, psDec->outBuf, temp_buf, psDec->frame_length ); - } - /* Initialize the resampler for dec_API.c preparing resampling from fs_kHz to API_fs_Hz */ ret += silk_resampler_init( &psDec->resampler_state, silk_SMULBB( fs_kHz, 1000 ), fs_API_Hz, 0 ); - /* Correct resampler state by resampling buffered data from fs_kHz to API_fs_Hz */ - ret += silk_resampler( &psDec->resampler_state, temp_buf, psDec->outBuf, frame_length ); - psDec->fs_API_hz = fs_API_Hz; } diff --git a/src/opus_compare.c b/src/opus_compare.c index a74acb0e..b8a16202 100644 --- a/src/opus_compare.c +++ b/src/opus_compare.c @@ -133,7 +133,7 @@ static const int BANDS[NBANDS+1]={ }; #define TEST_WIN_SIZE (480) -#define TEST_WIN_STEP (TEST_WIN_SIZE>>1) +#define TEST_WIN_STEP (120) int main(int _argc,const char **_argv){ FILE *fin1; @@ -143,7 +143,7 @@ int main(int _argc,const char 
**_argv){ float *xb; float *X; float *Y; - float err; + double err; float Q; size_t xlength; size_t ylength; @@ -246,14 +246,15 @@ int main(int _argc,const char **_argv){ } } if(xi>0){ - /*Temporal masking: 5 dB/5ms slope.*/ + /*Temporal masking: -3 dB/2.5ms slope.*/ for(bi=0;bi=79&&xj<=81)im*=0.1F; - if(xj==80)im*=0.1F; - Ef+=im*im; + for(bi=0;bi=79&&xj<=81)im*=0.1F; + if(xj==80)im*=0.1F; + Eb+=im; + } } + Eb /= (BANDS[bi+1]-BANDS[bi])*nchannels; + Ef += Eb*Eb; } /*Using a fixed normalization value means we're willing to accept slightly lower quality for lower sampling rates.*/ - Ef/=200*nchannels; + Ef/=NBANDS; Ef*=Ef; err+=Ef*Ef; } diff --git a/src/opus_decoder.c b/src/opus_decoder.c index ab79f427..889b5a4f 100644 --- a/src/opus_decoder.c +++ b/src/opus_decoder.c @@ -427,7 +427,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data, pcm[i] = 0; /* For hybrid -> SILK transitions, we let the CELT MDCT do a fade-out by decoding a silence frame */ - if (st->prev_mode == MODE_HYBRID) + if (st->prev_mode == MODE_HYBRID && !(redundancy && celt_to_silk && st->prev_redundancy) ) { celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)); celt_decode_with_ec(celt_dec, silence, 2, pcm, F2_5, NULL); diff --git a/src/opus_demo.c b/src/opus_demo.c index f97648c0..34fba5ca 100644 --- a/src/opus_demo.c +++ b/src/opus_demo.c @@ -102,6 +102,103 @@ static void check_encoder_option(int decode_only, const char *opt) } } +int silk8_test[][4] = { + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*3, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*2, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*3, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*2, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 2} +}; + +int silk12_test[][4] = { + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*3, 
1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*2, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 480, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*3, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*2, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 480, 2} +}; + +int silk16_test[][4] = { + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*3, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*2, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*3, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*2, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 2} +}; + +int hybrid24_test[][4] = { + {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 2} +}; + +int hybrid48_test[][4] = { + {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 1}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 2}, + {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 2} +}; + +int celt_test[][4] = { + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 1}, + + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 1}, + + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 240, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 240, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 240, 
1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 240, 1}, + + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 120, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 120, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 120, 1}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 120, 1}, + + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 2}, + + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 2}, + + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 240, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 240, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 240, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 240, 2}, + + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 120, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 120, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 120, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 120, 2}, + +}; + +int celt_hq_test[][4] = { + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 240, 2}, + {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 120, 2}, +}; + int main(int argc, char *argv[]) { int err; @@ -143,6 +240,11 @@ int main(int argc, char *argv[]) int random_framesize=0, newsize=0, delayed_celt=0; int sweep_max=0, sweep_min=0; int random_fec=0; + int (*mode_list)[4]=NULL; + int nb_modes_in_list=0; + int curr_mode=0; + int curr_mode_count=0; + int mode_switch_time = 48000; if (argc < 5 ) { @@ -302,6 +404,41 @@ int main(int argc, char *argv[]) check_encoder_option(decode_only, "-random_fec"); random_fec = 1; args++; + } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "-silk8k_test" ) == 0 ) { + 
check_encoder_option(decode_only, "-silk8k_test"); + mode_list = silk8_test; + nb_modes_in_list = 8; + args++; + } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "-silk12k_test" ) == 0 ) { + check_encoder_option(decode_only, "-silk12k_test"); + mode_list = silk12_test; + nb_modes_in_list = 8; + args++; + } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "-silk16k_test" ) == 0 ) { + check_encoder_option(decode_only, "-silk16k_test"); + mode_list = silk16_test; + nb_modes_in_list = 8; + args++; + } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "-hybrid24k_test" ) == 0 ) { + check_encoder_option(decode_only, "-hybrid24k_test"); + mode_list = hybrid24_test; + nb_modes_in_list = 4; + args++; + } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "-hybrid48k_test" ) == 0 ) { + check_encoder_option(decode_only, "-hybrid48k_test"); + mode_list = hybrid48_test; + nb_modes_in_list = 4; + args++; + } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "-celt_test" ) == 0 ) { + check_encoder_option(decode_only, "-celt_test"); + mode_list = celt_test; + nb_modes_in_list = 32; + args++; + } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "-celt_hq_test" ) == 0 ) { + check_encoder_option(decode_only, "-celt_hq_test"); + mode_list = celt_hq_test; + nb_modes_in_list = 4; + args++; } else { printf( "Error: unrecognized setting: %s\n\n", argv[ args ] ); print_usage( argv ); @@ -326,6 +463,17 @@ int main(int argc, char *argv[]) fprintf (stderr, "Could not open input file %s\n", argv[argc-2]); return EXIT_FAILURE; } + if (mode_list) + { + int size; + fseek(fin, 0, SEEK_END); + size = ftell(fin); + fprintf(stderr, "File size is %d bytes\n", size); + fseek(fin, 0, SEEK_SET); + mode_switch_time = size/sizeof(short)/channels/nb_modes_in_list; + fprintf(stderr, "Switching mode every %d samples\n", mode_switch_time); + } + outFile = argv[argc-1]; fout = fopen(outFile, "wb+"); if (!fout) @@ -428,6 +576,8 @@ int main(int argc, char *argv[]) case 4: newsize=sampling_rate/25; 
break; case 5: newsize=3*sampling_rate/50; break; } + while (newsize < sampling_rate/25 && bitrate_bps-fabs(sweep_bps) <= 3*12*sampling_rate/newsize) + newsize*=2; if (newsize < sampling_rate/100 && frame_size >= sampling_rate/100) { opus_encoder_ctl(enc, OPUS_SET_FORCE_MODE(MODE_CELT_ONLY)); @@ -463,6 +613,13 @@ int main(int argc, char *argv[]) break; } } else { + if (mode_list!=NULL) + { + opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(mode_list[curr_mode][1])); + opus_encoder_ctl(enc, OPUS_SET_FORCE_MODE(mode_list[curr_mode][0])); + opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(mode_list[curr_mode][3])); + frame_size = mode_list[curr_mode][2]; + } err = fread(in, sizeof(short)*channels, frame_size, fin); curr_read = err; if (curr_read < frame_size) @@ -472,7 +629,6 @@ int main(int argc, char *argv[]) in[i] = 0; stop = 1; } - len[toggle] = opus_encode(enc, in, frame_size, data[toggle], max_payload_bytes); if (sweep_bps!=0) { @@ -497,6 +653,12 @@ int main(int argc, char *argv[]) fclose(fout); return EXIT_FAILURE; } + curr_mode_count += frame_size; + if (curr_mode_count > mode_switch_time && curr_mode < nb_modes_in_list-1) + { + curr_mode++; + curr_mode_count = 0; + } } if (encode_only) diff --git a/tests/run_vectors.sh b/tests/run_vectors.sh index 81b68f3c..9b5c29be 100755 --- a/tests/run_vectors.sh +++ b/tests/run_vectors.sh @@ -1,12 +1,16 @@ #!/bin/sh -if [ "$#" -ne "2" ]; then - echo "usage: run_vectors.sh " +rm logs_mono.txt +rm logs_stereo.txt + +if [ "$#" -ne "3" ]; then + echo "usage: run_vectors.sh " exit 1 fi CMD_PATH=$1 VECTOR_PATH=$2 +RATE=$3 OPUS_DEMO=$CMD_PATH/opus_demo OPUS_COMPARE=$CMD_PATH/opus_compare @@ -32,24 +36,23 @@ echo Testing mono echo "==============" echo -for file in test1_mono test2_mono test3_mono test4_mono test5_mono +for file in `seq -w 1 11` do - if [ -e $VECTOR_PATH/$file.bit ]; then - echo Testing $file + if [ -e $VECTOR_PATH/testvector$file.bit ]; then + echo Testing testvector$file else - echo Bitstream file not found: $file + echo 
Bitstream file not found: testvector$file.bit fi - if $OPUS_DEMO -d 48000 1 $VECTOR_PATH/$file.bit tmp.out > /dev/null 2>&1; then + if $OPUS_DEMO -d $RATE 1 $VECTOR_PATH/testvector$file.bit tmp.out >> logs_mono.txt 2>&1; then echo successfully decoded else echo ERROR: decoding failed exit 1 fi - $OPUS_COMPARE $VECTOR_PATH/$file.float tmp.out > /dev/null 2>&1 + $OPUS_COMPARE -r $RATE $VECTOR_PATH/testvector$file.dec tmp.out >> logs_mono.txt 2>&1 + true float_ret=$? - $OPUS_COMPARE $VECTOR_PATH/$file.fixed tmp.out > /dev/null 2>&1 - fixed_ret=$? - if [ "$float_ret" -eq "0" -o "$fixed_ret" -eq "0" ]; then + if [ "$float_ret" -eq "0" ]; then echo output matches reference else echo ERROR: output does not match reference @@ -63,24 +66,22 @@ echo Testing stereo echo "==============" echo -for file in test1_stereo test2_stereo test3_stereo test4_stereo +for file in `seq -w 1 11` do - if [ -e $VECTOR_PATH/$file.bit ]; then - echo Testing $file + if [ -e $VECTOR_PATH/testvector$file.bit ]; then + echo Testing testvector$file else - echo Bitstream file not found: $file + echo Bitstream file not found: testvector$file fi - if $OPUS_DEMO -d 48000 2 $VECTOR_PATH/$file.bit tmp.out > /dev/null 2>&1; then + if $OPUS_DEMO -d $RATE 2 $VECTOR_PATH/testvector$file.bit tmp.out >> logs_stereo.txt 2>&1; then echo successfully decoded else echo ERROR: decoding failed exit 1 fi - $OPUS_COMPARE -s $VECTOR_PATH/$file.float tmp.out > /dev/null 2>&1 + $OPUS_COMPARE -s -r $RATE $VECTOR_PATH/testvector$file.dec tmp.out >> logs_stereo.txt 2>&1 float_ret=$? - $OPUS_COMPARE -s $VECTOR_PATH/$file.fixed tmp.out > /dev/null 2>&1 - fixed_ret=$? 
- if [ "$float_ret" -eq "0" -o "$fixed_ret" -eq "0" ]; then + if [ "$float_ret" -eq "0" ]; then echo output matches reference else echo ERROR: output does not match reference @@ -92,3 +93,5 @@ done echo All tests have passed successfully +grep quality logs_mono.txt | awk '{sum+=$4}END{print "Average mono quality is", sum/NR, "%"}' +grep quality logs_stereo.txt | awk '{sum+=$4}END{print "Average stereo quality is", sum/NR, "%"}' -- cgit v1.2.3