Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@amazon.com>2023-06-21 20:08:08 +0300
committerJean-Marc Valin <jmvalin@amazon.com>2023-06-21 20:08:08 +0300
commit7458c1bdc667564764d630f8ea08933a9c5e9319 (patch)
tree54dbce728f012af69e2ee37ab6196425aeee1e0a
parent7dae7b178ca8e6837429d8ed97464ed2ce4e1720 (diff)
parent3bd86da9a91025da563f1ab1d9b8c1f93ecf6c29 (diff)
Merge LPCNet repo into Opus
Repo was filtered to move everything to the dnn/ directory
-rw-r--r--dnn/AUTHORS2
-rw-r--r--dnn/COPYING31
-rw-r--r--dnn/LPCNet.yml24
-rw-r--r--dnn/Makefile.am158
-rw-r--r--dnn/README1
-rw-r--r--dnn/README.md127
-rw-r--r--dnn/_kiss_fft_guts.h182
-rw-r--r--dnn/arch.h245
-rwxr-xr-xdnn/autogen.sh14
-rw-r--r--dnn/burg.c245
-rw-r--r--dnn/burg.h41
-rw-r--r--dnn/ceps_vq_train.c619
-rw-r--r--dnn/common.c65
-rw-r--r--dnn/common.h96
-rwxr-xr-xdnn/compile.sh4
-rw-r--r--dnn/concat.sh6
-rw-r--r--dnn/configure.ac131
-rw-r--r--dnn/datasets.txt174
-rw-r--r--dnn/doc/Doxyfile.in18
-rw-r--r--dnn/doc/Makefile68
-rw-r--r--dnn/download_model.bat10
-rwxr-xr-xdnn/download_model.sh13
-rw-r--r--dnn/dred_rdovae.c136
-rw-r--r--dnn/dred_rdovae_dec.c98
-rw-r--r--dnn/dred_rdovae_dec.h44
-rw-r--r--dnn/dred_rdovae_enc.c95
-rw-r--r--dnn/dred_rdovae_enc.h45
-rw-r--r--dnn/dump_data.c307
-rw-r--r--dnn/dump_lpcnet_tables.c104
-rw-r--r--dnn/freq.c329
-rw-r--r--dnn/freq.h61
-rw-r--r--dnn/include/dred_rdovae.h66
-rw-r--r--dnn/include/lpcnet.h217
-rw-r--r--dnn/kiss99.c81
-rw-r--r--dnn/kiss99.h46
-rw-r--r--dnn/kiss_fft.c601
-rw-r--r--dnn/kiss_fft.h202
-rw-r--r--dnn/lpcnet-uninstalled.pc.in13
-rw-r--r--dnn/lpcnet.c321
-rw-r--r--dnn/lpcnet.pc.in14
-rw-r--r--dnn/lpcnet_dec.c156
-rw-r--r--dnn/lpcnet_demo.c271
-rw-r--r--dnn/lpcnet_enc.c933
-rw-r--r--dnn/lpcnet_plc.c503
-rw-r--r--dnn/lpcnet_private.h142
-rw-r--r--dnn/lpcnet_tables.c307
-rw-r--r--dnn/m4/attributes.m4321
-rw-r--r--dnn/nnet.c503
-rw-r--r--dnn/nnet.h205
-rw-r--r--dnn/opus_types.h159
-rw-r--r--dnn/parse_lpcnet_weights.c257
-rw-r--r--dnn/pitch.c84
-rw-r--r--dnn/pitch.h122
-rw-r--r--dnn/tansig_table.h50
-rw-r--r--dnn/test_lpcnet.c69
-rw-r--r--dnn/test_vec.c130
-rw-r--r--dnn/torch/rdovae/README.md24
-rw-r--r--dnn/torch/rdovae/export_rdovae_weights.py258
-rw-r--r--dnn/torch/rdovae/fec_encoder.py213
-rw-r--r--dnn/torch/rdovae/import_rdovae_weights.py143
-rw-r--r--dnn/torch/rdovae/libs/wexchange-1.0-py3-none-any.whlbin0 -> 7153 bytes
-rw-r--r--dnn/torch/rdovae/libs/wexchange-1.2-py3-none-any.whlbin0 -> 7794 bytes
-rw-r--r--dnn/torch/rdovae/packets/__init__.py1
-rw-r--r--dnn/torch/rdovae/packets/fec_packets.c142
-rw-r--r--dnn/torch/rdovae/packets/fec_packets.h34
-rw-r--r--dnn/torch/rdovae/packets/fec_packets.py108
-rw-r--r--dnn/torch/rdovae/rdovae/__init__.py2
-rw-r--r--dnn/torch/rdovae/rdovae/dataset.py68
-rw-r--r--dnn/torch/rdovae/rdovae/rdovae.py614
-rw-r--r--dnn/torch/rdovae/requirements.txt5
-rw-r--r--dnn/torch/rdovae/train_rdovae.py270
-rw-r--r--dnn/training_tf2/dataloader.py49
-rw-r--r--dnn/training_tf2/decode_rdovae.py111
-rw-r--r--dnn/training_tf2/diffembed.py49
-rwxr-xr-xdnn/training_tf2/dump_lpcnet.py388
-rwxr-xr-xdnn/training_tf2/dump_plc.py296
-rw-r--r--dnn/training_tf2/dump_rdovae.py306
-rw-r--r--dnn/training_tf2/encode_rdovae.py125
-rw-r--r--dnn/training_tf2/fec_encoder.py257
-rw-r--r--dnn/training_tf2/fec_packets.c142
-rw-r--r--dnn/training_tf2/fec_packets.h34
-rw-r--r--dnn/training_tf2/fec_packets.py108
-rw-r--r--dnn/training_tf2/keraslayerdump.py189
-rw-r--r--dnn/training_tf2/lossfuncs.py100
-rw-r--r--dnn/training_tf2/lpcnet.py339
-rw-r--r--dnn/training_tf2/lpcnet_plc.py101
-rw-r--r--dnn/training_tf2/mdense.py95
-rw-r--r--dnn/training_tf2/pade.py70
-rw-r--r--dnn/training_tf2/parameters.py29
-rw-r--r--dnn/training_tf2/plc_loader.py73
-rw-r--r--dnn/training_tf2/rdovae.py373
-rw-r--r--dnn/training_tf2/rdovae_exchange.py138
-rw-r--r--dnn/training_tf2/rdovae_import.py123
-rwxr-xr-xdnn/training_tf2/test_lpcnet.py122
-rw-r--r--dnn/training_tf2/test_plc.py92
-rw-r--r--dnn/training_tf2/tf_funcs.py70
-rwxr-xr-xdnn/training_tf2/train_lpcnet.py214
-rw-r--r--dnn/training_tf2/train_plc.py197
-rw-r--r--dnn/training_tf2/train_rdovae.py151
-rw-r--r--dnn/training_tf2/ulaw.py19
-rw-r--r--dnn/training_tf2/uniform_noise.py78
-rwxr-xr-xdnn/update_version65
-rw-r--r--dnn/vec.h409
-rw-r--r--dnn/vec_avx.h906
-rw-r--r--dnn/vec_neon.h373
-rw-r--r--dnn/write_lpcnet_weights.c78
106 files changed, 17417 insertions, 0 deletions
diff --git a/dnn/AUTHORS b/dnn/AUTHORS
new file mode 100644
index 00000000..8ee917e5
--- /dev/null
+++ b/dnn/AUTHORS
@@ -0,0 +1,2 @@
+Jean-Marc Valin <jmvalin@jmvalin.ca>
+David Rowe <david@rowetel.com>
diff --git a/dnn/COPYING b/dnn/COPYING
new file mode 100644
index 00000000..feef112b
--- /dev/null
+++ b/dnn/COPYING
@@ -0,0 +1,31 @@
+Copyright (c) 2017-2018, Mozilla
+Copyright (c) 2007-2017, Jean-Marc Valin
+Copyright (c) 2005-2017, Xiph.Org Foundation
+Copyright (c) 2003-2004, Mark Borgerding
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the Xiph.Org Foundation nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/dnn/LPCNet.yml b/dnn/LPCNet.yml
new file mode 100644
index 00000000..3a726657
--- /dev/null
+++ b/dnn/LPCNet.yml
@@ -0,0 +1,24 @@
+#
+# install
+# conda env create -f=LPCNet.yml
+#
+# update
+# conda env update -f=LPCNet.yml
+#
+# activate
+# conda activate LPCNet
+#
+# remove
+# conda remove --name LPCNet --all
+#
+name: LPCNet
+channels:
+ - anaconda
+ - conda-forge
+dependencies:
+ - keras==2.2.4
+ - python>=3.6
+ - tensorflow-gpu==1.12.0
+ - cudatoolkit
+ - h5py
+ - numpy
diff --git a/dnn/Makefile.am b/dnn/Makefile.am
new file mode 100644
index 00000000..0a2b0653
--- /dev/null
+++ b/dnn/Makefile.am
@@ -0,0 +1,158 @@
+ACLOCAL_AMFLAGS = -I m4
+
+AM_CFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/ $(DEPS_CFLAGS)
+
+dist_doc_DATA = COPYING AUTHORS README README.md
+
+include_HEADERS = include/lpcnet.h
+
+lib_LTLIBRARIES = liblpcnet.la
+noinst_HEADERS = arch.h \
+ burg.h \
+ common.h \
+ freq.h \
+ _kiss_fft_guts.h \
+ kiss_fft.h \
+ kiss99.h \
+ lpcnet_private.h \
+ opus_types.h \
+ nnet_data.h \
+ plc_data.h \
+ nnet.h \
+ pitch.h \
+ tansig_table.h \
+ vec.h \
+ vec_avx.h \
+ vec_neon.h
+
+liblpcnet_la_SOURCES = \
+ burg.c \
+ common.c \
+ kiss99.c \
+ lpcnet.c \
+ lpcnet_dec.c \
+ lpcnet_enc.c \
+ lpcnet_tables.c \
+ nnet.c \
+ nnet_data.c \
+ plc_data.c \
+ ceps_codebooks.c \
+ pitch.c \
+ parse_lpcnet_weights.c \
+ freq.c \
+ kiss_fft.c \
+ lpcnet_plc.c
+
+liblpcnet_la_LIBADD = $(DEPS_LIBS) $(lrintf_lib) $(LIBM)
+liblpcnet_la_LDFLAGS = -no-undefined \
+ -version-info @OP_LT_CURRENT@:@OP_LT_REVISION@:@OP_LT_AGE@
+
+noinst_PROGRAMS = lpcnet_demo dump_data dump_weights_blob
+
+lpcnet_demo_SOURCES = lpcnet_demo.c
+lpcnet_demo_LDADD = liblpcnet.la
+
+
+#DUMP_SOURCES = freq.c kiss_fft.c pitch.c lpcnet_dec.c lpcnet_enc.c ceps_codebooks.c common.c
+#DUMP_OBJ = $(DUMP_SOURCES:.c=.lo)
+#dump_data_SOURCES = dump_data.c
+#dump_data_LDADD = $(DUMP_OBJ) $(LIBM)
+
+dump_data_SOURCES = common.c dump_data.c burg.c freq.c kiss_fft.c pitch.c lpcnet_dec.c lpcnet_enc.c lpcnet_tables.c ceps_codebooks.c
+dump_data_LDADD = $(LIBM)
+dump_data_CFLAGS = $(AM_CFLAGS)
+
+dump_weights_blob_SOURCES = write_lpcnet_weights.c
+dump_weights_blob_LDADD = $(LIBM)
+dump_weights_blob_CFLAGS = $(AM_CFLAGS) -DDUMP_BINARY_WEIGHTS
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = lpcnet.pc
+
+debug:
+ $(MAKE) CFLAGS="${CFLAGS} -O0 -ggdb -DOP_ENABLE_ASSERTIONS" all
+
+EXTRA_DIST = \
+ lpcnet.pc.in \
+ lpcnet-uninstalled.pc.in \
+ doc/Doxyfile.in \
+ doc/Makefile
+
+# Targets to build and install just the library without the docs
+liblpcnet install-liblpcnet: NO_DOXYGEN = 1
+
+lpcnet: all
+install-lpcnet: install
+
+# Or just the docs
+docs: doc/doxygen-build.stamp
+
+install-docs:
+ @if [ -z "$(NO_DOXYGEN)" ]; then \
+ ( cd doc && \
+ echo "Installing documentation in $(DESTDIR)$(docdir)"; \
+ $(INSTALL) -d $(DESTDIR)$(docdir)/html/search; \
+ for f in `find html -type f \! -name "installdox"` ; do \
+ $(INSTALL_DATA) $$f $(DESTDIR)$(docdir)/$$f; \
+ done ) \
+ fi
+
+doc/doxygen-build.stamp: doc/Doxyfile \
+ $(top_srcdir)/include/*.h
+ @[ -n "$(NO_DOXYGEN)" ] || ( cd doc && doxygen && touch $(@F) )
+
+
+if HAVE_DOXYGEN
+
+# Or everything (by default)
+all-local: docs
+
+install-data-local: install-docs
+
+clean-local:
+ $(RM) -r doc/html
+ $(RM) -r doc/latex
+ $(RM) doc/doxygen-build.stamp
+
+uninstall-local:
+ $(RM) -r $(DESTDIR)$(docdir)/html
+
+endif
+
+# We check this every time make is run, with configure.ac being touched to
+# trigger an update of the build system files if update_version changes the
+# current PACKAGE_VERSION (or if package_version was modified manually by a
+# user with either AUTO_UPDATE=no or no update_version script present - the
+# latter being the normal case for tarball releases).
+#
+# We can't just add the package_version file to CONFIGURE_DEPENDENCIES since
+# simply running autoconf will not actually regenerate configure for us when
+# the content of that file changes (due to autoconf dependency checking not
+# knowing about that without us creating yet another file for it to include).
+#
+# The MAKECMDGOALS check is a gnu-make'ism, but will degrade 'gracefully' for
+# makes that don't support it. The only loss of functionality is not forcing
+# an update of package_version for `make dist` if AUTO_UPDATE=no, but that is
+# unlikely to be a real problem for any real user.
+$(top_srcdir)/configure.ac: force
+ @case "$(MAKECMDGOALS)" in \
+ dist-hook) exit 0 ;; \
+ dist-* | dist | distcheck | distclean) _arg=release ;; \
+ esac; \
+ if ! $(top_srcdir)/update_version $$_arg 2> /dev/null; then \
+ if [ ! -e $(top_srcdir)/package_version ]; then \
+ echo 'PACKAGE_VERSION="unknown"' > $(top_srcdir)/package_version; \
+ fi; \
+ . $(top_srcdir)/package_version || exit 1; \
+ [ "$(PACKAGE_VERSION)" != "$$PACKAGE_VERSION" ] || exit 0; \
+ fi; \
+ touch $@
+
+force:
+
+# Create a minimal package_version file when make dist is run.
+dist-hook:
+ echo 'PACKAGE_VERSION="$(PACKAGE_VERSION)"' > $(top_distdir)/package_version
+
+
+.PHONY: lpcnet install-lpcnet docs install-docs
diff --git a/dnn/README b/dnn/README
new file mode 100644
index 00000000..96dc92fb
--- /dev/null
+++ b/dnn/README
@@ -0,0 +1 @@
+See README.md
diff --git a/dnn/README.md b/dnn/README.md
new file mode 100644
index 00000000..930b043c
--- /dev/null
+++ b/dnn/README.md
@@ -0,0 +1,127 @@
+# LPCNet
+
+Low complexity implementation of the WaveRNN-based LPCNet algorithm, as described in:
+
+- J.-M. Valin, J. Skoglund, [LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://jmvalin.ca/papers/lpcnet_icassp2019.pdf), *Proc. International Conference on Acoustics, Speech and Signal Processing (ICASSP)*, arXiv:1810.11846, 2019.
+- J.-M. Valin, U. Isik, P. Smaragdis, A. Krishnaswamy, [Neural Speech Synthesis on a Shoestring: Improving the Efficiency of LPCNet](https://jmvalin.ca/papers/improved_lpcnet.pdf), *Proc. ICASSP*, arxiv:2106.04129, 2022.
+- K. Subramani, J.-M. Valin, U. Isik, P. Smaragdis, A. Krishnaswamy, [End-to-end LPCNet: A Neural Vocoder With Fully-Differentiable LPC Estimation](https://jmvalin.ca/papers/lpcnet_end2end.pdf), *Proc. INTERSPEECH*, arxiv:2106.04129, 2022.
+
+For coding/PLC applications of LPCNet, see:
+
+- J.-M. Valin, J. Skoglund, [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://jmvalin.ca/papers/lpcnet_codec.pdf), *Proc. INTERSPEECH*, arxiv:1903.12087, 2019.
+- J. Skoglund, J.-M. Valin, [Improving Opus Low Bit Rate Quality with Neural Speech Synthesis](https://jmvalin.ca/papers/opusnet.pdf), *Proc. INTERSPEECH*, arxiv:1905.04628, 2020.
+- J.-M. Valin, A. Mustafa, C. Montgomery, T.B. Terriberry, M. Klingbeil, P. Smaragdis, A. Krishnaswamy, [Real-Time Packet Loss Concealment With Mixed Generative and Predictive Model](https://jmvalin.ca/papers/lpcnet_plc.pdf), *Proc. INTERSPEECH*, arxiv:2205.05785, 2022.
+- J.-M. Valin, J. Büthe, A. Mustafa, [Low-Bitrate Redundancy Coding of Speech Using a Rate-Distortion-Optimized Variational Autoencoder](https://jmvalin.ca/papers/valin_dred.pdf), *Proc. ICASSP*, arXiv:2212.04453, 2023. ([blog post](https://www.amazon.science/blog/neural-encoding-enables-more-efficient-recovery-of-lost-audio-packets))
+
+# Introduction
+
+Work in progress software for researching low CPU complexity algorithms for speech synthesis and compression by applying Linear Prediction techniques to WaveRNN. High quality speech can be synthesised on regular CPUs (around 3 GFLOP) with SIMD support (SSE2, SSSE3, AVX, AVX2/FMA, NEON currently supported). The code also supports very low bitrate compression at 1.6 kb/s.
+
+The BSD licensed software is written in C and Python/Keras. For training, a GTX 1080 Ti or better is recommended.
+
+This software is an open source starting point for LPCNet/WaveRNN-based speech synthesis and coding.
+
+# Using the existing software
+
+You can build the code using:
+
+```
+./autogen.sh
+./configure
+make
+```
+Note that the autogen.sh script is used when building from Git and will automatically download the latest model
+(models are too large to put in Git). By default, LPCNet will attempt to use 8-bit dot product instructions on AVX\*/Neon to
+speed up inference. To disable that (e.g. to avoid quantization effects when retraining), add --disable-dot-product to the
+configure script. LPCNet does not yet have a complete implementation for some of the integer operations on the ARMv7
+architecture so for now you will also need --disable-dot-product to successfully compile on 32-bit ARM.
+
+It is highly recommended to set the CFLAGS environment variable to enable AVX or NEON *prior* to running configure, otherwise
+no vectorization will take place and the code will be very slow. On a recent x86 CPU, something like
+```
+export CFLAGS='-Ofast -g -march=native'
+```
+should work. On ARM, you can enable Neon with:
+```
+export CFLAGS='-Ofast -g -mfpu=neon'
+```
+While not strictly required, the -Ofast flag will help with auto-vectorization, especially for dot products that
+cannot be optimized without -ffast-math (which -Ofast enables). Additionally, -falign-loops=32 has been shown to
+help on x86.
+
+You can test the capabilities of LPCNet using the lpcnet\_demo application. To encode a file:
+```
+./lpcnet_demo -encode input.pcm compressed.bin
+```
+where input.pcm is a 16-bit (machine endian) PCM file sampled at 16 kHz. The raw compressed data (no header)
+is written to compressed.bin and consists of 8 bytes per 40-ms packet.
+
+To decode:
+```
+./lpcnet_demo -decode compressed.bin output.pcm
+```
+where output.pcm is also 16-bit, 16 kHz PCM.
+
+Alternatively, you can run the uncompressed analysis/synthesis using -features
+instead of -encode and -synthesis instead of -decode.
+The same functionality is available in the form of a library. See include/lpcnet.h for the API.
+
+To try packet loss concealment (PLC), you first need a PLC model, which you can get with:
+```
+./download_model.sh plc-3b1eab4
+```
+or (for the PLC challenge submission):
+```
+./download_model.sh plc_challenge
+```
+PLC can be tested with:
+```
+./lpcnet_demo -plc_file noncausal_dc error_pattern.txt input.pcm output.pcm
+```
+where error_pattern.txt is a text file with one entry per 20-ms packet, with 1 meaning "packet lost" and 0 meaning "packet not lost".
+noncausal_dc is the non-causal (5-ms look-ahead) with special handling for DC offsets. It's also possible to use "noncausal", "causal",
+or "causal_dc".
+
+# Training a new model
+
+This codebase is also meant for research and it is possible to train new models. These are the steps to do that:
+
+1. Set up a Keras system with GPU.
+
+1. Generate training data:
+ ```
+ ./dump_data -train input.s16 features.f32 data.s16
+ ```
+ where the first file contains 16 kHz 16-bit raw PCM audio (no header) and the other files are output files. This program makes several passes over the data with different filters to generate a large amount of training data.
+
+1. Now that you have your files, train with:
+ ```
+ python3 training_tf2/train_lpcnet.py features.f32 data.s16 model_name
+ ```
+ and it will generate an h5 file for each iteration, with model\_name as prefix. If it stops with a
+ "Failed to allocate RNN reserve space" message try specifying a smaller --batch-size for train\_lpcnet.py.
+
+1. You can synthesise speech with Python and your GPU card (very slow):
+ ```
+ ./dump_data -test test_input.s16 test_features.f32
+ ./training_tf2/test_lpcnet.py lpcnet_model_name.h5 test_features.f32 test.s16
+ ```
+
+1. Or with C on a CPU (C inference is much faster):
+ First extract the model files nnet\_data.h and nnet\_data.c
+ ```
+ ./training_tf2/dump_lpcnet.py lpcnet_model_name.h5
+ ```
+ and move the generated nnet\_data.\* files to the src/ directory.
+ Then you just need to rebuild the software and use lpcnet\_demo as explained above.
+
+# Speech Material for Training
+
+Suitable training material can be obtained from [Open Speech and Language Resources](https://www.openslr.org/). See the datasets.txt file for details on suitable training data.
+
+# Reading Further
+
+1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/)
+1. [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://people.xiph.org/~jm/demo/lpcnet_codec/)
+1. Sample model files (check compatibility): https://media.xiph.org/lpcnet/data/
+
diff --git a/dnn/_kiss_fft_guts.h b/dnn/_kiss_fft_guts.h
new file mode 100644
index 00000000..17392b3e
--- /dev/null
+++ b/dnn/_kiss_fft_guts.h
@@ -0,0 +1,182 @@
+/*Copyright (c) 2003-2004, Mark Borgerding
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_GUTS_H
+#define KISS_FFT_GUTS_H
+
+#define MIN(a,b) ((a)<(b) ? (a):(b))
+#define MAX(a,b) ((a)>(b) ? (a):(b))
+
+/* kiss_fft.h
+ defines kiss_fft_scalar as either short or a float type
+ and defines
+ typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
+#include "kiss_fft.h"
+
+/*
+ Explanation of macros dealing with complex math:
+
+ C_MUL(m,a,b) : m = a*b
+ C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise
+ C_SUB( res, a,b) : res = a - b
+ C_SUBFROM( res , a) : res -= a
+ C_ADDTO( res , a) : res += a
+ * */
+#ifdef FIXED_POINT
+#include "arch.h"
+
+
+#define SAMP_MAX 2147483647
+#define TWID_MAX 32767
+#define TRIG_UPSCALE 1
+
+#define SAMP_MIN -SAMP_MAX
+
+
+# define S_MUL(a,b) MULT16_32_Q15(b, a)
+
+# define C_MUL(m,a,b) \
+ do{ (m).r = SUB32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
+ (m).i = ADD32_ovflw(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0)
+
+# define C_MULC(m,a,b) \
+ do{ (m).r = ADD32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
+ (m).i = SUB32_ovflw(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
+
+# define C_MULBYSCALAR( c, s ) \
+ do{ (c).r = S_MUL( (c).r , s ) ;\
+ (c).i = S_MUL( (c).i , s ) ; }while(0)
+
+# define DIVSCALAR(x,k) \
+ (x) = S_MUL( x, (TWID_MAX-((k)>>1))/(k)+1 )
+
+# define C_FIXDIV(c,div) \
+ do { DIVSCALAR( (c).r , div); \
+ DIVSCALAR( (c).i , div); }while (0)
+
+#define C_ADD( res, a,b)\
+ do {(res).r=ADD32_ovflw((a).r,(b).r); (res).i=ADD32_ovflw((a).i,(b).i); \
+ }while(0)
+#define C_SUB( res, a,b)\
+ do {(res).r=SUB32_ovflw((a).r,(b).r); (res).i=SUB32_ovflw((a).i,(b).i); \
+ }while(0)
+#define C_ADDTO( res , a)\
+ do {(res).r = ADD32_ovflw((res).r, (a).r); (res).i = ADD32_ovflw((res).i,(a).i);\
+ }while(0)
+
+#define C_SUBFROM( res , a)\
+ do {(res).r = ADD32_ovflw((res).r,(a).r); (res).i = SUB32_ovflw((res).i,(a).i); \
+ }while(0)
+
+#if defined(OPUS_ARM_INLINE_ASM)
+#include "arm/kiss_fft_armv4.h"
+#endif
+
+#if defined(OPUS_ARM_INLINE_EDSP)
+#include "arm/kiss_fft_armv5e.h"
+#endif
+#if defined(MIPSr1_ASM)
+#include "mips/kiss_fft_mipsr1.h"
+#endif
+
+#else /* not FIXED_POINT*/
+
+# define S_MUL(a,b) ( (a)*(b) )
+#define C_MUL(m,a,b) \
+ do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
+ (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
+#define C_MULC(m,a,b) \
+ do{ (m).r = (a).r*(b).r + (a).i*(b).i;\
+ (m).i = (a).i*(b).r - (a).r*(b).i; }while(0)
+
+#define C_MUL4(m,a,b) C_MUL(m,a,b)
+
+# define C_FIXDIV(c,div) /* NOOP */
+# define C_MULBYSCALAR( c, s ) \
+ do{ (c).r *= (s);\
+ (c).i *= (s); }while(0)
+#endif
+
+#ifndef CHECK_OVERFLOW_OP
+# define CHECK_OVERFLOW_OP(a,op,b) /* noop */
+#endif
+
+#ifndef C_ADD
+#define C_ADD( res, a,b)\
+ do { \
+ CHECK_OVERFLOW_OP((a).r,+,(b).r)\
+ CHECK_OVERFLOW_OP((a).i,+,(b).i)\
+ (res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \
+ }while(0)
+#define C_SUB( res, a,b)\
+ do { \
+ CHECK_OVERFLOW_OP((a).r,-,(b).r)\
+ CHECK_OVERFLOW_OP((a).i,-,(b).i)\
+ (res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \
+ }while(0)
+#define C_ADDTO( res , a)\
+ do { \
+ CHECK_OVERFLOW_OP((res).r,+,(a).r)\
+ CHECK_OVERFLOW_OP((res).i,+,(a).i)\
+ (res).r += (a).r; (res).i += (a).i;\
+ }while(0)
+
+#define C_SUBFROM( res , a)\
+ do {\
+ CHECK_OVERFLOW_OP((res).r,-,(a).r)\
+ CHECK_OVERFLOW_OP((res).i,-,(a).i)\
+ (res).r -= (a).r; (res).i -= (a).i; \
+ }while(0)
+#endif /* C_ADD defined */
+
+#ifdef FIXED_POINT
+/*# define KISS_FFT_COS(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * cos (phase))))
+# define KISS_FFT_SIN(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * sin (phase))))*/
+# define KISS_FFT_COS(phase) floor(.5+TWID_MAX*cos (phase))
+# define KISS_FFT_SIN(phase) floor(.5+TWID_MAX*sin (phase))
+# define HALF_OF(x) ((x)>>1)
+#elif defined(USE_SIMD)
+# define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
+# define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
+# define HALF_OF(x) ((x)*_mm_set1_ps(.5f))
+#else
+# define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
+# define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)
+# define HALF_OF(x) ((x)*.5f)
+#endif
+
+#define kf_cexp(x,phase) \
+ do{ \
+ (x)->r = KISS_FFT_COS(phase);\
+ (x)->i = KISS_FFT_SIN(phase);\
+ }while(0)
+
+#define kf_cexp2(x,phase) \
+ do{ \
+ (x)->r = TRIG_UPSCALE*celt_cos_norm((phase));\
+ (x)->i = TRIG_UPSCALE*celt_cos_norm((phase)-32768);\
+}while(0)
+
+#endif /* KISS_FFT_GUTS_H */
diff --git a/dnn/arch.h b/dnn/arch.h
new file mode 100644
index 00000000..0f29b49f
--- /dev/null
+++ b/dnn/arch.h
@@ -0,0 +1,245 @@
+/* Copyright (c) 2003-2008 Jean-Marc Valin
+ Copyright (c) 2007-2008 CSIRO
+ Copyright (c) 2007-2009 Xiph.Org Foundation
+ Written by Jean-Marc Valin */
+/**
+ @file arch.h
+ @brief Various architecture definitions for CELT
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef ARCH_H
+#define ARCH_H
+
+#include "opus_types.h"
+#include "common.h"
+
+# if !defined(__GNUC_PREREQ)
+# if defined(__GNUC__)&&defined(__GNUC_MINOR__)
+# define __GNUC_PREREQ(_maj,_min) \
+ ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
+# else
+# define __GNUC_PREREQ(_maj,_min) 0
+# endif
+# endif
+
+#define CELT_SIG_SCALE 32768.f
+
+#define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__);
+#ifdef ENABLE_ASSERTIONS
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __GNUC__
+__attribute__((noreturn))
+#endif
+static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line)
+{
+ fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
+ abort();
+}
+#define celt_assert(cond) {if (!(cond)) {celt_fatal("assertion failed: " #cond);}}
+#define celt_assert2(cond, message) {if (!(cond)) {celt_fatal("assertion failed: " #cond "\n" message);}}
+#else
+#define celt_assert(cond)
+#define celt_assert2(cond, message)
+#endif
+
+#define IMUL32(a,b) ((a)*(b))
+
+#define MIN16(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 16-bit value. */
+#define MAX16(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 16-bit value. */
+#define MIN32(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 32-bit value. */
+#define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */
+#define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */
+#define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */
+#define UADD32(a,b) ((a)+(b))
+#define USUB32(a,b) ((a)-(b))
+
+/* Set this if opus_int64 is a native type of the CPU. */
+/* Assume that all LP64 architectures have fast 64-bit types; also x86_64
+ (which can be ILP32 for x32) and Win64 (which is LLP64). */
+#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
+#define OPUS_FAST_INT64 1
+#else
+#define OPUS_FAST_INT64 0
+#endif
+
+#define PRINT_MIPS(file)
+
+#ifdef FIXED_POINT
+
+typedef opus_int16 opus_val16;
+typedef opus_int32 opus_val32;
+typedef opus_int64 opus_val64;
+
+typedef opus_val32 celt_sig;
+typedef opus_val16 celt_norm;
+typedef opus_val32 celt_ener;
+
+#define Q15ONE 32767
+
+#define SIG_SHIFT 12
+/* Safe saturation value for 32-bit signals. Should be less than
+ 2^31*(1-0.85) to avoid blowing up on DC at deemphasis.*/
+#define SIG_SAT (300000000)
+
+#define NORM_SCALING 16384
+
+#define DB_SHIFT 10
+
+#define EPSILON 1
+#define VERY_SMALL 0
+#define VERY_LARGE16 ((opus_val16)32767)
+#define Q15_ONE ((opus_val16)32767)
+
+#define SCALEIN(a) (a)
+#define SCALEOUT(a) (a)
+
+#define ABS16(x) ((x) < 0 ? (-(x)) : (x))
+#define ABS32(x) ((x) < 0 ? (-(x)) : (x))
+
+static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
+ return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x;
+}
+
+#ifdef FIXED_DEBUG
+#include "fixed_debug.h"
+#else
+
+#include "fixed_generic.h"
+
+#ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR
+#include "arm/fixed_arm64.h"
+#elif OPUS_ARM_INLINE_EDSP
+#include "arm/fixed_armv5e.h"
+#elif defined (OPUS_ARM_INLINE_ASM)
+#include "arm/fixed_armv4.h"
+#elif defined (BFIN_ASM)
+#include "fixed_bfin.h"
+#elif defined (TI_C5X_ASM)
+#include "fixed_c5x.h"
+#elif defined (TI_C6X_ASM)
+#include "fixed_c6x.h"
+#endif
+
+#endif
+
+#else /* FIXED_POINT */
+
+typedef float opus_val16;
+typedef float opus_val32;
+typedef float opus_val64;
+
+typedef float celt_sig;
+typedef float celt_norm;
+typedef float celt_ener;
+
+#define Q15ONE 1.0f
+
+#define NORM_SCALING 1.f
+
+#define EPSILON 1e-15f
+#define VERY_SMALL 1e-30f
+#define VERY_LARGE16 1e15f
+#define Q15_ONE ((opus_val16)1.f)
+
+/* This appears to be the same speed as C99's fabsf() but it's more portable. */
+#define ABS16(x) ((float)fabs(x))
+#define ABS32(x) ((float)fabs(x))
+
+#define QCONST16(x,bits) (x)
+#define QCONST32(x,bits) (x)
+
+#define NEG16(x) (-(x))
+#define NEG32(x) (-(x))
+#define NEG32_ovflw(x) (-(x))
+#define EXTRACT16(x) (x)
+#define EXTEND32(x) (x)
+#define SHR16(a,shift) (a)
+#define SHL16(a,shift) (a)
+#define SHR32(a,shift) (a)
+#define SHL32(a,shift) (a)
+#define PSHR32(a,shift) (a)
+#define VSHR32(a,shift) (a)
+
+#define PSHR(a,shift) (a)
+#define SHR(a,shift) (a)
+#define SHL(a,shift) (a)
+#define SATURATE(x,a) (x)
+#define SATURATE16(x) (x)
+
+#define ROUND16(a,shift) (a)
+#define SROUND16(a,shift) (a)
+#define HALF16(x) (.5f*(x))
+#define HALF32(x) (.5f*(x))
+
+#define ADD16(a,b) ((a)+(b))
+#define SUB16(a,b) ((a)-(b))
+#define ADD32(a,b) ((a)+(b))
+#define SUB32(a,b) ((a)-(b))
+#define ADD32_ovflw(a,b) ((a)+(b))
+#define SUB32_ovflw(a,b) ((a)-(b))
+#define MULT16_16_16(a,b) ((a)*(b))
+#define MULT16_16(a,b) ((opus_val32)(a)*(opus_val32)(b))
+#define MAC16_16(c,a,b) ((c)+(opus_val32)(a)*(opus_val32)(b))
+
+#define MULT16_32_Q15(a,b) ((a)*(b))
+#define MULT16_32_Q16(a,b) ((a)*(b))
+
+#define MULT32_32_Q31(a,b) ((a)*(b))
+
+#define MAC16_32_Q15(c,a,b) ((c)+(a)*(b))
+#define MAC16_32_Q16(c,a,b) ((c)+(a)*(b))
+
+#define MULT16_16_Q11_32(a,b) ((a)*(b))
+#define MULT16_16_Q11(a,b) ((a)*(b))
+#define MULT16_16_Q13(a,b) ((a)*(b))
+#define MULT16_16_Q14(a,b) ((a)*(b))
+#define MULT16_16_Q15(a,b) ((a)*(b))
+#define MULT16_16_P15(a,b) ((a)*(b))
+#define MULT16_16_P13(a,b) ((a)*(b))
+#define MULT16_16_P14(a,b) ((a)*(b))
+#define MULT16_32_P16(a,b) ((a)*(b))
+
+#define DIV32_16(a,b) (((opus_val32)(a))/(opus_val16)(b))
+#define DIV32(a,b) (((opus_val32)(a))/(opus_val32)(b))
+
+#define SCALEIN(a) ((a)*CELT_SIG_SCALE)
+#define SCALEOUT(a) ((a)*(1/CELT_SIG_SCALE))
+
+#define SIG2WORD16(x) (x)
+
+#endif /* !FIXED_POINT */
+
+#ifndef GLOBAL_STACK_SIZE
+#ifdef FIXED_POINT
+#define GLOBAL_STACK_SIZE 120000
+#else
+#define GLOBAL_STACK_SIZE 120000
+#endif
+#endif
+
+#endif /* ARCH_H */
diff --git a/dnn/autogen.sh b/dnn/autogen.sh
new file mode 100755
index 00000000..a82d6719
--- /dev/null
+++ b/dnn/autogen.sh
@@ -0,0 +1,14 @@
#!/bin/sh
# Run this to set up the build system: configure, makefiles, etc.
set -e

# Quote "$0" so source trees whose path contains spaces still work.
srcdir=$(dirname "$0")
test -n "$srcdir" && cd "$srcdir"

# SHA1 of the first commit compatible with the current model
commit=2ddc476
./download_model.sh "$commit"

echo "Updating build configuration files for lpcnet, please wait...."

autoreconf -isf
diff --git a/dnn/burg.c b/dnn/burg.c
new file mode 100644
index 00000000..643a0d30
--- /dev/null
+++ b/dnn/burg.c
@@ -0,0 +1,245 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+
+#include "burg.h"
+
+#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384*/
+#define SILK_MAX_ORDER_LPC 16
+#define FIND_LPC_COND_FAC 1e-5f
+
/* Sum of squares of a float array, accumulated in double precision.
 * The 4-way unrolled grouping is preserved so results are bit-identical. */
static double silk_energy_FLP(
    const float *data,
    int dataSize
)
{
    double acc = 0.0;
    int k = 0;

    /* Main loop: four squared samples per iteration. */
    while (k < dataSize - 3) {
        acc += data[ k + 0 ] * (double)data[ k + 0 ] +
               data[ k + 1 ] * (double)data[ k + 1 ] +
               data[ k + 2 ] * (double)data[ k + 2 ] +
               data[ k + 3 ] * (double)data[ k + 3 ];
        k += 4;
    }

    /* Tail: at most three remaining samples. */
    while (k < dataSize) {
        acc += data[ k ] * (double)data[ k ];
        k++;
    }

    assert( acc >= 0.0 );
    return acc;
}
+
/* Inner product of two float arrays, accumulated in double precision.
 * The 4-way unrolled grouping is preserved so results are bit-identical. */
static double silk_inner_product_FLP(
    const float *data1,
    const float *data2,
    int dataSize
)
{
    double acc = 0.0;
    int k = 0;

    /* Main loop: four products per iteration. */
    while (k < dataSize - 3) {
        acc += data1[ k + 0 ] * (double)data2[ k + 0 ] +
               data1[ k + 1 ] * (double)data2[ k + 1 ] +
               data1[ k + 2 ] * (double)data2[ k + 2 ] +
               data1[ k + 3 ] * (double)data2[ k + 3 ];
        k += 4;
    }

    /* Tail: at most three remaining products. */
    while (k < dataSize) {
        acc += data1[ k ] * (double)data2[ k ];
        k++;
    }

    return acc;
}
+
+
/* Compute reflection coefficients from input signal */
/*
 * Burg's method LPC analysis (float port of SILK's silk_burg_modified).
 * Estimates an order-D all-pole model over nb_subfr subframes stacked in x,
 * capping the prediction gain at 1/minInvGain.  Writes the D prediction
 * coefficients to A[] (negated Af, i.e. the conventional sign) and returns
 * the (approximate) residual energy.
 * NOTE(review): numerically sensitive recursion kept byte-identical;
 * comments only.
 */
float silk_burg_analysis( /* O returns residual energy */
    float A[], /* O prediction coefficients (length order) */
    const float x[], /* I input signal, length: nb_subfr*(D+L_sub) */
    const float minInvGain, /* I minimum inverse prediction gain */
    const int subfr_length, /* I input signal subframe length (incl. D preceding samples) */
    const int nb_subfr, /* I number of subframes stacked in x */
    const int D /* I order */
)
{
    int k, n, s, reached_max_gain;
    double C0, invGain, num, nrg_f, nrg_b, rc, Atmp, tmp1, tmp2;
    const float *x_ptr;
    double C_first_row[ SILK_MAX_ORDER_LPC ], C_last_row[ SILK_MAX_ORDER_LPC ];
    double CAf[ SILK_MAX_ORDER_LPC + 1 ], CAb[ SILK_MAX_ORDER_LPC + 1 ];
    double Af[ SILK_MAX_ORDER_LPC ];

    assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

    /* Compute autocorrelations, added over subframes */
    C0 = silk_energy_FLP( x, nb_subfr * subfr_length );
    memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( double ) );
    for( s = 0; s < nb_subfr; s++ ) {
        x_ptr = x + s * subfr_length;
        for( n = 1; n < D + 1; n++ ) {
            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n );
        }
    }
    memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) );

    /* Initialize */
    /* The FIND_LPC_COND_FAC term regularizes the correlation (white-noise
       correction), conditioning the recursion for near-singular inputs. */
    CAb[ 0 ] = CAf[ 0 ] = C0 + FIND_LPC_COND_FAC * C0 + 1e-9f;
    invGain = 1.0f;
    reached_max_gain = 0;
    for( n = 0; n < D; n++ ) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        for( s = 0; s < nb_subfr; s++ ) {
            x_ptr = x + s * subfr_length;
            tmp1 = x_ptr[ n ];
            tmp2 = x_ptr[ subfr_length - n - 1 ];
            for( k = 0; k < n; k++ ) {
                C_first_row[ k ] -= x_ptr[ n ] * x_ptr[ n - k - 1 ];
                C_last_row[ k ] -= x_ptr[ subfr_length - n - 1 ] * x_ptr[ subfr_length - n + k ];
                Atmp = Af[ k ];
                tmp1 += x_ptr[ n - k - 1 ] * Atmp;
                tmp2 += x_ptr[ subfr_length - n + k ] * Atmp;
            }
            for( k = 0; k <= n; k++ ) {
                CAf[ k ] -= tmp1 * x_ptr[ n - k ];
                CAb[ k ] -= tmp2 * x_ptr[ subfr_length - n + k - 1 ];
            }
        }
        tmp1 = C_first_row[ n ];
        tmp2 = C_last_row[ n ];
        for( k = 0; k < n; k++ ) {
            Atmp = Af[ k ];
            tmp1 += C_last_row[ n - k - 1 ] * Atmp;
            tmp2 += C_first_row[ n - k - 1 ] * Atmp;
        }
        CAf[ n + 1 ] = tmp1;
        CAb[ n + 1 ] = tmp2;

        /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
        num = CAb[ n + 1 ];
        nrg_b = CAb[ 0 ];
        nrg_f = CAf[ 0 ];
        for( k = 0; k < n; k++ ) {
            Atmp = Af[ k ];
            num += CAb[ n - k ] * Atmp;
            nrg_b += CAb[ k + 1 ] * Atmp;
            nrg_f += CAf[ k + 1 ] * Atmp;
        }
        assert( nrg_f > 0.0 );
        assert( nrg_b > 0.0 );

        /* Calculate the next order reflection (parcor) coefficient */
        rc = -2.0 * num / ( nrg_f + nrg_b );
        assert( rc > -1.0 && rc < 1.0 );

        /* Update inverse prediction gain */
        tmp1 = invGain * ( 1.0 - rc * rc );
        if( tmp1 <= minInvGain ) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            rc = sqrt( 1.0 - minInvGain / invGain );
            if( num > 0 ) {
                /* Ensure adjusted reflection coefficients has the original sign */
                rc = -rc;
            }
            invGain = minInvGain;
            reached_max_gain = 1;
        } else {
            invGain = tmp1;
        }

        /* Update the AR coefficients */
        for( k = 0; k < (n + 1) >> 1; k++ ) {
            tmp1 = Af[ k ];
            tmp2 = Af[ n - k - 1 ];
            Af[ k ] = tmp1 + rc * tmp2;
            Af[ n - k - 1 ] = tmp2 + rc * tmp1;
        }
        Af[ n ] = rc;

        if( reached_max_gain ) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for( k = n + 1; k < D; k++ ) {
                Af[ k ] = 0.0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for( k = 0; k <= n + 1; k++ ) {
            tmp1 = CAf[ k ];
            CAf[ k ] += rc * CAb[ n - k + 1 ];
            CAb[ n - k + 1 ] += rc * tmp1;
        }
    }

    if( reached_max_gain ) {
        /* Convert to float */
        for( k = 0; k < D; k++ ) {
            A[ k ] = (float)( -Af[ k ] );
        }
        /* Subtract energy of preceding samples from C0 */
        for( s = 0; s < nb_subfr; s++ ) {
            C0 -= silk_energy_FLP( x + s * subfr_length, D );
        }
        /* Approximate residual energy */
        nrg_f = C0 * invGain;
    } else {
        /* Compute residual energy and store coefficients as float */
        nrg_f = CAf[ 0 ];
        tmp1 = 1.0;
        for( k = 0; k < D; k++ ) {
            Atmp = Af[ k ];
            nrg_f += CAf[ k + 1 ] * Atmp;
            tmp1 += Atmp * Atmp;
            A[ k ] = (float)(-Atmp);
        }
        nrg_f -= FIND_LPC_COND_FAC * C0 * tmp1;
    }

    /* Return residual energy */
    return (float)nrg_f;
}
diff --git a/dnn/burg.h b/dnn/burg.h
new file mode 100644
index 00000000..f5998d2f
--- /dev/null
+++ b/dnn/burg.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
#ifndef BURG_H
#define BURG_H


/*
 * Burg's method LPC analysis (implemented in burg.c).  Computes order-D
 * prediction coefficients from nb_subfr subframes stacked in x, limiting
 * the prediction gain to 1/minInvGain, and returns the residual energy.
 */
float silk_burg_analysis( /* O returns residual energy */
    float A[], /* O prediction coefficients (length order) */
    const float x[], /* I input signal, length: nb_subfr*(D+L_sub) */
    const float minInvGain, /* I minimum inverse prediction gain */
    const int subfr_length, /* I input signal subframe length (incl. D preceding samples) */
    const int nb_subfr, /* I number of subframes stacked in x */
    const int D /* I order */
);

#endif
diff --git a/dnn/ceps_vq_train.c b/dnn/ceps_vq_train.c
new file mode 100644
index 00000000..c8e52751
--- /dev/null
+++ b/dnn/ceps_vq_train.c
@@ -0,0 +1,619 @@
+
+#include <valgrind/memcheck.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#define COEF 0.0f
+#define MAX_ENTRIES 16384
+
+#define MULTI 4
+#define MULTI_MASK (MULTI-1)
+
/* Derive perceptual weights from a parameter vector: each weight is the
   inverse (with a small floor) of the distance to the closest neighbouring
   component, with extra emphasis on the first two components.
   Assumes x is sorted ascending in (0, pi), LSF-style — values outside
   that range yield negative distances. */
void compute_weights(const float *x, float *w, int ndim)
{
  int k;
  /* Closest-neighbour distance for every component. */
  w[0] = MIN(x[0], x[1]-x[0]);
  for (k=1;k<ndim-1;k++)
  {
    float dl = x[k]-x[k-1];
    float dr = x[k+1]-x[k];
    w[k] = MIN(dl, dr);
  }
  w[ndim-1] = MIN(x[ndim-1]-x[ndim-2], M_PI-x[ndim-1]);

  /* Invert with a .01 floor, then boost the low-order weights. */
  for (k=0;k<ndim;k++)
    w[k] = 1./(.01+w[k]);
  w[0] *= 3;
  w[1] *= 2;
}
+
/* Return the index of the codebook entry closest (squared L2 distance) to x.
 * codebook holds nb_entries rows of ndim floats.  If dist is non-NULL the
 * minimum squared distance is written there. */
int find_nearest(const float *codebook, int nb_entries, const float *x, int ndim, float *dist)
{
  int i, j;
  float min_dist = 1e15;
  int nearest = 0;

  for (i=0;i<nb_entries;i++)
  {
    /* "d" (was "dist") — renamed to stop shadowing the output parameter */
    float d=0;
    for (j=0;j<ndim;j++)
      d += (x[j]-codebook[i*ndim+j])*(x[j]-codebook[i*ndim+j]);
    if (d<min_dist)
    {
      min_dist = d;
      nearest = i;
    }
  }
  if (dist)
    *dist = min_dist;
  return nearest;
}
+
/* Multi-predictor nearest search: entry i is compared against the input
 * slice of x selected by (i & MULTI_MASK).  When sign is non-zero the
 * negated codebook entries are searched as well; a match on a negated
 * entry returns i+nb_entries.  If dist is non-NULL the minimum squared
 * distance is written there. */
int find_nearest_multi(const float *codebook, int nb_entries, const float *x, int ndim, float *dist, int sign)
{
  int i, j;
  float min_dist = 1e15;
  int nearest = 0;

  for (i=0;i<nb_entries;i++)
  {
    int offset;
    /* "d" (was "dist") — renamed to stop shadowing the output parameter */
    float d=0;
    offset = (i&MULTI_MASK)*ndim;
    for (j=0;j<ndim;j++)
      d += (x[offset+j]-codebook[i*ndim+j])*(x[offset+j]-codebook[i*ndim+j]);
    if (d<min_dist)
    {
      min_dist = d;
      nearest = i;
    }
  }
  if (sign) {
    /* Second pass over the sign-flipped codebook. */
    for (i=0;i<nb_entries;i++)
    {
      int offset;
      float d=0;
      offset = (i&MULTI_MASK)*ndim;
      for (j=0;j<ndim;j++)
        d += (x[offset+j]+codebook[i*ndim+j])*(x[offset+j]+codebook[i*ndim+j]);
      if (d<min_dist)
      {
        min_dist = d;
        nearest = i+nb_entries;
      }
    }
  }
  if (dist)
    *dist = min_dist;
  return nearest;
}
+
+
/* Weighted nearest-neighbour search: return the index of the codebook
   entry minimising the w-weighted squared error against x. */
int find_nearest_weighted(const float *codebook, int nb_entries, float *x, const float *w, int ndim)
{
  int best = 0;
  float best_dist = 1e15;
  int i;

  for (i=0;i<nb_entries;i++)
  {
    const float *row = &codebook[i*ndim];
    float d = 0;
    int j;
    for (j=0;j<ndim;j++)
    {
      float e = x[j]-row[j];
      d += w[j]*e*e;
    }
    if (d < best_dist)
    {
      best_dist = d;
      best = i;
    }
  }
  return best;
}
+
/*
 * Multi-stage LSP quantizer: stage 1 quantizes the prediction error with
 * codebook1; the residual is then split into even/odd components and
 * quantized with codebook2/codebook3 using perceptual weights.
 * xq is both the predictor state (input) and the reconstruction (output).
 * Always returns 0.
 */
int quantize_lsp(const float *x, const float *codebook1, const float *codebook2,
                 const float *codebook3, int nb_entries, float *xq, int ndim)
{
  int i, n1, n2, n3;
  /* C99 VLAs sized by ndim. */
  float err[ndim], err2[ndim], err3[ndim];
  float w[ndim], w2[ndim], w3[ndim];

  /* NOTE(review): these weights are immediately overwritten by the
     compute_weights() call below — this block is redundant. */
  w[0] = MIN(x[0], x[1]-x[0]);
  for (i=1;i<ndim-1;i++)
    w[i] = MIN(x[i]-x[i-1], x[i+1]-x[i]);
  w[ndim-1] = MIN(x[ndim-1]-x[ndim-2], M_PI-x[ndim-1]);

  /*
  for (i=0;i<ndim;i++)
    w[i] = 1./(.003+w[i]);
  w[0]*=3;
  w[1]*=2;*/
  compute_weights(x, w, ndim);

  /* Stage 1: quantize the (COEF-scaled) prediction error. */
  for (i=0;i<ndim;i++)
    err[i] = x[i]-COEF*xq[i];
  n1 = find_nearest(codebook1, nb_entries, err, ndim, NULL);

  for (i=0;i<ndim;i++)
  {
    xq[i] = COEF*xq[i] + codebook1[ndim*n1+i];
    err[i] -= codebook1[ndim*n1+i];
  }
  /* Split residual and weights into even/odd halves. */
  for (i=0;i<ndim/2;i++)
  {
    err2[i] = err[2*i];
    err3[i] = err[2*i+1];
    w2[i] = w[2*i];
    w3[i] = w[2*i+1];
  }
  n2 = find_nearest_weighted(codebook2, nb_entries, err2, w2, ndim/2);
  n3 = find_nearest_weighted(codebook3, nb_entries, err3, w3, ndim/2);

  for (i=0;i<ndim/2;i++)
  {
    /* NOTE(review): "ndim*n2/2" parses as (ndim*n2)/2, which equals the
       presumably-intended n2*(ndim/2) only when ndim is even — confirm
       ndim is always even at the call sites. */
    xq[2*i] += codebook2[ndim*n2/2+i];
    xq[2*i+1] += codebook3[ndim*n3/2+i];
  }
  return 0;
}
+
/* Double the codebook: each of the first nb_entries entries is perturbed
   by a small random delta and a mirrored copy (entry - delta) is written
   to the second half.  The caller must provide room for 2*nb_entries. */
void split(float *codebook, int nb_entries, int ndim)
{
  int e, d;
  for (e=0;e<nb_entries;e++)
  {
    for (d=0;d<ndim;d++)
    {
      float delta = .01*(rand()/(float)RAND_MAX-.5);
      float *cell = &codebook[e*ndim+d];
      *cell += delta;
      codebook[(e+nb_entries)*ndim+d] = *cell - delta;
    }
  }
}
+
+
/* Grow the codebook by one entry: locate the cell with the largest total
   distortion and split it into two slightly perturbed copies (the new one
   is appended at index nb_entries). */
void split1(float *codebook, int nb_entries, const float *data, int nb_vectors, int ndim)
{
  int v, e, j;
  int nearest[nb_vectors];
  float dist[nb_entries];
  int count[nb_entries];
  int worst;

  for (e=0;e<nb_entries;e++)
  {
    dist[e] = 0;
    count[e] = 0;
  }
  /* Accumulate per-cell distortion and occupancy. */
  for (v=0;v<nb_vectors;v++)
  {
    float d;
    nearest[v] = find_nearest(codebook, nb_entries, data+v*ndim, ndim, &d);
    dist[nearest[v]] += d;
    count[nearest[v]]++;
  }

  /* Pick the cell with the highest accumulated distortion. */
  worst = 0;
  for (e=1;e<nb_entries;e++)
  {
    if (dist[e] > dist[worst])
      worst = e;
  }

  /* Split it: perturb in place, append the mirrored copy at the end. */
  for (j=0;j<ndim;j++)
  {
    float delta = .001*(rand()/(float)RAND_MAX-.5);
    codebook[worst*ndim+j] += delta;
    codebook[nb_entries*ndim+j] = codebook[worst*ndim+j] - delta;
  }
}
+
+
+
/* One LBG iteration: assign each training vector to its nearest codebook
 * entry, report the RMS error on stdout and occupancy stats on stderr,
 * then move every entry to the centroid of its assigned vectors.
 * Fix: entries that received no vectors are left at zero instead of being
 * scaled by 1./0 (which produced NaN codebook entries). */
void update(float *data, int nb_vectors, float *codebook, int nb_entries, int ndim)
{
  int i,j;
  int count[nb_entries];
  int nearest[nb_vectors];
  double err=0;

  for (i=0;i<nb_entries;i++)
    count[i] = 0;

  /* Assignment step + distortion accounting. */
  for (i=0;i<nb_vectors;i++)
  {
    float dist;
    nearest[i] = find_nearest(codebook, nb_entries, data+i*ndim, ndim, &dist);
    err += dist;
  }
  printf("RMS error = %f\n", sqrt(err/nb_vectors/ndim));
  for (i=0;i<nb_entries*ndim;i++)
    codebook[i] = 0;

  /* Centroid accumulation. */
  for (i=0;i<nb_vectors;i++)
  {
    int n = nearest[i];
    count[n]++;
    for (j=0;j<ndim;j++)
      codebook[n*ndim+j] += data[i*ndim+j];
  }

  float w2=0;
  int min_count = 1000000000;
  int small=0;
  for (i=0;i<nb_entries;i++)
  {
    /* Guard empty cells: avoid 1./0 -> NaN entries. */
    if (count[i] > 0)
    {
      for (j=0;j<ndim;j++)
        codebook[i*ndim+j] *= (1./count[i]);
    }
    w2 += (count[i]/(float)nb_vectors)*(count[i]/(float)nb_vectors);
    if (count[i] < min_count) min_count = count[i];
    small += (count[i] < 50);
  }
  fprintf(stderr, "%f / %d, min = %d, small=%d\n", 1./w2, nb_entries, min_count, small);
}
+
/* One LBG iteration for the multi-predictor codebook (see
 * find_nearest_multi): assignments may select a sign-flipped entry, in
 * which case the data contributes negated to the centroid.
 * Fixes: the local accumulation sign no longer shadows the "sign"
 * parameter, and empty cells are no longer scaled by 1./0 (NaN). */
void update_multi(float *data, int nb_vectors, float *codebook, int nb_entries, int ndim, int sign)
{
  int i,j;
  int count[nb_entries];
  int idcount[8]={0};
  int nearest[nb_vectors];
  double err=0;

  for (i=0;i<nb_entries;i++)
    count[i] = 0;

  for (i=0;i<nb_vectors;i++)
  {
    float dist;
    nearest[i] = find_nearest_multi(codebook, nb_entries, data+MULTI*i*ndim, ndim, &dist, sign);
    err += dist;
  }
  printf("RMS error = %f\n", sqrt(err/nb_vectors/ndim));
  for (i=0;i<nb_entries*ndim;i++)
    codebook[i] = 0;

  for (i=0;i<nb_vectors;i++)
  {
    int n = nearest[i] % nb_entries;
    /* "sgn" (was "sign") — renamed to stop shadowing the parameter. */
    float sgn = nearest[i] < nb_entries ? 1 : -1;
    count[n]++;
    idcount[(n&MULTI_MASK) + 4*(sgn!=1)]++;
    for (j=0;j<ndim;j++)
      codebook[n*ndim+j] += sgn*data[(MULTI*i + (n&MULTI_MASK))*ndim+j];
  }

  float w2=0;
  int min_count = 1000000000;
  int small=0;
  for (i=0;i<nb_entries;i++)
  {
    /* Guard empty cells: avoid 1./0 -> NaN entries. */
    if (count[i] > 0)
    {
      for (j=0;j<ndim;j++)
        codebook[i*ndim+j] *= (1./count[i]);
    }
    w2 += (count[i]/(float)nb_vectors)*(count[i]/(float)nb_vectors);
    if (count[i] < min_count) min_count = count[i];
    small += (count[i] < 50);
  }
  fprintf(stderr, "%d %d %d %d %d %d %d %d ", idcount[0], idcount[1], idcount[2], idcount[3], idcount[4], idcount[5], idcount[6], idcount[7]);
  fprintf(stderr, "| %f / %d, min = %d, small=%d\n", 1./w2, nb_entries, min_count, small);
}
+
+
+void update_weighted(float *data, float *weight, int nb_vectors, float *codebook, int nb_entries, int ndim)
+{
+ int i,j;
+ float count[MAX_ENTRIES][ndim];
+ int nearest[nb_vectors];
+
+ for (i=0;i<nb_entries;i++)
+ for (j=0;j<ndim;j++)
+ count[i][j] = 0;
+
+ for (i=0;i<nb_vectors;i++)
+ {
+ nearest[i] = find_nearest_weighted(codebook, nb_entries, data+i*ndim, weight+i*ndim, ndim);
+ }
+ for (i=0;i<nb_entries*ndim;i++)
+ codebook[i] = 0;
+
+ for (i=0;i<nb_vectors;i++)
+ {
+ int n = nearest[i];
+ for (j=0;j<ndim;j++)
+ {
+ float w = sqrt(weight[i*ndim+j]);
+ count[n][j]+=w;
+ codebook[n*ndim+j] += w*data[i*ndim+j];
+ }
+ }
+
+ //float w2=0;
+ for (i=0;i<nb_entries;i++)
+ {
+ for (j=0;j<ndim;j++)
+ codebook[i*ndim+j] *= (1./count[i][j]);
+ //w2 += (count[i]/(float)nb_vectors)*(count[i]/(float)nb_vectors);
+ }
+ //fprintf(stderr, "%f / %d\n", 1./w2, nb_entries);
+}
+
/* Train a codebook with the classic split-and-refine (LBG) procedure:
   start from the global centroid, repeatedly double the codebook and run
   four update passes, then polish with twenty final passes. */
void vq_train(float *data, int nb_vectors, float *codebook, int nb_entries, int ndim)
{
  int v, d, it, size;

  /* Seed: single entry at the mean of the training data. */
  for (d=0;d<ndim;d++)
    codebook[d] = 0;
  for (v=0;v<nb_vectors;v++)
    for (d=0;d<ndim;d++)
      codebook[d] += data[v*ndim+d];
  for (d=0;d<ndim;d++)
    codebook[d] *= (1./nb_vectors);


  size = 1;
  while (size < nb_entries)
  {
#if 1
    split(codebook, size, ndim);
    size <<= 1;
#else
    split1(codebook, size, data, nb_vectors, ndim);
    size++;
#endif
    fprintf(stderr, "%d\n", size);
    for (it=0;it<4;it++)
      update(data, nb_vectors, codebook, size, ndim);
  }
  /* Final refinement passes. */
  for (it=0;it<20;it++)
    update(data, nb_vectors, codebook, size, ndim);
}
+
+void vq_train_multi(float *data, int nb_vectors, float *codebook, int nb_entries, int ndim, int sign)
+{
+ int i, j, e;
+#if 1
+ for (e=0;e<MULTI;e++) {
+ for (j=0;j<ndim;j++)
+ codebook[e*ndim+j] = 0;
+ for (i=0;i<nb_vectors;i++)
+ for (j=0;j<ndim;j++)
+ codebook[e*ndim+j] += data[(MULTI*i+e)*ndim+j];
+ for (j=0;j<ndim;j++) {
+ float delta = .01*(rand()/(float)RAND_MAX-.5);
+ codebook[e*ndim+j] *= (1./nb_vectors);
+ codebook[e*ndim+j] += delta;
+ }
+ }
+#else
+ for (i=0;i<MULTI*ndim;i++) codebook[i] = .01*(rand()/(float)RAND_MAX-.5);
+#endif
+ e = MULTI;
+ for (j=0;j<10;j++)
+ update_multi(data, nb_vectors, codebook, e, ndim, sign);
+
+ while (e < nb_entries)
+ {
+ split(codebook, e, ndim);
+ e<<=1;
+ fprintf(stderr, "%d\n", e);
+ for (j=0;j<4;j++)
+ update_multi(data, nb_vectors, codebook, e, ndim, sign);
+ }
+ for (j=0;j<20;j++)
+ update_multi(data, nb_vectors, codebook, e, ndim, sign);
+}
+
+
/*
 * Train a codebook under weighted distortion, growing one entry at a time
 * via split1 (worst-cell splitting) and refining with update_weighted.
 */
void vq_train_weighted(float *data, float *weight, int nb_vectors, float *codebook, int nb_entries, int ndim)
{
  int i, j, e;
  e = 1;
  /* Seed: single entry at the mean of the training data. */
  for (j=0;j<ndim;j++)
    codebook[j] = 0;
  for (i=0;i<nb_vectors;i++)
    for (j=0;j<ndim;j++)
      codebook[j] += data[i*ndim+j];
  for (j=0;j<ndim;j++)
    codebook[j] *= (1./nb_vectors);


  while (e< nb_entries)
  {
#if 0
    split(codebook, e, ndim);
    e<<=1;
#else
    split1(codebook, e, data, nb_vectors, ndim);
    e++;
#endif
    fprintf(stderr, "%d\n", e);
    /* NOTE(review): this runs ndim refinement passes per split, whereas
       vq_train uses a fixed 4 — possibly "j<4" was intended; confirm
       before changing. */
    for (j=0;j<ndim;j++)
      update_weighted(data, weight, nb_vectors, codebook, e, ndim);
  }
}
+
+
/*
 * Cepstrum VQ training tool.
 * Usage: ceps_vq_train <ndim> <total_dim> <nb_vectors> <log2(nb_entries)>
 * Reads float feature vectors from stdin, trains a 3-stage cepstral VQ
 * plus two "diff" codebooks for interpolated frames, and writes them all
 * out as C source (ceps_codebooks.c).
 * NOTE(review): argv is used without checking argc, and allocations are
 * never freed — acceptable for a one-shot offline tool (the OS reclaims
 * on exit), but worth guarding argc.
 */
int main(int argc, char **argv)
{
  int i,j;
  int nb_vectors, nb_entries, nb_entries1, nb_entries2a, nb_entries2b, ndim, ndim0, total_dim;
  float *data, *pred, *multi_data, *multi_data2, *qdata;
  float *codebook, *codebook2, *codebook3, *codebook_diff2, *codebook_diff4;
  float *delta;
  double err;
  FILE *fout;

  ndim = atoi(argv[1]);
  ndim0 = ndim-1;
  total_dim = atoi(argv[2]);
  nb_vectors = atoi(argv[3]);
  nb_entries = 1<<atoi(argv[4]);
  nb_entries1 = 1024;
  nb_entries2a = 4096;
  nb_entries2b = 64;

  data = malloc((nb_vectors*ndim+total_dim)*sizeof(*data));
  qdata = malloc((nb_vectors*ndim+total_dim)*sizeof(*qdata));
  pred = malloc(nb_vectors*ndim0*sizeof(*pred));
  multi_data = malloc(MULTI*nb_vectors*ndim*sizeof(*multi_data));
  multi_data2 = malloc(MULTI*nb_vectors*ndim*sizeof(*multi_data));
  codebook = malloc(nb_entries*ndim0*sizeof(*codebook));
  codebook2 = malloc(nb_entries1*ndim0*sizeof(*codebook2));
  codebook3 = malloc(nb_entries1*ndim0*sizeof(*codebook3));
  codebook_diff4 = malloc(nb_entries2a*ndim*sizeof(*codebook_diff4));
  codebook_diff2 = malloc(nb_entries2b*ndim*sizeof(*codebook_diff2));

  /* Read up to nb_vectors records of total_dim floats; only the first ndim
     of each record are used.  The fread return value is unchecked — EOF is
     detected via feof(), so a short final record may be read as garbage. */
  for (i=0;i<nb_vectors;i++)
  {
    fread(&data[i*ndim], sizeof(float), total_dim, stdin);
    if (feof(stdin))
      break;
  }
  nb_vectors = i;
  /* NOTE(review): VALGRIND_CHECK_MEM_IS_DEFINED takes a size in *bytes*;
     this passes an element count, and uses nb_entries where nb_vectors
     looks intended — confirm before relying on these checks. */
  VALGRIND_CHECK_MEM_IS_DEFINED(data, nb_entries*ndim);

  /* Build stage-1 prediction errors (ndim0 = ndim-1: the first component
     of each record is passed through unquantized). */
  for (i=0;i<4;i++)
  {
    for (j=0;j<ndim0;j++)
      pred[i*ndim0+j] = 0;
  }
  for (i=4;i<nb_vectors;i++)
  {
    for (j=0;j<ndim0;j++)
      pred[i*ndim0+j] = data[i*ndim+j+1] - COEF*data[(i-4)*ndim+j+1];
  }
#if 1
  /* NOTE(review): same element-count-vs-bytes concern as above. */
  VALGRIND_CHECK_MEM_IS_DEFINED(pred, nb_entries*ndim0);
  vq_train(pred, nb_vectors, codebook, nb_entries, ndim0);

  /* Stage 1 quantization; delta holds the residual for stage 2. */
  delta = malloc(nb_vectors*ndim0*sizeof(*data));
  err = 0;
  for (i=0;i<nb_vectors;i++)
  {
    int nearest = find_nearest(codebook, nb_entries, &pred[i*ndim0], ndim0, NULL);
    qdata[i*ndim] = data[i*ndim];
    for (j=0;j<ndim0;j++)
    {
      qdata[i*ndim+j+1] = codebook[nearest*ndim0+j];
      delta[i*ndim0+j] = pred[i*ndim0+j] - codebook[nearest*ndim0+j];
      err += delta[i*ndim0+j]*delta[i*ndim0+j];
    }
    //printf("\n");
  }
  fprintf(stderr, "Cepstrum RMS error: %f\n", sqrt(err/nb_vectors/ndim));

  /* Stage 2: quantize the residual. */
  vq_train(delta, nb_vectors, codebook2, nb_entries1, ndim0);

  err=0;
  for (i=0;i<nb_vectors;i++)
  {
    int n1;
    n1 = find_nearest(codebook2, nb_entries1, &delta[i*ndim0], ndim0, NULL);
    for (j=0;j<ndim0;j++)
    {
      qdata[i*ndim+j+1] += codebook2[n1*ndim0+j];
      //delta[i*ndim0+j] = delta[i*ndim0+j] - codebook2[n1*ndim0+j];
      delta[i*ndim0+j] = data[i*ndim+j+1] - qdata[i*ndim+j+1];
      err += delta[i*ndim0+j]*delta[i*ndim0+j];
    }
  }
  fprintf(stderr, "Cepstrum RMS error after stage 2: %f)\n", sqrt(err/nb_vectors/ndim));

  /* Stage 3: quantize the remaining residual. */
  vq_train(delta, nb_vectors, codebook3, nb_entries1, ndim0);
  err=0;
  for (i=0;i<nb_vectors;i++)
  {
    int n1;
    n1 = find_nearest(codebook3, nb_entries1, &delta[i*ndim0], ndim0, NULL);
    for (j=0;j<ndim0;j++)
    {
      qdata[i*ndim+j+1] += codebook3[n1*ndim0+j];
      //delta[i*ndim0+j] = delta[i*ndim0+j] - codebook2[n1*ndim0+j];
      delta[i*ndim0+j] = data[i*ndim+j+1] - qdata[i*ndim+j+1];
      err += delta[i*ndim0+j]*delta[i*ndim0+j];
    }
  }
  fprintf(stderr, "Cepstrum RMS error after stage 3: %f)\n", sqrt(err/nb_vectors/ndim));
#else
  /* NOTE(review): dead branch (#if 1 above); as written it would leak the
     qdata buffer allocated earlier. */
  qdata = data;
#endif
  /* Build the interpolation residual training sets: for each frame, four
     candidate predictors from the quantized neighbours (average, average,
     previous, next) at distance 1 (multi_data) and 2 (multi_data2). */
  for (i=0;i<nb_vectors-4;i++)
  {
    for (j=0;j<ndim;j++)
      multi_data[MULTI*i*ndim+j] = data[(i+1)*ndim+j] - .5*(qdata[i*ndim+j]+qdata[(i+2)*ndim+j]);
    for (j=0;j<ndim;j++)
      multi_data[(MULTI*i+1)*ndim+j] = data[(i+1)*ndim+j] - .5*(qdata[i*ndim+j]+qdata[(i+2)*ndim+j]);
    for (j=0;j<ndim;j++)
      multi_data[(MULTI*i+2)*ndim+j] = data[(i+1)*ndim+j] - qdata[i*ndim+j];
    for (j=0;j<ndim;j++)
      multi_data[(MULTI*i+3)*ndim+j] = data[(i+1)*ndim+j] - qdata[(i+2)*ndim+j];
    //for (j=0;j<4*ndim;j++) printf("%f ", multi_data[MULTI*i*ndim + j]);
    //printf("\n");
  }

  for (i=0;i<nb_vectors-4;i++)
  {
    for (j=0;j<ndim;j++)
      multi_data2[MULTI*i*ndim+j] = data[(i+2)*ndim+j] - .5*(qdata[i*ndim+j]+qdata[(i+4)*ndim+j]);
    for (j=0;j<ndim;j++)
      multi_data2[(MULTI*i+1)*ndim+j] = data[(i+2)*ndim+j] - .5*(qdata[i*ndim+j]+qdata[(i+4)*ndim+j]);
    for (j=0;j<ndim;j++)
      multi_data2[(MULTI*i+2)*ndim+j] = data[(i+2)*ndim+j] - qdata[i*ndim+j];
    for (j=0;j<ndim;j++)
      multi_data2[(MULTI*i+3)*ndim+j] = data[(i+2)*ndim+j] - qdata[(i+4)*ndim+j];
  }

  vq_train_multi(multi_data2, nb_vectors-4, codebook_diff4, nb_entries2a, ndim, 1);

  printf("done\n");
  vq_train_multi(multi_data, nb_vectors-4, codebook_diff2, nb_entries2b, ndim, 0);


  /* Emit all trained codebooks as a C source file. */
  fout = fopen("ceps_codebooks.c", "w");
  fprintf(fout, "/* This file is automatically generated */\n\n");
  fprintf(fout, "float ceps_codebook1[%d*%d] = {\n",nb_entries, ndim0);

  for (i=0;i<nb_entries;i++)
  {
    for (j=0;j<ndim0;j++)
      fprintf(fout, "%f, ", codebook[i*ndim0+j]);
    fprintf(fout, "\n");
  }
  fprintf(fout, "};\n\n");

  fprintf(fout, "float ceps_codebook2[%d*%d] = {\n",nb_entries1, ndim0);
  for (i=0;i<nb_entries1;i++)
  {
    for (j=0;j<ndim0;j++)
      fprintf(fout, "%f, ", codebook2[i*ndim0+j]);
    fprintf(fout, "\n");
  }
  fprintf(fout, "};\n\n");

  fprintf(fout, "float ceps_codebook3[%d*%d] = {\n",nb_entries1, ndim0);
  for (i=0;i<nb_entries1;i++)
  {
    for (j=0;j<ndim0;j++)
      fprintf(fout, "%f, ", codebook3[i*ndim0+j]);
    fprintf(fout, "\n");
  }
  fprintf(fout, "};\n\n");

  fprintf(fout, "float ceps_codebook_diff4[%d*%d] = {\n",nb_entries2a, ndim);
  for (i=0;i<nb_entries2a;i++)
  {
    for (j=0;j<ndim;j++)
      fprintf(fout, "%f, ", codebook_diff4[i*ndim+j]);
    fprintf(fout, "\n");
  }
  fprintf(fout, "};\n\n");

  fprintf(fout, "float ceps_codebook_diff2[%d*%d] = {\n",nb_entries2b, ndim);
  for (i=0;i<nb_entries2b;i++)
  {
    for (j=0;j<ndim;j++)
      fprintf(fout, "%f, ", codebook_diff2[i*ndim+j]);
    fprintf(fout, "\n");
  }
  fprintf(fout, "};\n\n");

  fclose(fout);
  return 0;
}
diff --git a/dnn/common.c b/dnn/common.c
new file mode 100644
index 00000000..9fae8b3b
--- /dev/null
+++ b/dnn/common.c
@@ -0,0 +1,65 @@
+
+/* Copyright (c) 2017-2019 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include "freq.h"
+#include "lpcnet_private.h"
+
+
+static void single_interp(float *x, const float *left, const float *right, int id)
+{
+ int i;
+ float ref[NB_BANDS];
+ float pred[3*NB_BANDS];
+ RNN_COPY(ref, x, NB_BANDS);
+ for (i=0;i<NB_BANDS;i++) pred[i] = .5f*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[NB_BANDS+i] = left[i];
+ for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = right[i];
+ for (i=0;i<NB_BANDS;i++) {
+ x[i] = pred[id*NB_BANDS + i];
+ }
+ if (0) {
+ float err = 0;
+ for (i=0;i<NB_BANDS;i++) {
+ err += (x[i]-ref[i])*(x[i]-ref[i]);
+ }
+ printf("%f\n", sqrt(err/NB_BANDS));
+ }
+}
+
+void perform_double_interp(float features[4][NB_TOTAL_FEATURES], const float *mem, int best_id) {
+ int id0, id1;
+ best_id += (best_id >= FORBIDDEN_INTERP);
+ id0 = best_id / 3;
+ id1 = best_id % 3;
+ single_interp(features[0], mem, features[1], id0);
+ single_interp(features[2], features[1], features[3], id1);
+}
diff --git a/dnn/common.h b/dnn/common.h
new file mode 100644
index 00000000..e9c8dd17
--- /dev/null
+++ b/dnn/common.h
@@ -0,0 +1,96 @@
+
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#define RNN_INLINE inline
+#ifndef OPUS_INLINE
+#define OPUS_INLINE inline
+#endif
+
+float lpc_from_cepstrum(float *lpc, const float *cepstrum);
+
+#define LOG256 5.5451774445f
+static RNN_INLINE float log2_approx(float x)
+{
+ int integer;
+ float frac;
+ union {
+ float f;
+ int i;
+ } in;
+ in.f = x;
+ integer = (in.i>>23)-127;
+ in.i -= integer<<23;
+ frac = in.f - 1.5f;
+ frac = -0.41445418f + frac*(0.95909232f
+ + frac*(-0.33951290f + frac*0.16541097f));
+ return 1+integer+frac;
+}
+
+#define log_approx(x) (0.69315f*log2_approx(x))
+
+static RNN_INLINE float ulaw2lin(float u)
+{
+ float s;
+ float scale_1 = 32768.f/255.f;
+ u = u - 128.f;
+ s = u >= 0.f ? 1.f : -1.f;
+ u = fabs(u);
+ return s*scale_1*(exp(u/128.*LOG256)-1);
+}
+
+static RNN_INLINE int lin2ulaw(float x)
+{
+ float u;
+ float scale = 255.f/32768.f;
+ int s = x >= 0 ? 1 : -1;
+ x = fabs(x);
+ u = (s*(128*log_approx(1+scale*x)/LOG256));
+ u = 128 + u;
+ if (u < 0) u = 0;
+ if (u > 255) u = 255;
+ return (int)floor(.5 + u);
+}
+
+
/** RNNoise wrapper for malloc(). To do your own dynamic allocation, all you
    need to do is replace this function and rnnoise_free */
#ifndef OVERRIDE_RNNOISE_ALLOC
static RNN_INLINE void *rnnoise_alloc (size_t size)
{
   return malloc(size);
}
#endif

/** RNNoise wrapper for free(). To do your own dynamic allocation, all you
    need to do is replace this function and rnnoise_alloc */
#ifndef OVERRIDE_RNNOISE_FREE
static RNN_INLINE void rnnoise_free (void *ptr)
{
   free(ptr);
}
#endif

/** Copy n elements from src to dst. The 0* term provides compile-time type
    checking: (dst)-(src) only compiles if both are pointers to the same type. */
#ifndef OVERRIDE_RNN_COPY
#define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
#endif

/** Copy n elements from src to dst, allowing overlapping regions. The 0* term
    provides compile-time type checking */
#ifndef OVERRIDE_RNN_MOVE
#define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
#endif

/** Set n elements of dst to zero */
#ifndef OVERRIDE_RNN_CLEAR
#define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
#endif
+
+
+
+#endif
diff --git a/dnn/compile.sh b/dnn/compile.sh
new file mode 100755
index 00000000..327f23fa
--- /dev/null
+++ b/dnn/compile.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+gcc -Wall -W -O3 -g -I../include dump_data.c freq.c kiss_fft.c pitch.c celt_lpc.c -o dump_data -lm
+gcc -o test_lpcnet -mavx2 -mfma -g -O3 -Wall -W -Wextra test_lpcnet.c lpcnet.c nnet.c nnet_data.c freq.c kiss_fft.c pitch.c celt_lpc.c -lm
diff --git a/dnn/concat.sh b/dnn/concat.sh
new file mode 100644
index 00000000..83691170
--- /dev/null
+++ b/dnn/concat.sh
@@ -0,0 +1,6 @@
#!/bin/sh
# Place in 16k-LP7 from TSPSpeech.iso and run to concatenate wave files
# into one headerless training file
for i in */*.wav
do
# Quote "$i" so filenames containing spaces are handled correctly.
sox "$i" -r 16000 -c 1 -t sw -
done > input.s16
diff --git a/dnn/configure.ac b/dnn/configure.ac
new file mode 100644
index 00000000..fad54e27
--- /dev/null
+++ b/dnn/configure.ac
@@ -0,0 +1,131 @@
+# autoconf source script for generating configure
+
+dnl The package_version file will be automatically synced to the git revision
+dnl by the update_version script when configured in the repository, but will
+dnl remain constant in tarball releases unless it is manually edited.
+m4_define([CURRENT_VERSION],
+ m4_esyscmd([ ./update_version 2>/dev/null || true
+ if test -e package_version; then
+ . ./package_version
+ printf "$PACKAGE_VERSION"
+ else
+ printf "unknown"
+ fi ]))
+
+AC_INIT([lpcnet],[CURRENT_VERSION],[jmvalin@jmvalin.ca])
+AC_CONFIG_SRCDIR([lpcnet.c])
+AC_CONFIG_MACRO_DIR([m4])
+
+AC_USE_SYSTEM_EXTENSIONS
+AC_SYS_LARGEFILE
+
+AM_INIT_AUTOMAKE([1.11 foreign no-define dist-zip subdir-objects])
+AM_MAINTAINER_MODE([enable])
+
+AC_C_INLINE
+
+LT_INIT
+
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AC_DEFINE([LPCNET_BUILD], [], [This is a build of the library])
+
+dnl Library versioning for libtool.
+dnl Please update these for releases.
+dnl CURRENT, REVISION, AGE
+dnl - library source changed -> increment REVISION
+dnl - interfaces added/removed/changed -> increment CURRENT, REVISION = 0
+dnl - interfaces added -> increment AGE
+dnl - interfaces removed -> AGE = 0
+
+OP_LT_CURRENT=0
+OP_LT_REVISION=0
+OP_LT_AGE=0
+
+AC_SUBST(OP_LT_CURRENT)
+AC_SUBST(OP_LT_REVISION)
+AC_SUBST(OP_LT_AGE)
+
+CC_CHECK_CFLAGS_APPEND(
+ [-pedantic -Wall -Wextra -Wno-sign-compare -Wno-parentheses -Wno-long-long])
+
+# Platform-specific tweaks
+case $host in
+ *-mingw*)
+ # -std=c89 causes some warnings under mingw.
+ CC_CHECK_CFLAGS_APPEND([-U__STRICT_ANSI__])
+ # We need WINNT>=0x501 (WindowsXP) for getaddrinfo/freeaddrinfo.
+ # It's okay to define this even when HTTP support is disabled, as it only
+ # affects header declarations, not linking (unless we actually use some
+ # XP-only functions).
+ AC_DEFINE_UNQUOTED(_WIN32_WINNT,0x501,
+ [We need at least WindowsXP for getaddrinfo/freeaddrinfo])
+ host_mingw=true
+ ;;
+esac
+AM_CONDITIONAL(OP_WIN32, test "$host_mingw" = "true")
+
+AC_ARG_ENABLE([assertions],
+ AS_HELP_STRING([--enable-assertions], [Enable assertions in code]),,
+ enable_assertions=no)
+
+AS_IF([test "$enable_assertions" = "yes"], [
+ AC_DEFINE([OP_ENABLE_ASSERTIONS], [1], [Enable assertions in code])
+])
+
+AC_ARG_ENABLE([dot-product],
+ AS_HELP_STRING([--disable-dot-product], [Disable dot product implementation]),,
+ enable_dot_product=yes)
+
+AS_IF([test "$enable_dot_product" = "no"], [
+ AC_DEFINE([DISABLE_DOT_PROD], [1], [Disable dot product instructions])
+])
+
+AS_CASE(["$ac_cv_search_lrintf"],
+ ["no"],[],
+ ["none required"],[],
+ [lrintf_lib="$ac_cv_search_lrintf"])
+
+LT_LIB_M
+
+AC_SUBST([lrintf_lib])
+
+CC_ATTRIBUTE_VISIBILITY([default], [
+ CC_FLAG_VISIBILITY([CFLAGS="${CFLAGS} -fvisibility=hidden"])
+])
+
+dnl Check for doxygen
+AC_ARG_ENABLE([doc],
+ AS_HELP_STRING([--disable-doc], [Do not build API documentation]),,
+ [enable_doc=yes]
+)
+
+AS_IF([test "$enable_doc" = "yes"], [
+ AC_CHECK_PROG([HAVE_DOXYGEN], [doxygen], [yes], [no])
+ AC_CHECK_PROG([HAVE_DOT], [dot], [yes], [no])
+],[
+ HAVE_DOXYGEN=no
+])
+
+AM_CONDITIONAL([HAVE_DOXYGEN], [test "$HAVE_DOXYGEN" = "yes"])
+
+AC_CONFIG_FILES([
+ Makefile
+ lpcnet.pc
+ lpcnet-uninstalled.pc
+ doc/Doxyfile
+])
+AC_CONFIG_HEADERS([config.h])
+AC_OUTPUT
+
+AC_MSG_NOTICE([
+------------------------------------------------------------------------
+ $PACKAGE_NAME $PACKAGE_VERSION: Automatic configuration OK.
+
+ Dot product intrinsics ....... ${enable_dot_product}
+ Assertions ................... ${enable_assertions}
+ Hidden visibility ............ ${cc_cv_flag_visibility}
+
+ API documentation ............ ${enable_doc}
+------------------------------------------------------------------------
+])
diff --git a/dnn/datasets.txt b/dnn/datasets.txt
new file mode 100644
index 00000000..160bc316
--- /dev/null
+++ b/dnn/datasets.txt
@@ -0,0 +1,174 @@
+The following datasets can be used to train a language-independent LPCNet model.
+A good choice is to include all the data from these datasets, except for
+hi_fi_tts for which only a small subset is recommended (since it's very large
+but has few speakers). Note that this data typically needs to be resampled
+before it can be used.
+
+https://www.openslr.org/resources/30/si_lk.tar.gz
+https://www.openslr.org/resources/32/af_za.tar.gz
+https://www.openslr.org/resources/32/st_za.tar.gz
+https://www.openslr.org/resources/32/tn_za.tar.gz
+https://www.openslr.org/resources/32/xh_za.tar.gz
+https://www.openslr.org/resources/37/bn_bd.zip
+https://www.openslr.org/resources/37/bn_in.zip
+https://www.openslr.org/resources/41/jv_id_female.zip
+https://www.openslr.org/resources/41/jv_id_male.zip
+https://www.openslr.org/resources/42/km_kh_male.zip
+https://www.openslr.org/resources/43/ne_np_female.zip
+https://www.openslr.org/resources/44/su_id_female.zip
+https://www.openslr.org/resources/44/su_id_male.zip
+https://www.openslr.org/resources/61/es_ar_female.zip
+https://www.openslr.org/resources/61/es_ar_male.zip
+https://www.openslr.org/resources/63/ml_in_female.zip
+https://www.openslr.org/resources/63/ml_in_male.zip
+https://www.openslr.org/resources/64/mr_in_female.zip
+https://www.openslr.org/resources/65/ta_in_female.zip
+https://www.openslr.org/resources/65/ta_in_male.zip
+https://www.openslr.org/resources/66/te_in_female.zip
+https://www.openslr.org/resources/66/te_in_male.zip
+https://www.openslr.org/resources/69/ca_es_female.zip
+https://www.openslr.org/resources/69/ca_es_male.zip
+https://www.openslr.org/resources/70/en_ng_female.zip
+https://www.openslr.org/resources/70/en_ng_male.zip
+https://www.openslr.org/resources/71/es_cl_female.zip
+https://www.openslr.org/resources/71/es_cl_male.zip
+https://www.openslr.org/resources/72/es_co_female.zip
+https://www.openslr.org/resources/72/es_co_male.zip
+https://www.openslr.org/resources/73/es_pe_female.zip
+https://www.openslr.org/resources/73/es_pe_male.zip
+https://www.openslr.org/resources/74/es_pr_female.zip
+https://www.openslr.org/resources/75/es_ve_female.zip
+https://www.openslr.org/resources/75/es_ve_male.zip
+https://www.openslr.org/resources/76/eu_es_female.zip
+https://www.openslr.org/resources/76/eu_es_male.zip
+https://www.openslr.org/resources/77/gl_es_female.zip
+https://www.openslr.org/resources/77/gl_es_male.zip
+https://www.openslr.org/resources/78/gu_in_female.zip
+https://www.openslr.org/resources/78/gu_in_male.zip
+https://www.openslr.org/resources/79/kn_in_female.zip
+https://www.openslr.org/resources/79/kn_in_male.zip
+https://www.openslr.org/resources/80/my_mm_female.zip
+https://www.openslr.org/resources/83/irish_english_male.zip
+https://www.openslr.org/resources/83/midlands_english_female.zip
+https://www.openslr.org/resources/83/midlands_english_male.zip
+https://www.openslr.org/resources/83/northern_english_female.zip
+https://www.openslr.org/resources/83/northern_english_male.zip
+https://www.openslr.org/resources/83/scottish_english_female.zip
+https://www.openslr.org/resources/83/scottish_english_male.zip
+https://www.openslr.org/resources/83/southern_english_female.zip
+https://www.openslr.org/resources/83/southern_english_male.zip
+https://www.openslr.org/resources/83/welsh_english_female.zip
+https://www.openslr.org/resources/83/welsh_english_male.zip
+https://www.openslr.org/resources/86/yo_ng_female.zip
+https://www.openslr.org/resources/86/yo_ng_male.zip
+https://www.openslr.org/resources/109/hi_fi_tts_v0.tar.gz
+
+The corresponding citations for all these datasets are:
+
+ @inproceedings{demirsahin-etal-2020-open,
+ title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},
+ author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},
+ booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
+ month = may,
+ year = {2020},
+ pages = {6532--6541},
+ address = {Marseille, France},
+ publisher = {European Language Resources Association (ELRA)},
+ url = {https://www.aclweb.org/anthology/2020.lrec-1.804},
+ ISBN = {979-10-95546-34-4},
+ }
+ @inproceedings{kjartansson-etal-2020-open,
+ title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
+ author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
+ booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
+ year = {2020},
+ pages = {21--27},
+ month = may,
+ address = {Marseille, France},
+ publisher = {European Language Resources association (ELRA)},
+ url = {https://www.aclweb.org/anthology/2020.sltu-1.3},
+ ISBN = {979-10-95546-35-1},
+ }
+
+
+ @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
+ title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
+ author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
+ booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
+ year = {2020},
+ month = may,
+ address = {Marseille, France},
+ publisher = {European Language Resources Association (ELRA)},
+ url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
+ pages = {6504--6513},
+ ISBN = {979-10-95546-34-4},
+ }
+ @inproceedings{he-etal-2020-open,
+ title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
+ author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
+ booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
+ month = may,
+ year = {2020},
+ address = {Marseille, France},
+ publisher = {European Language Resources Association (ELRA)},
+ pages = {6494--6503},
+ url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
+ ISBN = "{979-10-95546-34-4}",
+ }
+
+
+ @inproceedings{kjartansson-etal-tts-sltu2018,
+ title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
+ author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
+ booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
+ year = {2018},
+ address = {Gurugram, India},
+ month = aug,
+ pages = {66--70},
+ URL = {http://dx.doi.org/10.21437/SLTU.2018-14}
+ }
+
+
+ @inproceedings{oo-etal-2020-burmese,
+ title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application to Text-to-Speech}},
+ author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},
+ booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
+ month = may,
+ year = {2020},
+ pages = "6328--6339",
+ address = {Marseille, France},
+ publisher = {European Language Resources Association (ELRA)},
+ url = {https://www.aclweb.org/anthology/2020.lrec-1.777},
+ ISBN = {979-10-95546-34-4},
+ }
+ @inproceedings{van-niekerk-etal-2017,
+ title = {{Rapid development of TTS corpora for four South African languages}},
+ author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson and Martin Jansche and Linne Ha},
+ booktitle = {Proc. Interspeech 2017},
+ pages = {2178--2182},
+ address = {Stockholm, Sweden},
+ month = aug,
+ year = {2017},
+ URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}
+ }
+
+ @inproceedings{gutkin-et-al-yoruba2020,
+ title = {{Developing an Open-Source Corpus of Yoruba Speech}},
+ author = {Alexander Gutkin and I{\c{s}}{\i}n Demir{\c{s}}ahin and Oddur Kjartansson and Clara Rivera and K\d{\'o}lá Túb\d{\`o}sún},
+ booktitle = {Proceedings of Interspeech 2020},
+ pages = {404--408},
+ month = {October},
+ year = {2020},
+ address = {Shanghai, China},
+ publisher = {International Speech and Communication Association (ISCA)},
+ doi = {10.21437/Interspeech.2020-1096},
+ url = {http://dx.doi.org/10.21437/Interspeech.2020-1096},
+ }
+
+@article{bakhturina2021hi,
+ title={{Hi-Fi Multi-Speaker English TTS Dataset}},
+ author={Bakhturina, Evelina and Lavrukhin, Vitaly and Ginsburg, Boris and Zhang, Yang},
+ journal={arXiv preprint arXiv:2104.01497},
+ year={2021}
+}
+
diff --git a/dnn/doc/Doxyfile.in b/dnn/doc/Doxyfile.in
new file mode 100644
index 00000000..119b60da
--- /dev/null
+++ b/dnn/doc/Doxyfile.in
@@ -0,0 +1,18 @@
+# Process with doxygen to generate API documentation
+
+PROJECT_NAME = @PACKAGE_NAME@
+PROJECT_NUMBER = @PACKAGE_VERSION@
+PROJECT_BRIEF = "Neural Vocoder."
+INPUT = @top_srcdir@/include/lpcnet.h
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+QUIET = YES
+WARNINGS = YES
+WARN_IF_UNDOCUMENTED = YES
+WARN_IF_DOC_ERROR = YES
+WARN_NO_PARAMDOC = YES
+
+JAVADOC_AUTOBRIEF = YES
+SORT_MEMBER_DOCS = NO
+
+HAVE_DOT = @HAVE_DOT@
diff --git a/dnn/doc/Makefile b/dnn/doc/Makefile
new file mode 100644
index 00000000..5fc42866
--- /dev/null
+++ b/dnn/doc/Makefile
@@ -0,0 +1,68 @@
+# Makefile for LPCNet
+
+CC=gcc
+CFLAGS+=-Wall -W -Wextra -Wno-unused-function -O3 -g -Iinclude
+
+AVX2:=$(shell cat /proc/cpuinfo | grep -c avx2)
+AVX:=$(shell cat /proc/cpuinfo | grep -c avx)
+NEON:=$(shell cat /proc/cpuinfo | grep -c neon)
+
+ifneq ($(AVX2),0)
+CFLAGS+=-mavx2 -mfma
+else
+# AVX2 machines will also match on AVX
+ifneq ($(AVX),0)
+CFLAGS+=-mavx
+endif
+endif
+
+ifneq ($(NEON),0)
+CFLAGS+=-mfpu=neon -march=armv8-a -mtune=cortex-a53
+endif
+
+all: dump_data lpcnet_demo test_lpcnet test_vec
+
+lpcnet_data-8ea8396.tar.gz:
+ wget https://media.xiph.org/lpcnet/data/lpcnet_data-8ea8396.tar.gz
+
+download: src/ceps_codebooks.c src/nnet_data.c
+
+src/ceps_codebooks.c: lpcnet_data-8ea8396.tar.gz
+ tar xvf lpcnet_data-8ea8396.tar.gz
+
+dump_data_objs := src/common.o src/dump_data.o src/freq.o src/kiss_fft.o src/pitch.o src/celt_lpc.o src/lpcnet_dec.o src/lpcnet_enc.o src/ceps_codebooks.o
+dump_data_deps := $(dump_data_objs:.o=.d)
+dump_data: download $(dump_data_objs)
+ gcc -o $@ $(CFLAGS) $(dump_data_objs) -lm
+
+-include $(dump_data_deps)
+
+test_lpcnet_objs := src/common.o src/test_lpcnet.o src/lpcnet.o src/nnet.o src/nnet_data.o src/freq.o src/kiss_fft.o src/pitch.o src/celt_lpc.o src/lpcnet_dec.o src/ceps_codebooks.o
+test_lpcnet_deps := $(test_lpcnet_objs:.o=.d)
+test_lpcnet: download $(test_lpcnet_objs)
+ gcc -o $@ $(CFLAGS) $(test_lpcnet_objs) -lm
+
+-include $(test_lpcnet_deps)
+
+lpcnet_demo_objs := src/common.o src/lpcnet_demo.o src/lpcnet.o src/nnet.o src/nnet_data.o src/freq.o src/kiss_fft.o src/pitch.o src/celt_lpc.o src/lpcnet_dec.o src/ceps_codebooks.o src/lpcnet_enc.o
+lpcnet_demo_deps := $(lpcnet_demo_objs:.o=.d)
+lpcnet_demo: download $(lpcnet_demo_objs)
+ gcc -o $@ $(CFLAGS) $(lpcnet_demo_objs) -lm
+
+-include $(lpcnet_demo_deps)
+
+test_vec_objs := src/test_vec.o
+test_vec_deps := $(test_vec_objs:.o=.d)
+test_vec: $(test_vec_objs)
+ gcc -o $@ $(CFLAGS) $(test_vec_objs) -lm
+
+-include $(test_vec_deps)
+
+test: test_vec
+ ./test_vec
+
+clean:
+ rm -f dump_data test_lpcnet test_vec
+ rm -f $(dump_data_objs) $(dump_data_deps)
+ rm -f $(test_lpcnet_objs) $(test_lpcnet_deps)
+ rm -f $(test_vec_objs) $(test_vec_deps)
diff --git a/dnn/download_model.bat b/dnn/download_model.bat
new file mode 100644
index 00000000..3d9e5575
--- /dev/null
+++ b/dnn/download_model.bat
@@ -0,0 +1,10 @@
+@echo off
+set model=lpcnet_data-%1.tar.gz
+
+if not exist %model% (
+ echo Downloading latest model
+ powershell -Command "(New-Object System.Net.WebClient).DownloadFile('https://media.xiph.org/lpcnet/data/%model%', '%model%')"
+)
+
+tar -xvzf %model%
+
diff --git a/dnn/download_model.sh b/dnn/download_model.sh
new file mode 100755
index 00000000..b5fbf000
--- /dev/null
+++ b/dnn/download_model.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+set -e
+
+model=lpcnet_data-$1.tar.gz
+
+if [ ! -f $model ]; then
+ echo "Downloading latest model"
+ wget https://media.xiph.org/lpcnet/data/$model
+fi
+tar xvof $model
+touch src/nnet_data.[ch]
+touch src/plc_data.[ch]
+mv src/*.[ch] .
diff --git a/dnn/dred_rdovae.c b/dnn/dred_rdovae.c
new file mode 100644
index 00000000..9805d0fd
--- /dev/null
+++ b/dnn/dred_rdovae.c
@@ -0,0 +1,136 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "dred_rdovae.h"
+#include "dred_rdovae_enc.h"
+#include "dred_rdovae_dec.h"
+#include "dred_rdovae_stats_data.h"
+
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents)
+{
+ int i;
+ RDOVAEDecState dec;
+ memset(&dec, 0, sizeof(dec));
+ DRED_rdovae_dec_init_states(&dec, model, state);
+ for (i = 0; i < 2*nb_latents; i += 2)
+ {
+ DRED_rdovae_decode_qframe(
+ &dec,
+ model,
+ &features[2*i*DRED_NUM_FEATURES],
+ &latents[(i/2)*DRED_LATENT_DIM]);
+ }
+}
+
+size_t DRED_rdovae_get_enc_size()
+{
+ return sizeof(RDOVAEEncState);
+}
+
+size_t DRED_rdovae_get_dec_size()
+{
+ return sizeof(RDOVAEDecState);
+}
+
+void DRED_rdovae_init_encoder(RDOVAEEncState *enc_state)
+{
+ memset(enc_state, 0, sizeof(*enc_state));
+
+}
+
+void DRED_rdovae_init_decoder(RDOVAEDecState *dec_state)
+{
+ memset(dec_state, 0, sizeof(*dec_state));
+}
+
+
+RDOVAEEncState * DRED_rdovae_create_encoder()
+{
+ RDOVAEEncState *enc;
+ enc = (RDOVAEEncState*) calloc(sizeof(*enc), 1);
+ DRED_rdovae_init_encoder(enc);
+ return enc;
+}
+
+RDOVAEDecState * DRED_rdovae_create_decoder()
+{
+ RDOVAEDecState *dec;
+ dec = (RDOVAEDecState*) calloc(sizeof(*dec), 1);
+ DRED_rdovae_init_decoder(dec);
+ return dec;
+}
+
+void DRED_rdovae_destroy_decoder(RDOVAEDecState* dec)
+{
+ free(dec);
+}
+
+void DRED_rdovae_destroy_encoder(RDOVAEEncState* enc)
+{
+ free(enc);
+}
+
+void DRED_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input)
+{
+ dred_rdovae_encode_dframe(enc_state, model, latents, initial_state, input);
+}
+
+void DRED_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state)
+{
+ dred_rdovae_dec_init_states(h, model, initial_state);
+}
+
+void DRED_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float *z)
+{
+ dred_rdovae_decode_qframe(h, model, qframe, z);
+}
+
+
+const opus_uint16 * DRED_rdovae_get_p0_pointer(void)
+{
+ return &dred_p0_q15[0];
+}
+
+const opus_uint16 * DRED_rdovae_get_dead_zone_pointer(void)
+{
+ return &dred_dead_zone_q10[0];
+}
+
+const opus_uint16 * DRED_rdovae_get_r_pointer(void)
+{
+ return &dred_r_q15[0];
+}
+
+const opus_uint16 * DRED_rdovae_get_quant_scales_pointer(void)
+{
+ return &dred_quant_scales_q8[0];
+}
diff --git a/dnn/dred_rdovae_dec.c b/dnn/dred_rdovae_dec.c
new file mode 100644
index 00000000..3cf2d69a
--- /dev/null
+++ b/dnn/dred_rdovae_dec.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "common.h"
+#include "dred_rdovae_dec.h"
+#include "dred_rdovae_constants.h"
+
+
+void dred_rdovae_dec_init_states(
+ RDOVAEDecState *h, /* io: state buffer handle */
+ const RDOVAEDec *model,
+ const float *initial_state /* i: initial state */
+ )
+{
+ /* initialize GRU states from initial state */
+ _lpcnet_compute_dense(&model->state1, h->dense2_state, initial_state);
+ _lpcnet_compute_dense(&model->state2, h->dense4_state, initial_state);
+ _lpcnet_compute_dense(&model->state3, h->dense6_state, initial_state);
+}
+
+
+void dred_rdovae_decode_qframe(
+ RDOVAEDecState *dec_state, /* io: state buffer handle */
+ const RDOVAEDec *model,
+ float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */
+ const float *input /* i: latent vector */
+ )
+{
+ float buffer[DEC_DENSE1_OUT_SIZE + DEC_DENSE2_OUT_SIZE + DEC_DENSE3_OUT_SIZE + DEC_DENSE4_OUT_SIZE + DEC_DENSE5_OUT_SIZE + DEC_DENSE6_OUT_SIZE + DEC_DENSE7_OUT_SIZE + DEC_DENSE8_OUT_SIZE];
+ int output_index = 0;
+ int input_index = 0;
+ float zero_vector[1024] = {0};
+
+ /* run encoder stack and concatenate output in buffer*/
+ _lpcnet_compute_dense(&model->dec_dense1, &buffer[output_index], input);
+ input_index = output_index;
+ output_index += DEC_DENSE1_OUT_SIZE;
+
+ compute_gruB(&model->dec_dense2, zero_vector, dec_state->dense2_state, &buffer[input_index]);
+ RNN_COPY(&buffer[output_index], dec_state->dense2_state, DEC_DENSE2_OUT_SIZE);
+ input_index = output_index;
+ output_index += DEC_DENSE2_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->dec_dense3, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += DEC_DENSE3_OUT_SIZE;
+
+ compute_gruB(&model->dec_dense4, zero_vector, dec_state->dense4_state, &buffer[input_index]);
+ RNN_COPY(&buffer[output_index], dec_state->dense4_state, DEC_DENSE4_OUT_SIZE);
+ input_index = output_index;
+ output_index += DEC_DENSE4_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->dec_dense5, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += DEC_DENSE5_OUT_SIZE;
+
+ compute_gruB(&model->dec_dense6, zero_vector, dec_state->dense6_state, &buffer[input_index]);
+ RNN_COPY(&buffer[output_index], dec_state->dense6_state, DEC_DENSE6_OUT_SIZE);
+ input_index = output_index;
+ output_index += DEC_DENSE6_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->dec_dense7, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += DEC_DENSE7_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->dec_dense8, &buffer[output_index], &buffer[input_index]);
+ output_index += DEC_DENSE8_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->dec_final, qframe, buffer);
+}
diff --git a/dnn/dred_rdovae_dec.h b/dnn/dred_rdovae_dec.h
new file mode 100644
index 00000000..008551b5
--- /dev/null
+++ b/dnn/dred_rdovae_dec.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _DRED_RDOVAE_DEC_H
+#define _DRED_RDOVAE_DEC_H
+
+#include "dred_rdovae.h"
+#include "dred_rdovae_dec_data.h"
+#include "dred_rdovae_stats_data.h"
+
+struct RDOVAEDecStruct {
+ float dense2_state[DEC_DENSE2_STATE_SIZE];
+    float dense4_state[DEC_DENSE4_STATE_SIZE];
+    float dense6_state[DEC_DENSE6_STATE_SIZE];
+};
+
+void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state);
+void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z);
+
+#endif
diff --git a/dnn/dred_rdovae_enc.c b/dnn/dred_rdovae_enc.c
new file mode 100644
index 00000000..9fb93cd8
--- /dev/null
+++ b/dnn/dred_rdovae_enc.c
@@ -0,0 +1,95 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <math.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "dred_rdovae_enc.h"
+#include "common.h"
+
+void dred_rdovae_encode_dframe(
+ RDOVAEEncState *enc_state, /* io: encoder state */
+ const RDOVAEEnc *model,
+ float *latents, /* o: latent vector */
+ float *initial_state, /* o: initial state */
+ const float *input /* i: double feature frame (concatenated) */
+ )
+{
+ float buffer[ENC_DENSE1_OUT_SIZE + ENC_DENSE2_OUT_SIZE + ENC_DENSE3_OUT_SIZE + ENC_DENSE4_OUT_SIZE + ENC_DENSE5_OUT_SIZE + ENC_DENSE6_OUT_SIZE + ENC_DENSE7_OUT_SIZE + ENC_DENSE8_OUT_SIZE + GDENSE1_OUT_SIZE];
+ int output_index = 0;
+ int input_index = 0;
+ float zero_vector[1024] = {0};
+
+ /* run encoder stack and concatenate output in buffer*/
+ _lpcnet_compute_dense(&model->enc_dense1, &buffer[output_index], input);
+ input_index = output_index;
+ output_index += ENC_DENSE1_OUT_SIZE;
+
+ compute_gruB(&model->enc_dense2, zero_vector, enc_state->dense2_state, &buffer[input_index]);
+ RNN_COPY(&buffer[output_index], enc_state->dense2_state, ENC_DENSE2_OUT_SIZE);
+ input_index = output_index;
+ output_index += ENC_DENSE2_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->enc_dense3, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += ENC_DENSE3_OUT_SIZE;
+
+ compute_gruB(&model->enc_dense4, zero_vector, enc_state->dense4_state, &buffer[input_index]);
+ RNN_COPY(&buffer[output_index], enc_state->dense4_state, ENC_DENSE4_OUT_SIZE);
+ input_index = output_index;
+ output_index += ENC_DENSE4_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->enc_dense5, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += ENC_DENSE5_OUT_SIZE;
+
+ compute_gruB(&model->enc_dense6, zero_vector, enc_state->dense6_state, &buffer[input_index]);
+ RNN_COPY(&buffer[output_index], enc_state->dense6_state, ENC_DENSE6_OUT_SIZE);
+ input_index = output_index;
+ output_index += ENC_DENSE6_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->enc_dense7, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += ENC_DENSE7_OUT_SIZE;
+
+ _lpcnet_compute_dense(&model->enc_dense8, &buffer[output_index], &buffer[input_index]);
+ output_index += ENC_DENSE8_OUT_SIZE;
+
+ /* compute latents from concatenated input buffer */
+ compute_conv1d(&model->bits_dense, latents, enc_state->bits_dense_state, buffer);
+
+
+ /* next, calculate initial state */
+ _lpcnet_compute_dense(&model->gdense1, &buffer[output_index], buffer);
+ input_index = output_index;
+ _lpcnet_compute_dense(&model->gdense2, initial_state, &buffer[input_index]);
+
+}
diff --git a/dnn/dred_rdovae_enc.h b/dnn/dred_rdovae_enc.h
new file mode 100644
index 00000000..70ff6adc
--- /dev/null
+++ b/dnn/dred_rdovae_enc.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _DRED_RDOVAE_ENC_H
+#define _DRED_RDOVAE_ENC_H
+
+#include "dred_rdovae.h"
+
+#include "dred_rdovae_enc_data.h"
+
+struct RDOVAEEncStruct {
+ float dense2_state[3 * ENC_DENSE2_STATE_SIZE];
+ float dense4_state[3 * ENC_DENSE4_STATE_SIZE];
+ float dense6_state[3 * ENC_DENSE6_STATE_SIZE];
+ float bits_dense_state[BITS_DENSE_STATE_SIZE];
+};
+
+void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input);
+
+
+#endif
diff --git a/dnn/dump_data.c b/dnn/dump_data.c
new file mode 100644
index 00000000..cc272993
--- /dev/null
+++ b/dnn/dump_data.c
@@ -0,0 +1,307 @@
+/* Copyright (c) 2017-2018 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "kiss_fft.h"
+#include "common.h"
+#include <math.h>
+#include "freq.h"
+#include "pitch.h"
+#include "arch.h"
+#include <assert.h>
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+
+
/* Second-order IIR filter (transposed direct form II). b[] and a[] each
   hold the two non-unity coefficients; the intermediate products are
   accumulated in double precision to limit drift. Safe to run in place
   (y == x). */
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
  int n;
  for (n=0;n<N;n++) {
    float in, out;
    in = x[n];
    out = x[n] + mem[0];
    mem[0] = mem[1] + (b[0]*(double)in - a[0]*(double)out);
    mem[1] = (b[1]*(double)in - a[1]*(double)out);
    y[n] = out;
  }
}
+
/* Uniform random value in [-0.5, 0.5], computed in double precision.
   Fix: the original empty parameter list `()` declared a function
   without a prototype (obsolescent pre-C23); use `(void)`. */
static float uni_rand(void) {
  return rand()/(double)RAND_MAX-.5;
}
+
/* Draw random biquad coefficients in (-0.375, 0.375) for both the
   feedback (a) and feedforward (b) pairs, producing a random spectral
   response for data augmentation. The draw order (a[0], a[1], b[0],
   b[1]) is preserved so the rand() stream is consumed identically. */
static void rand_resp(float *a, float *b) {
  int k;
  for (k=0;k<2;k++) a[k] = .75*uni_rand();
  for (k=0;k<2;k++) b[k] = .75*uni_rand();
}
+
+void compute_noise(int *noise, float noise_std) {
+ int i;
+ for (i=0;i<FRAME_SIZE;i++) {
+ noise[i] = (int)floor(.5 + noise_std*.707*(log_approx((float)rand()/RAND_MAX)-log_approx((float)rand()/RAND_MAX)));
+ }
+}
+
/* Round x to the nearest integer and clamp to the symmetric 16-bit
   range [-32767, 32767] (note: -32768 is deliberately excluded). */
static short float2short(float x)
{
  int rounded;
  rounded = (int)floor(x + .5);
  return IMAX(-32767, IMIN(32767, rounded));
}
+
+
/* Write nframes frames of LPCNet training audio to file as interleaved
   (input, target) 16-bit pairs -- 2*FRAME_SIZE shorts, i.e. 4*FRAME_SIZE
   bytes, per frame. The "input" channel is the signal resynthesized
   through the frame's LPC filter with mu-law-quantized excitation plus
   injected noise, so the network trains on the same degraded history it
   will see at synthesis time. Updates st->sig_mem and st->exc_mem in
   place. */
void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes) {
  int i, k;
  for (k=0;k<nframes;k++) {
    short data[2*FRAME_SIZE];
    for (i=0;i<FRAME_SIZE;i++) {
      float p=0;
      float e;
      int j;
      /* LPC prediction from the resynthesized history; LPC coefficients
         live in the feature vector after the cepstrum and pitch params. */
      for (j=0;j<LPC_ORDER;j++) p -= st->features[k][NB_BANDS+2+j]*st->sig_mem[j];
      /* Mu-law-encode the prediction residual of the clean target. */
      e = lin2ulaw(pcm[k*FRAME_SIZE+i] - p);
      /* Signal in. */
      data[2*i] = float2short(st->sig_mem[0]);
      /* Signal out. */
      data[2*i+1] = pcm[k*FRAME_SIZE+i];
      /* Simulate error on excitation. */
      e += noise[k*FRAME_SIZE+i];
      e = IMIN(255, IMAX(0, e));

      /* Advance the synthesis history with the (noisy) reconstruction. */
      RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1);
      st->sig_mem[0] = p + ulaw2lin(e);
      st->exc_mem = e;
    }
    /* NOTE(review): fwrite return value is ignored here. */
    fwrite(data, 4*FRAME_SIZE, 1, file);
  }
}
+
/* Feature/training-data extraction tool.
   Modes (argv[1]):
     -train/-qtrain/-btrain <speech.s16> <features out> <pcm out>
     -test/-qtest/-btest/-encode <speech> <features out>
     -decode <packets in> <features out>
   Input is raw 16-bit 16-kHz mono PCM. Training modes loop over the
   input with random gain, a random spectral-shaping filter and noise
   injection for data augmentation. */
int main(int argc, char **argv) {
  int i;
  char *argv0;
  int count=0;
  /* Fixed high-pass filter coefficients. */
  static const float a_hp[2] = {-1.99599, 0.99600};
  static const float b_hp[2] = {-2, 1};
  /* Random spectral-shaping filter, re-drawn periodically in training. */
  float a_sig[2] = {0};
  float b_sig[2] = {0};
  float mem_hp_x[2]={0};
  float mem_resp_x[2]={0};
  float mem_preemph=0;
  float x[FRAME_SIZE];
  int gain_change_count=0;
  FILE *f1;
  FILE *ffeat;
  FILE *fpcm=NULL;
  short pcm[FRAME_SIZE]={0};
  short pcmbuf[FRAME_SIZE*4]={0};
  int noisebuf[FRAME_SIZE*4]={0};
  short tmp[FRAME_SIZE] = {0};
  float savedX[FRAME_SIZE] = {0};
  float speech_gain=1;
  int last_silent = 1;
  float old_speech_gain = 1;
  int one_pass_completed = 0;
  LPCNetEncState *st;
  float noise_std=0;
  int training = -1;
  int encode = 0;
  int decode = 0;
  int quantize = 0;
  int burg = 0;
  srand(getpid());
  st = lpcnet_encoder_create();
  argv0=argv[0];
  /* Mode selection: training modes expect 4 args, test modes 3. */
  if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
    burg = 1;
    training = 1;
  }
  if (argc == 4 && strcmp(argv[1], "-btest")==0) {
    burg = 1;
    training = 0;
  }
  if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
  if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
    training = 1;
    quantize = 1;
  }
  if (argc == 4 && strcmp(argv[1], "-test")==0) training = 0;
  if (argc == 4 && strcmp(argv[1], "-qtest")==0) {
    training = 0;
    quantize = 1;
  }
  if (argc == 4 && strcmp(argv[1], "-encode")==0) {
    training = 0;
    quantize = 1;
    encode = 1;
  }
  if (argc == 4 && strcmp(argv[1], "-decode")==0) {
    training = 0;
    decode = 1;
  }
  if (training == -1) {
    fprintf(stderr, "usage: %s -train <speech> <features out> <pcm out>\n", argv0);
    fprintf(stderr, " or %s -test <speech> <features out>\n", argv0);
    return 1;
  }
  f1 = fopen(argv[2], "r");
  if (f1 == NULL) {
    fprintf(stderr,"Error opening input .s16 16kHz speech input file: %s\n", argv[2]);
    exit(1);
  }
  ffeat = fopen(argv[3], "wb");
  if (ffeat == NULL) {
    fprintf(stderr,"Error opening output feature file: %s\n", argv[3]);
    exit(1);
  }
  /* Decode mode: read 8-byte packets, write decoded feature frames. */
  if (decode) {
    float vq_mem[NB_BANDS] = {0};
    while (1) {
      int ret;
      unsigned char buf[8];
      float features[4][NB_TOTAL_FEATURES];
      /*int c0_id, main_pitch, modulation, corr_id, vq_end[3], vq_mid, interp_id;*/
      /*ret = fscanf(f1, "%d %d %d %d %d %d %d %d %d\n", &c0_id, &main_pitch, &modulation, &corr_id, &vq_end[0], &vq_end[1], &vq_end[2], &vq_mid, &interp_id);*/
      ret = fread(buf, 1, 8, f1);
      if (ret != 8) break;
      decode_packet(features, vq_mem, buf);
      for (i=0;i<4;i++) {
        fwrite(features[i], sizeof(float), NB_TOTAL_FEATURES, ffeat);
      }
    }
    return 0;
  }
  if (training) {
    fpcm = fopen(argv[4], "wb");
    if (fpcm == NULL) {
      fprintf(stderr,"Error opening output PCM file: %s\n", argv[4]);
      exit(1);
    }
  }
  /* Main loop: one FRAME_SIZE frame per iteration. x gets the frame read
     on the previous iteration, so features are computed one frame behind
     the file position. */
  while (1) {
    float E=0;
    int silent;
    size_t ret;
    for (i=0;i<FRAME_SIZE;i++) x[i] = tmp[i];
    ret = fread(tmp, sizeof(short), FRAME_SIZE, f1);
    if (feof(f1) || ret != FRAME_SIZE) {
      /* Training loops over the input; test mode stops at EOF. */
      if (!training) break;
      rewind(f1);
      ret = fread(tmp, sizeof(short), FRAME_SIZE, f1);
      if (ret != FRAME_SIZE) {
        fprintf(stderr, "error reading\n");
        exit(1);
      }
      one_pass_completed = 1;
    }
    /* Frame energy, used only by the (disabled) silence trimming. */
    for (i=0;i<FRAME_SIZE;i++) E += tmp[i]*(float)tmp[i];
    /* Disabled: silence-skipping with cross-fade at silence boundaries,
       kept for reference. */
    if (0 && training) {
      silent = E < 5000 || (last_silent && E < 20000);
      if (!last_silent && silent) {
        for (i=0;i<FRAME_SIZE;i++) savedX[i] = x[i];
      }
      if (last_silent && !silent) {
        for (i=0;i<FRAME_SIZE;i++) {
          float f = (float)i/FRAME_SIZE;
          tmp[i] = (int)floor(.5 + f*tmp[i] + (1-f)*savedX[i]);
        }
      }
      if (last_silent) {
        last_silent = silent;
        continue;
      }
      last_silent = silent;
    }
    /* Cap the total amount of training data (in 5-ms blocks) once at
       least one full pass over the input has completed. */
    if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break;
    /* Every 2821 frames, draw a new random gain, spectral response and
       excitation-noise level for augmentation. */
    if (training && ++gain_change_count > 2821) {
      /* NOTE(review): these locals shadow the outer `short tmp[]`. */
      float tmp, tmp2;
      speech_gain = pow(10., (-30+(rand()%40))/20.);
      if (rand()&1) speech_gain = -speech_gain;
      if (rand()%20==0) speech_gain *= .01;
      if (rand()%100==0) speech_gain = 0;
      gain_change_count = 0;
      rand_resp(a_sig, b_sig);
      tmp = (float)rand()/RAND_MAX;
      tmp2 = (float)rand()/RAND_MAX;
      noise_std = ABS16(-1.5*log(1e-4+tmp)-.5*log(1e-4+tmp2));
    }
    /* High-pass, random shaping, then cross-faded gain over the frame. */
    biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
    biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
    for (i=0;i<FRAME_SIZE;i++) {
      float g;
      float f = (float)i/FRAME_SIZE;
      g = f*speech_gain + (1-f)*old_speech_gain;
      x[i] *= g;
    }
    if (burg) {
      float ceps[2*NB_BANDS];
      burg_cepstral_analysis(ceps, x);
      fwrite(ceps, sizeof(float), 2*NB_BANDS, ffeat);
    }
    preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
    /* Dithering to decorrelate quantization. */
    for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
    /* PCM is delayed by 1/2 frame to make the features centered on the frames. */
    for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
    compute_frame_features(st, x);

    RNN_COPY(&pcmbuf[st->pcount*FRAME_SIZE], pcm, FRAME_SIZE);
    if (fpcm) {
      compute_noise(&noisebuf[st->pcount*FRAME_SIZE], noise_std);
    }

    if (!quantize) {
      process_single_frame(st, ffeat);
      if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1);
    }
    st->pcount++;
    /* Running on groups of 4 frames. */
    if (st->pcount == 4) {
      if (quantize) {
        unsigned char buf[8];
        process_superframe(st, buf, ffeat, encode, quantize);
        if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4);
      }
      st->pcount = 0;
    }
    /*if (fpcm) fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm);*/
    /* Keep the trailing half-frame for the next iteration's delay. */
    for (i=0;i<TRAINING_OFFSET;i++) pcm[i] = float2short(x[i+FRAME_SIZE-TRAINING_OFFSET]);
    old_speech_gain = speech_gain;
    count++;
  }
  fclose(f1);
  fclose(ffeat);
  if (fpcm) fclose(fpcm);
  lpcnet_encoder_destroy(st);
  return 0;
}
+
diff --git a/dnn/dump_lpcnet_tables.c b/dnn/dump_lpcnet_tables.c
new file mode 100644
index 00000000..95084d6e
--- /dev/null
+++ b/dnn/dump_lpcnet_tables.c
@@ -0,0 +1,104 @@
+/* Copyright (c) 2017-2018 Mozilla
+ Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include "freq.h"
+#include "kiss_fft.h"
+
+
+int main(void) {
+ int i;
+ FILE *file;
+ kiss_fft_state *kfft;
+ float half_window[OVERLAP_SIZE];
+ float dct_table[NB_BANDS*NB_BANDS];
+
+ file=fopen("lpcnet_tables.c", "wb");
+ fprintf(file, "/* The contents of this file was automatically generated by dump_lpcnet_tables.c*/\n\n");
+ fprintf(file, "#ifdef HAVE_CONFIG_H\n");
+ fprintf(file, "#include \"config.h\"\n");
+ fprintf(file, "#endif\n");
+
+ fprintf(file, "#include \"kiss_fft.h\"\n\n");
+
+ kfft = opus_fft_alloc_twiddles(WINDOW_SIZE, NULL, NULL, NULL, 0);
+
+ fprintf(file, "static const arch_fft_state arch_fft = {0, NULL};\n\n");
+
+ fprintf (file, "static const opus_int16 fft_bitrev[%d] = {\n", kfft->nfft);
+ for (i=0;i<kfft->nfft;i++)
+ fprintf (file, "%d,%c", kfft->bitrev[i],(i+16)%15==0?'\n':' ');
+ fprintf (file, "};\n\n");
+
+ fprintf (file, "static const kiss_twiddle_cpx fft_twiddles[%d] = {\n", kfft->nfft);
+ for (i=0;i<kfft->nfft;i++)
+ fprintf (file, "{%#0.9gf, %#0.9gf},%c", kfft->twiddles[i].r, kfft->twiddles[i].i,(i+3)%2==0?'\n':' ');
+ fprintf (file, "};\n\n");
+
+
+ fprintf(file, "const kiss_fft_state kfft = {\n");
+ fprintf(file, "%d, /* nfft */\n", kfft->nfft);
+ fprintf(file, "%#0.8gf, /* scale */\n", kfft->scale);
+ fprintf(file, "%d, /* shift */\n", kfft->shift);
+ fprintf(file, "{");
+ for (i=0;i<2*MAXFACTORS;i++) {
+ fprintf(file, "%d, ", kfft->factors[i]);
+ }
+ fprintf(file, "}, /* factors */\n");
+ fprintf(file, "fft_bitrev, /* bitrev*/\n");
+ fprintf(file, "fft_twiddles, /* twiddles*/\n");
+ fprintf(file, "(arch_fft_state *)&arch_fft, /* arch_fft*/\n");
+
+ fprintf(file, "};\n\n");
+
+ for (i=0;i<OVERLAP_SIZE;i++)
+ half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/OVERLAP_SIZE) * sin(.5*M_PI*(i+.5)/OVERLAP_SIZE));
+ fprintf(file, "const float half_window[] = {\n");
+ for (i=0;i<OVERLAP_SIZE;i++)
+ fprintf (file, "%#0.9gf,%c", half_window[i],(i+6)%5==0?'\n':' ');
+ fprintf(file, "};\n\n");
+
+ for (i=0;i<NB_BANDS;i++) {
+ int j;
+ for (j=0;j<NB_BANDS;j++) {
+ dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS);
+ if (j==0) dct_table[i*NB_BANDS + j] *= sqrt(.5);
+ }
+ }
+ fprintf(file, "const float dct_table[] = {\n");
+ for (i=0;i<NB_BANDS*NB_BANDS;i++)
+ fprintf (file, "%#0.9gf,%c", dct_table[i],(i+6)%5==0?'\n':' ');
+ fprintf(file, "};\n");
+
+ fclose(file);
+ return 0;
+}
diff --git a/dnn/freq.c b/dnn/freq.c
new file mode 100644
index 00000000..ee867d39
--- /dev/null
+++ b/dnn/freq.c
@@ -0,0 +1,329 @@
+/* Copyright (c) 2017-2018 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "kiss_fft.h"
+#include "common.h"
+#include <math.h>
+#include "freq.h"
+#include "pitch.h"
+#include "arch.h"
+#include "burg.h"
+#include <assert.h>
+
+#define SQUARE(x) ((x)*(x))
+
+static const opus_int16 eband5ms[] = {
+/*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k*/
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40
+};
+
+static const float compensation[] = {
+ 0.8f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.666667f, 0.5f, 0.5f, 0.5f, 0.333333f, 0.25f, 0.25f, 0.2f, 0.166667f, 0.173913f
+};
+
+
+extern const kiss_fft_state kfft;
+extern const float half_window[OVERLAP_SIZE];
+extern const float dct_table[NB_BANDS*NB_BANDS];
+
+
+static void compute_band_energy_inverse(float *bandE, const kiss_fft_cpx *X) {
+ int i;
+ float sum[NB_BANDS] = {0};
+ for (i=0;i<NB_BANDS-1;i++)
+ {
+ int j;
+ int band_size;
+ band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;
+ for (j=0;j<band_size;j++) {
+ float tmp;
+ float frac = (float)j/band_size;
+ tmp = SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].r);
+ tmp += SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].i);
+ tmp = 1.f/(tmp + 1e-9);
+ sum[i] += (1-frac)*tmp;
+ sum[i+1] += frac*tmp;
+ }
+ }
+ sum[0] *= 2;
+ sum[NB_BANDS-1] *= 2;
+ for (i=0;i<NB_BANDS;i++)
+ {
+ bandE[i] = sum[i];
+ }
+}
+
/* Levinson-Durbin recursion: convert the autocorrelation ac[0..p] into
   LPC coefficients lpc[0..p-1] and reflection coefficients rc[0..p-1].
   Returns the residual prediction-error energy.
   NOTE(review): MULT32_32_Q31/SHR32/SHL32 presumably reduce to plain
   multiplies/no-ops in the float build and matter only for fixed-point
   -- confirm against arch.h. */
static float lpcn_lpc(
 opus_val16 *lpc, /* out: [0...p-1] LPC coefficients */
 opus_val16 *rc,  /* out: [0...p-1] reflection coefficients */
const opus_val32 *ac, /* in: [0...p] autocorrelation values */
int p
)
{
   int i, j;
   opus_val32 r;
   opus_val32 error = ac[0];

   RNN_CLEAR(lpc, p);
   RNN_CLEAR(rc, p);
   /* Zero autocorrelation means silence: leave all coefficients zero. */
   if (ac[0] != 0)
   {
      for (i = 0; i < p; i++) {
         /* Sum up this iteration's reflection coefficient */
         opus_val32 rr = 0;
         for (j = 0; j < i; j++)
            rr += MULT32_32_Q31(lpc[j],ac[i - j]);
         rr += SHR32(ac[i + 1],3);
         r = -SHL32(rr,3)/error;
         rc[i] = r;
         /* Update LPC coefficients and total error */
         lpc[i] = SHR32(r,3);
         /* Symmetric in-place update of the first i coefficients, done
            from both ends so each pair is read before it is written. */
         for (j = 0; j < (i+1)>>1; j++)
         {
            opus_val32 tmp1, tmp2;
            tmp1 = lpc[j];
            tmp2 = lpc[i-1-j];
            lpc[j] = tmp1 + MULT32_32_Q31(r,tmp2);
            lpc[i-1-j] = tmp2 + MULT32_32_Q31(r,tmp1);
         }

         error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
         /* Bail out once we get 30 dB gain */
         if (error<.001f*ac[0])
            break;
      }
   }
   return error;
}
+
+
+
+void lpcn_compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
+ int i;
+ float sum[NB_BANDS] = {0};
+ for (i=0;i<NB_BANDS-1;i++)
+ {
+ int j;
+ int band_size;
+ band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;
+ for (j=0;j<band_size;j++) {
+ float tmp;
+ float frac = (float)j/band_size;
+ tmp = SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].r);
+ tmp += SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].i);
+ sum[i] += (1-frac)*tmp;
+ sum[i+1] += frac*tmp;
+ }
+ }
+ sum[0] *= 2;
+ sum[NB_BANDS-1] *= 2;
+ for (i=0;i<NB_BANDS;i++)
+ {
+ bandE[i] = sum[i];
+ }
+}
+
/* Cepstrum of a Burg-method spectral fit of pcm[0..len-1]: Burg LPC on
   the pre-emphasized signal, envelope via band energies of 1/|A|^2,
   log compression with floor/follower limiting, then DCT. */
static void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order) {
  int i;
  float burg_in[FRAME_SIZE];
  float burg_lpc[LPC_ORDER];
  float x[WINDOW_SIZE];
  float Eburg[NB_BANDS];
  float g;
  kiss_fft_cpx LPC[FREQ_SIZE];
  float Ly[NB_BANDS];
  float logMax = -2;
  float follow = -2;
  assert(order <= LPC_ORDER);
  assert(len <= FRAME_SIZE);
  /* Pre-emphasize the analysis signal (one sample shorter). */
  for (i=0;i<len-1;i++) burg_in[i] = pcm[i+1] - PREEMPHASIS*pcm[i];
  g = silk_burg_analysis(burg_lpc, burg_in, 1e-3, len-1, 1, order);
  /* NOTE(review): gain normalization presumably compensates for the
     effective number of samples used by silk_burg_analysis -- confirm
     against its definition. */
  g /= len - 2*(order-1);
  /* Impulse response of the (slightly bandwidth-expanded) prediction
     filter A(z/.995), zero-padded to a full window. */
  RNN_CLEAR(x, WINDOW_SIZE);
  x[0] = 1;
  for (i=0;i<order;i++) x[i+1] = -burg_lpc[i]*pow(.995, i+1);
  forward_transform(LPC, x);
  /* 1/|A|^2 band energies give the Burg spectral envelope. */
  compute_band_energy_inverse(Eburg, LPC);
  for (i=0;i<NB_BANDS;i++) Eburg[i] *= .45*g*(1.f/((float)WINDOW_SIZE*WINDOW_SIZE*WINDOW_SIZE));
  /* Log-compress with an 8-below-max floor and a limited band-to-band
     downward slope (follower), like the main feature analysis. */
  for (i=0;i<NB_BANDS;i++) {
    Ly[i] = log10(1e-2+Eburg[i]);
    Ly[i] = MAX16(logMax-8, MAX16(follow-2.5, Ly[i]));
    logMax = MAX16(logMax, Ly[i]);
    follow = MAX16(follow-2.5, Ly[i]);
  }
  dct(burg_cepstrum, Ly);
  /* Fixed DC offset; the inverse path adds it back (see
     lpc_from_cepstrum). */
  burg_cepstrum[0] += - 4;
}
+
+void burg_cepstral_analysis(float *ceps, const float *x) {
+ int i;
+ compute_burg_cepstrum(x, &ceps[0 ], FRAME_SIZE/2, LPC_ORDER);
+ compute_burg_cepstrum(&x[FRAME_SIZE/2], &ceps[NB_BANDS], FRAME_SIZE/2, LPC_ORDER);
+ for (i=0;i<NB_BANDS;i++) {
+ float c0, c1;
+ c0 = ceps[i];
+ c1 = ceps[NB_BANDS+i];
+ ceps[i ] = .5*(c0+c1);
+ ceps[NB_BANDS+i] = (c0-c1);
+ }
+}
+
+
+static void interp_band_gain(float *g, const float *bandE) {
+ int i;
+ memset(g, 0, FREQ_SIZE);
+ for (i=0;i<NB_BANDS-1;i++)
+ {
+ int j;
+ int band_size;
+ band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;
+ for (j=0;j<band_size;j++) {
+ float frac = (float)j/band_size;
+ g[(eband5ms[i]*WINDOW_SIZE_5MS) + j] = (1-frac)*bandE[i] + frac*bandE[i+1];
+ }
+ }
+}
+
+
+void dct(float *out, const float *in) {
+ int i;
+ for (i=0;i<NB_BANDS;i++) {
+ int j;
+ float sum = 0;
+ for (j=0;j<NB_BANDS;j++) {
+ sum += in[j] * dct_table[j*NB_BANDS + i];
+ }
+ out[i] = sum*sqrt(2./NB_BANDS);
+ }
+}
+
+static void idct(float *out, const float *in) {
+ int i;
+ for (i=0;i<NB_BANDS;i++) {
+ int j;
+ float sum = 0;
+ for (j=0;j<NB_BANDS;j++) {
+ sum += in[j] * dct_table[i*NB_BANDS + j];
+ }
+ out[i] = sum*sqrt(2./NB_BANDS);
+ }
+}
+
+void forward_transform(kiss_fft_cpx *out, const float *in) {
+ int i;
+ kiss_fft_cpx x[WINDOW_SIZE];
+ kiss_fft_cpx y[WINDOW_SIZE];
+ for (i=0;i<WINDOW_SIZE;i++) {
+ x[i].r = in[i];
+ x[i].i = 0;
+ }
+ opus_fft(&kfft, x, y, 0);
+ for (i=0;i<FREQ_SIZE;i++) {
+ out[i] = y[i];
+ }
+}
+
/* Inverse FFT of a half-spectrum: rebuild the conjugate-symmetric upper
   half, run the *forward* FFT, then read the result out in reverse
   order -- a standard trick for getting an IFFT from a forward FFT.
   Output is scaled by WINDOW_SIZE (not normalized). */
static void inverse_transform(float *out, const kiss_fft_cpx *in) {
  int i;
  kiss_fft_cpx x[WINDOW_SIZE];
  kiss_fft_cpx y[WINDOW_SIZE];
  for (i=0;i<FREQ_SIZE;i++) {
    x[i] = in[i];
  }
  /* i carries over from the previous loop (== FREQ_SIZE): fill the
     upper half with the conjugate mirror image. */
  for (;i<WINDOW_SIZE;i++) {
    x[i].r = x[WINDOW_SIZE - i].r;
    x[i].i = -x[WINDOW_SIZE - i].i;
  }
  opus_fft(&kfft, x, y, 0);
  /* output in reverse order for IFFT. */
  out[0] = WINDOW_SIZE*y[0].r;
  for (i=1;i<WINDOW_SIZE;i++) {
    out[i] = WINDOW_SIZE*y[WINDOW_SIZE - i].r;
  }
}
+
/* Derive LPC coefficients from a band-energy envelope Ex: interpolate
   the bands to a full power spectrum, obtain the autocorrelation via an
   inverse transform (Wiener-Khinchin), regularize, and run
   Levinson-Durbin. Returns the prediction-error energy. */
static float lpc_from_bands(float *lpc, const float *Ex)
{
  int i;
  float e;
  float ac[LPC_ORDER+1];
  float rc[LPC_ORDER];
  float Xr[FREQ_SIZE];
  kiss_fft_cpx X_auto[FREQ_SIZE];
  float x_auto[WINDOW_SIZE];
  interp_band_gain(Xr, Ex);
  /* Zero the Nyquist bin (not covered by the band interpolation). */
  Xr[FREQ_SIZE-1] = 0;
  RNN_CLEAR(X_auto, FREQ_SIZE);
  for (i=0;i<FREQ_SIZE;i++) X_auto[i].r = Xr[i];
  /* Power spectrum -> autocorrelation. */
  inverse_transform(x_auto, X_auto);
  for (i=0;i<LPC_ORDER+1;i++) ac[i] = x_auto[i];

  /* -40 dB noise floor. */
  ac[0] += ac[0]*1e-4 + 320/12/38.;
  /* Lag windowing. */
  for (i=1;i<LPC_ORDER+1;i++) ac[i] *= (1 - 6e-5*i*i);
  e = lpcn_lpc(lpc, rc, ac, LPC_ORDER);
  return e;
}
+
+void lpc_weighting(float *lpc, float gamma)
+{
+ int i;
+ float gamma_i = gamma;
+ for (i = 0; i < LPC_ORDER; i++)
+ {
+ lpc[i] *= gamma_i;
+ gamma_i *= gamma;
+ }
+}
+
/* Convert a feature cepstrum back to LPC coefficients: restore the +4
   DC offset removed at analysis time, inverse-DCT to log band energies,
   exponentiate with per-band compensation, and fit LPC to the result.
   Returns the Levinson-Durbin prediction-error energy. */
float lpc_from_cepstrum(float *lpc, const float *cepstrum)
{
  int i;
  float Ex[NB_BANDS];
  float tmp[NB_BANDS];
  RNN_COPY(tmp, cepstrum, NB_BANDS);
  /* Undo the "-4" DC offset applied by the analysis (see
     compute_burg_cepstrum). */
  tmp[0] += 4;
  idct(Ex, tmp);
  for (i=0;i<NB_BANDS;i++) Ex[i] = pow(10.f, Ex[i])*compensation[i];
  return lpc_from_bands(lpc, Ex);
}
+
+void apply_window(float *x) {
+ int i;
+ for (i=0;i<OVERLAP_SIZE;i++) {
+ x[i] *= half_window[i];
+ x[WINDOW_SIZE - 1 - i] *= half_window[i];
+ }
+}
+
diff --git a/dnn/freq.h b/dnn/freq.h
new file mode 100644
index 00000000..99ebf567
--- /dev/null
+++ b/dnn/freq.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2017-2018 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FREQ_H
+#define FREQ_H
+
+#include "kiss_fft.h"
+
+#define LPC_ORDER 16
+
+#define PREEMPHASIS (0.85f)
+
+#define FRAME_SIZE_5MS (2)
+#define OVERLAP_SIZE_5MS (2)
+#define TRAINING_OFFSET_5MS (1)
+
+#define WINDOW_SIZE_5MS (FRAME_SIZE_5MS + OVERLAP_SIZE_5MS)
+
+#define FRAME_SIZE (80*FRAME_SIZE_5MS)
+#define OVERLAP_SIZE (80*OVERLAP_SIZE_5MS)
+#define TRAINING_OFFSET (80*TRAINING_OFFSET_5MS)
+#define WINDOW_SIZE (FRAME_SIZE + OVERLAP_SIZE)
+#define FREQ_SIZE (WINDOW_SIZE/2 + 1)
+
+#define NB_BANDS 18
+#define NB_BANDS_1 (NB_BANDS - 1)
+
+void lpcn_compute_band_energy(float *bandE, const kiss_fft_cpx *X);
+void burg_cepstral_analysis(float *ceps, const float *x);
+
+void apply_window(float *x);
+void dct(float *out, const float *in);
+void forward_transform(kiss_fft_cpx *out, const float *in);
+float lpc_from_cepstrum(float *lpc, const float *cepstrum);
+void apply_window(float *x);
+void lpc_weighting(float *lpc, float gamma);
+
+#endif
diff --git a/dnn/include/dred_rdovae.h b/dnn/include/dred_rdovae.h
new file mode 100644
index 00000000..f2c3235e
--- /dev/null
+++ b/dnn/include/dred_rdovae.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
#ifndef DRED_RDOVAE_H
#define DRED_RDOVAE_H

#include <stdlib.h>

#include "opus_types.h"

/* Opaque model-weight containers and per-stream encoder/decoder states. */
typedef struct RDOVAEDec RDOVAEDec;
typedef struct RDOVAEEnc RDOVAEEnc;
typedef struct RDOVAEDecStruct RDOVAEDecState;
typedef struct RDOVAEEncStruct RDOVAEEncState;

/* Decode nb_latents latent vectors (given the initial decoder state)
   into feature frames written to features. */
void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents);


/* Sizes of the opaque state structures, for callers that allocate
   storage themselves. */
size_t DRED_rdovae_get_enc_size(void);

size_t DRED_rdovae_get_dec_size(void);

/* Heap allocation / release of encoder and decoder states. */
RDOVAEDecState * DRED_rdovae_create_decoder(void);
RDOVAEEncState * DRED_rdovae_create_encoder(void);
void DRED_rdovae_destroy_decoder(RDOVAEDecState* h);
void DRED_rdovae_destroy_encoder(RDOVAEEncState* h);


/* Reset the encoder's recurrent state for a new stream. */
void DRED_rdovae_init_encoder(RDOVAEEncState *enc_state);

/* Encode one "dframe" of input features into latents and an initial
   decoder state, advancing the encoder's recurrent state. */
void DRED_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input);

/* Initialize decoder recurrent state from a (possibly quantized)
   initial-state vector. */
void DRED_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state);

/* Decode one quantized latent vector z into feature output qframe.
   NOTE(review): the number of feature frames per qframe depends on the
   model configuration -- confirm against the decoder implementation. */
void DRED_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z);

/* Entropy-coding tables trained alongside the model weights. */
const opus_uint16 * DRED_rdovae_get_p0_pointer(void);
const opus_uint16 * DRED_rdovae_get_dead_zone_pointer(void);
const opus_uint16 * DRED_rdovae_get_r_pointer(void);
const opus_uint16 * DRED_rdovae_get_quant_scales_pointer(void);

#endif
diff --git a/dnn/include/lpcnet.h b/dnn/include/lpcnet.h
new file mode 100644
index 00000000..fd12044c
--- /dev/null
+++ b/dnn/include/lpcnet.h
@@ -0,0 +1,217 @@
+/* Copyright (c) 2018 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _LPCNET_H_
+#define _LPCNET_H_
+
+#ifndef LPCNET_EXPORT
+# if defined(WIN32)
+# if defined(LPCNET_BUILD) && defined(DLL_EXPORT)
+# define LPCNET_EXPORT __declspec(dllexport)
+# else
+# define LPCNET_EXPORT
+# endif
+# elif defined(__GNUC__) && defined(LPCNET_BUILD)
+# define LPCNET_EXPORT __attribute__ ((visibility ("default")))
+# else
+# define LPCNET_EXPORT
+# endif
+#endif
+
+
+#define NB_FEATURES 20
+#define NB_TOTAL_FEATURES 36
+
+/** Number of bytes in a compressed packet. */
+#define LPCNET_COMPRESSED_SIZE 8
+/** Number of audio samples in a packet. */
+#define LPCNET_PACKET_SAMPLES (4*160)
+/** Number of audio samples in a feature frame (not for encoding/decoding). */
+#define LPCNET_FRAME_SIZE (160)
+
+typedef struct LPCNetState LPCNetState;
+
+typedef struct LPCNetDecState LPCNetDecState;
+
+typedef struct LPCNetEncState LPCNetEncState;
+
+typedef struct LPCNetPLCState LPCNetPLCState;
+
+
+/** Gets the size of an <code>LPCNetDecState</code> structure.
+ * @returns The size in bytes.
+ */
+LPCNET_EXPORT int lpcnet_decoder_get_size(void);
+
+/** Initializes a previously allocated decoder state
+ * The memory pointed to by st must be at least the size returned by lpcnet_decoder_get_size().
+ * This is intended for applications which use their own allocator instead of malloc.
+ * @see lpcnet_decoder_create(),lpcnet_decoder_get_size()
+ * @param [in] st <tt>LPCNetDecState*</tt>: Decoder state
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_decoder_init(LPCNetDecState *st);
+
+LPCNET_EXPORT void lpcnet_reset(LPCNetState *lpcnet);
+
+/** Allocates and initializes a decoder state.
+ * @returns The newly created state
+ */
+LPCNET_EXPORT LPCNetDecState *lpcnet_decoder_create(void);
+
+/** Frees an <code>LPCNetDecState</code> allocated by lpcnet_decoder_create().
+ * @param[in] st <tt>LPCNetDecState*</tt>: State to be freed.
+ */
+LPCNET_EXPORT void lpcnet_decoder_destroy(LPCNetDecState *st);
+
+/** Decodes a packet of LPCNET_COMPRESSED_SIZE bytes (currently 8) into LPCNET_PACKET_SAMPLES samples (currently 640).
+ * @param [in] st <tt>LPCNetDecState*</tt>: Decoder state
+ * @param [in] buf <tt>const unsigned char *</tt>: Compressed packet
+ * @param [out] pcm <tt>short *</tt>: Decoded audio
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_decode(LPCNetDecState *st, const unsigned char *buf, short *pcm);
+
+
+
+/** Gets the size of an <code>LPCNetEncState</code> structure.
+ * @returns The size in bytes.
+ */
+LPCNET_EXPORT int lpcnet_encoder_get_size(void);
+
+/** Initializes a previously allocated encoder state
+ * The memory pointed to by st must be at least the size returned by lpcnet_encoder_get_size().
+ * This is intended for applications which use their own allocator instead of malloc.
+ * @see lpcnet_encoder_create(),lpcnet_encoder_get_size()
+ * @param [in] st <tt>LPCNetEncState*</tt>: Encoder state
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_encoder_init(LPCNetEncState *st);
+
+/** Allocates and initializes an encoder state.
+ * @returns The newly created state
+ */
+LPCNET_EXPORT LPCNetEncState *lpcnet_encoder_create(void);
+
+/** Frees an <code>LPCNetEncState</code> allocated by lpcnet_encoder_create().
+ * @param[in] st <tt>LPCNetEncState*</tt>: State to be freed.
+ */
+LPCNET_EXPORT void lpcnet_encoder_destroy(LPCNetEncState *st);
+
+/** Encodes LPCNET_PACKET_SAMPLES speech samples (currently 640) into a packet of LPCNET_COMPRESSED_SIZE bytes (currently 8).
+ * @param [in] st <tt>LPCNetEncState*</tt>: Encoder state
+ * @param [in] pcm <tt>short *</tt>: Input speech to be encoded
+ * @param [out] buf <tt>unsigned char *</tt>: Compressed packet
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_encode(LPCNetEncState *st, const short *pcm, unsigned char *buf);
+
+/** Compute features on LPCNET_PACKET_SAMPLES speech samples (currently 640) and output features for 4 10-ms frames at once.
+ * @param [in] st <tt>LPCNetEncState*</tt>: Encoder state
+ * @param [in] pcm <tt>short *</tt>: Input speech to be analyzed
+ * @param [out] features <tt>float[4][NB_TOTAL_FEATURES]</tt>: Four feature vectors
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_compute_features(LPCNetEncState *st, const short *pcm, float features[4][NB_TOTAL_FEATURES]);
+
+/** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
+ * @param [in] st <tt>LPCNetEncState*</tt>: Encoder state
+ * @param [in] pcm <tt>short *</tt>: Input speech to be analyzed
+ * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: One feature vector
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_compute_single_frame_features(LPCNetEncState *st, const short *pcm, float features[NB_TOTAL_FEATURES]);
+
+
+/** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
+ * @param [in] st <tt>LPCNetEncState*</tt>: Encoder state
+ * @param [in] pcm <tt>float *</tt>: Input speech to be analyzed
+ * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: One feature vector
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]);
+
+/** Gets the size of an <code>LPCNetState</code> structure.
+ * @returns The size in bytes.
+ */
+LPCNET_EXPORT int lpcnet_get_size(void);
+
+/** Initializes a previously allocated synthesis state
+ * The memory pointed to by st must be at least the size returned by lpcnet_get_size().
+ * This is intended for applications which use their own allocator instead of malloc.
+ * @see lpcnet_create(),lpcnet_get_size()
+ * @param [in] st <tt>LPCNetState*</tt>: Synthesis state
+ * @retval 0 Success
+ */
+LPCNET_EXPORT int lpcnet_init(LPCNetState *st);
+
+/** Allocates and initializes a synthesis state.
+ * @returns The newly created state
+ */
+LPCNET_EXPORT LPCNetState *lpcnet_create(void);
+
+/** Frees an <code>LPCNetState</code> allocated by lpcnet_create().
+ * @param[in] st <tt>LPCNetState*</tt>: State to be freed.
+ */
+LPCNET_EXPORT void lpcnet_destroy(LPCNetState *st);
+
+/** Synthesizes speech from an LPCNet feature vector.
+ * @param [in] st <tt>LPCNetState*</tt>: Synthesis state
+ * @param [in] features <tt>const float *</tt>: Feature vector
+ * @param [out] output <tt>short *</tt>: Synthesized speech
+ * @param [in] N <tt>int</tt>: Number of samples to generate
+ * @retval 0 Success
+ */
+LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *st, const float *features, short *output, int N);
+
+
+#define LPCNET_PLC_CAUSAL 0
+#define LPCNET_PLC_NONCAUSAL 1
+#define LPCNET_PLC_CODEC 2
+
+#define LPCNET_PLC_DC_FILTER 4
+
+LPCNET_EXPORT int lpcnet_plc_get_size(void);
+
+LPCNET_EXPORT int lpcnet_plc_init(LPCNetPLCState *st, int options);
+LPCNET_EXPORT void lpcnet_plc_reset(LPCNetPLCState *st);
+
+LPCNET_EXPORT LPCNetPLCState *lpcnet_plc_create(int options);
+
+LPCNET_EXPORT void lpcnet_plc_destroy(LPCNetPLCState *st);
+
+LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm);
+
+LPCNET_EXPORT int lpcnet_plc_conceal(LPCNetPLCState *st, short *pcm);
+
+LPCNET_EXPORT void lpcnet_plc_fec_add(LPCNetPLCState *st, const float *features);
+
+LPCNET_EXPORT void lpcnet_plc_fec_clear(LPCNetPLCState *st);
+
+LPCNET_EXPORT int lpcnet_load_model(LPCNetState *st, const unsigned char *data, int len);
+LPCNET_EXPORT int lpcnet_plc_load_model(LPCNetPLCState *st, const unsigned char *data, int len);
+
+#endif
diff --git a/dnn/kiss99.c b/dnn/kiss99.c
new file mode 100644
index 00000000..325918f8
--- /dev/null
+++ b/dnn/kiss99.c
@@ -0,0 +1,81 @@
+/*Daala video codec
+Copyright (c) 2012 Daala project contributors. All rights reserved.
+Author: Timothy B. Terriberry
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "kiss99.h"
+
+void kiss99_srand(kiss99_ctx *_this,const unsigned char *_data,int _ndata){
+ int i;
+ _this->z=362436069;
+ _this->w=521288629;
+ _this->jsr=123456789;
+ _this->jcong=380116160;
+ for(i=3;i<_ndata;i+=4){
+ _this->z^=_data[i-3];
+ _this->w^=_data[i-2];
+ _this->jsr^=_data[i-1];
+ _this->jcong^=_data[i];
+ kiss99_rand(_this);
+ }
+ if(i-3<_ndata)_this->z^=_data[i-3];
+ if(i-2<_ndata)_this->w^=_data[i-2];
+ if(i-1<_ndata)_this->jsr^=_data[i-1];
+ /*Fix any potential short cycles that show up.
+ These are not too likely, given the way we initialize the state, but they
+ are technically possible, so let us go ahead and eliminate that
+ possibility.
+ See Gregory G. Rose: "KISS: A Bit Too Simple", Cryptographic Communications
+ No. 10, pp. 123---137, Cryptography and Communications, 2018.*/
+ if(_this->z==0||_this->z==0x9068FFFF)_this->z++;
+ if(_this->w==0||_this->w==0x464FFFFF)_this->w++;
+ if(_this->jsr==0)_this->jsr++;
+}
+
+uint32_t kiss99_rand(kiss99_ctx *_this){
+ uint32_t znew;
+ uint32_t wnew;
+ uint32_t mwc;
+ uint32_t shr3;
+ uint32_t cong;
+ znew=36969*(_this->z&0xFFFF)+(_this->z>>16);
+ wnew=18000*(_this->w&0xFFFF)+(_this->w>>16);
+ mwc=(znew<<16)+wnew;
+ /*We swap the 13 and 17 from the original 1999 algorithm to produce a single
+ cycle of maximal length, matching KISS11.
+ We are not actually using KISS11 because of the impractically large (16 MB)
+ internal state of the full algorithm.*/
+ shr3=_this->jsr^(_this->jsr<<13);
+ shr3^=shr3>>17;
+ shr3^=shr3<<5;
+ cong=69069*_this->jcong+1234567;
+ _this->z=znew;
+ _this->w=wnew;
+ _this->jsr=shr3;
+ _this->jcong=cong;
+ return (mwc^cong)+shr3;
+}
diff --git a/dnn/kiss99.h b/dnn/kiss99.h
new file mode 100644
index 00000000..28646dbc
--- /dev/null
+++ b/dnn/kiss99.h
@@ -0,0 +1,46 @@
+/*Daala video codec
+Copyright (c) 2012 Daala project contributors. All rights reserved.
+Author: Timothy B. Terriberry
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#if !defined(_kiss99_H)
+# define _kiss99_H (1)
+# include <stdint.h>
+
+/*KISS PRNG from George Marsaglia (1999 version).
+ See https://en.wikipedia.org/wiki/KISS_(algorithm) for details.
+ This is suitable for simulations, but not for use in cryptographic contexts.*/
+
+typedef struct kiss99_ctx kiss99_ctx;
+
+struct kiss99_ctx{
+ uint32_t z;
+ uint32_t w;
+ uint32_t jsr;
+ uint32_t jcong;
+};
+
+void kiss99_srand(kiss99_ctx *_this,const unsigned char *_data,int _ndata);
+uint32_t kiss99_rand(kiss99_ctx *_this);
+
+#endif
diff --git a/dnn/kiss_fft.c b/dnn/kiss_fft.c
new file mode 100644
index 00000000..e647214d
--- /dev/null
+++ b/dnn/kiss_fft.c
@@ -0,0 +1,601 @@
+/*Copyright (c) 2003-2004, Mark Borgerding
+ Lots of modifications by Jean-Marc Valin
+ Copyright (c) 2005-2007, Xiph.Org Foundation
+ Copyright (c) 2008, Xiph.Org Foundation, CSIRO
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.*/
+
+/* This code is originally from Mark Borgerding's KISS-FFT but has been
+ heavily modified to better suit Opus */
+
+#ifndef SKIP_CONFIG_H
+# ifdef HAVE_CONFIG_H
+# include "config.h"
+# endif
+#endif
+
+#include "_kiss_fft_guts.h"
+#define CUSTOM_MODES
+
+/* The guts header contains all the multiplication and addition macros that are defined for
+ complex numbers. It also declares the kf_ internal functions.
+*/
+
+static void kf_bfly2(
+ kiss_fft_cpx * Fout,
+ int m,
+ int N
+ )
+{
+ kiss_fft_cpx * Fout2;
+ int i;
+ (void)m;
+#ifdef CUSTOM_MODES
+ if (m==1)
+ {
+ celt_assert(m==1);
+ for (i=0;i<N;i++)
+ {
+ kiss_fft_cpx t;
+ Fout2 = Fout + 1;
+ t = *Fout2;
+ C_SUB( *Fout2 , *Fout , t );
+ C_ADDTO( *Fout , t );
+ Fout += 2;
+ }
+ } else
+#endif
+ {
+ opus_val16 tw;
+ tw = QCONST16(0.7071067812f, 15);
+ /* We know that m==4 here because the radix-2 is just after a radix-4 */
+ celt_assert(m==4);
+ for (i=0;i<N;i++)
+ {
+ kiss_fft_cpx t;
+ Fout2 = Fout + 4;
+ t = Fout2[0];
+ C_SUB( Fout2[0] , Fout[0] , t );
+ C_ADDTO( Fout[0] , t );
+
+ t.r = S_MUL(ADD32_ovflw(Fout2[1].r, Fout2[1].i), tw);
+ t.i = S_MUL(SUB32_ovflw(Fout2[1].i, Fout2[1].r), tw);
+ C_SUB( Fout2[1] , Fout[1] , t );
+ C_ADDTO( Fout[1] , t );
+
+ t.r = Fout2[2].i;
+ t.i = -Fout2[2].r;
+ C_SUB( Fout2[2] , Fout[2] , t );
+ C_ADDTO( Fout[2] , t );
+
+ t.r = S_MUL(SUB32_ovflw(Fout2[3].i, Fout2[3].r), tw);
+ t.i = S_MUL(NEG32_ovflw(ADD32_ovflw(Fout2[3].i, Fout2[3].r)), tw);
+ C_SUB( Fout2[3] , Fout[3] , t );
+ C_ADDTO( Fout[3] , t );
+ Fout += 8;
+ }
+ }
+}
+
+static void kf_bfly4(
+ kiss_fft_cpx * Fout,
+ const size_t fstride,
+ const kiss_fft_state *st,
+ int m,
+ int N,
+ int mm
+ )
+{
+ int i;
+
+ if (m==1)
+ {
+ /* Degenerate case where all the twiddles are 1. */
+ for (i=0;i<N;i++)
+ {
+ kiss_fft_cpx scratch0, scratch1;
+
+ C_SUB( scratch0 , *Fout, Fout[2] );
+ C_ADDTO(*Fout, Fout[2]);
+ C_ADD( scratch1 , Fout[1] , Fout[3] );
+ C_SUB( Fout[2], *Fout, scratch1 );
+ C_ADDTO( *Fout , scratch1 );
+ C_SUB( scratch1 , Fout[1] , Fout[3] );
+
+ Fout[1].r = ADD32_ovflw(scratch0.r, scratch1.i);
+ Fout[1].i = SUB32_ovflw(scratch0.i, scratch1.r);
+ Fout[3].r = SUB32_ovflw(scratch0.r, scratch1.i);
+ Fout[3].i = ADD32_ovflw(scratch0.i, scratch1.r);
+ Fout+=4;
+ }
+ } else {
+ int j;
+ kiss_fft_cpx scratch[6];
+ const kiss_twiddle_cpx *tw1,*tw2,*tw3;
+ const int m2=2*m;
+ const int m3=3*m;
+ kiss_fft_cpx * Fout_beg = Fout;
+ for (i=0;i<N;i++)
+ {
+ Fout = Fout_beg + i*mm;
+ tw3 = tw2 = tw1 = st->twiddles;
+ /* m is guaranteed to be a multiple of 4. */
+ for (j=0;j<m;j++)
+ {
+ C_MUL(scratch[0],Fout[m] , *tw1 );
+ C_MUL(scratch[1],Fout[m2] , *tw2 );
+ C_MUL(scratch[2],Fout[m3] , *tw3 );
+
+ C_SUB( scratch[5] , *Fout, scratch[1] );
+ C_ADDTO(*Fout, scratch[1]);
+ C_ADD( scratch[3] , scratch[0] , scratch[2] );
+ C_SUB( scratch[4] , scratch[0] , scratch[2] );
+ C_SUB( Fout[m2], *Fout, scratch[3] );
+ tw1 += fstride;
+ tw2 += fstride*2;
+ tw3 += fstride*3;
+ C_ADDTO( *Fout , scratch[3] );
+
+ Fout[m].r = ADD32_ovflw(scratch[5].r, scratch[4].i);
+ Fout[m].i = SUB32_ovflw(scratch[5].i, scratch[4].r);
+ Fout[m3].r = SUB32_ovflw(scratch[5].r, scratch[4].i);
+ Fout[m3].i = ADD32_ovflw(scratch[5].i, scratch[4].r);
+ ++Fout;
+ }
+ }
+ }
+}
+
+
+#ifndef RADIX_TWO_ONLY
+
+static void kf_bfly3(
+ kiss_fft_cpx * Fout,
+ const size_t fstride,
+ const kiss_fft_state *st,
+ int m,
+ int N,
+ int mm
+ )
+{
+ int i;
+ size_t k;
+ const size_t m2 = 2*m;
+ const kiss_twiddle_cpx *tw1,*tw2;
+ kiss_fft_cpx scratch[5];
+ kiss_twiddle_cpx epi3;
+
+ kiss_fft_cpx * Fout_beg = Fout;
+#ifdef FIXED_POINT
+ /*epi3.r = -16384;*/ /* Unused */
+ epi3.i = -28378;
+#else
+ epi3 = st->twiddles[fstride*m];
+#endif
+ for (i=0;i<N;i++)
+ {
+ Fout = Fout_beg + i*mm;
+ tw1=tw2=st->twiddles;
+ /* For non-custom modes, m is guaranteed to be a multiple of 4. */
+ k=m;
+ do {
+
+ C_MUL(scratch[1],Fout[m] , *tw1);
+ C_MUL(scratch[2],Fout[m2] , *tw2);
+
+ C_ADD(scratch[3],scratch[1],scratch[2]);
+ C_SUB(scratch[0],scratch[1],scratch[2]);
+ tw1 += fstride;
+ tw2 += fstride*2;
+
+ Fout[m].r = SUB32_ovflw(Fout->r, HALF_OF(scratch[3].r));
+ Fout[m].i = SUB32_ovflw(Fout->i, HALF_OF(scratch[3].i));
+
+ C_MULBYSCALAR( scratch[0] , epi3.i );
+
+ C_ADDTO(*Fout,scratch[3]);
+
+ Fout[m2].r = ADD32_ovflw(Fout[m].r, scratch[0].i);
+ Fout[m2].i = SUB32_ovflw(Fout[m].i, scratch[0].r);
+
+ Fout[m].r = SUB32_ovflw(Fout[m].r, scratch[0].i);
+ Fout[m].i = ADD32_ovflw(Fout[m].i, scratch[0].r);
+
+ ++Fout;
+ } while(--k);
+ }
+}
+
+
+#ifndef OVERRIDE_kf_bfly5
+static void kf_bfly5(
+ kiss_fft_cpx * Fout,
+ const size_t fstride,
+ const kiss_fft_state *st,
+ int m,
+ int N,
+ int mm
+ )
+{
+ kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+ int i, u;
+ kiss_fft_cpx scratch[13];
+ const kiss_twiddle_cpx *tw;
+ kiss_twiddle_cpx ya,yb;
+ kiss_fft_cpx * Fout_beg = Fout;
+
+#ifdef FIXED_POINT
+ ya.r = 10126;
+ ya.i = -31164;
+ yb.r = -26510;
+ yb.i = -19261;
+#else
+ ya = st->twiddles[fstride*m];
+ yb = st->twiddles[fstride*2*m];
+#endif
+ tw=st->twiddles;
+
+ for (i=0;i<N;i++)
+ {
+ Fout = Fout_beg + i*mm;
+ Fout0=Fout;
+ Fout1=Fout0+m;
+ Fout2=Fout0+2*m;
+ Fout3=Fout0+3*m;
+ Fout4=Fout0+4*m;
+
+ /* For non-custom modes, m is guaranteed to be a multiple of 4. */
+ for ( u=0; u<m; ++u ) {
+ scratch[0] = *Fout0;
+
+ C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
+ C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
+ C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
+ C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
+
+ C_ADD( scratch[7],scratch[1],scratch[4]);
+ C_SUB( scratch[10],scratch[1],scratch[4]);
+ C_ADD( scratch[8],scratch[2],scratch[3]);
+ C_SUB( scratch[9],scratch[2],scratch[3]);
+
+ Fout0->r = ADD32_ovflw(Fout0->r, ADD32_ovflw(scratch[7].r, scratch[8].r));
+ Fout0->i = ADD32_ovflw(Fout0->i, ADD32_ovflw(scratch[7].i, scratch[8].i));
+
+ scratch[5].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,ya.r), S_MUL(scratch[8].r,yb.r)));
+ scratch[5].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,ya.r), S_MUL(scratch[8].i,yb.r)));
+
+ scratch[6].r = ADD32_ovflw(S_MUL(scratch[10].i,ya.i), S_MUL(scratch[9].i,yb.i));
+ scratch[6].i = NEG32_ovflw(ADD32_ovflw(S_MUL(scratch[10].r,ya.i), S_MUL(scratch[9].r,yb.i)));
+
+ C_SUB(*Fout1,scratch[5],scratch[6]);
+ C_ADD(*Fout4,scratch[5],scratch[6]);
+
+ scratch[11].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,yb.r), S_MUL(scratch[8].r,ya.r)));
+ scratch[11].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,yb.r), S_MUL(scratch[8].i,ya.r)));
+ scratch[12].r = SUB32_ovflw(S_MUL(scratch[9].i,ya.i), S_MUL(scratch[10].i,yb.i));
+ scratch[12].i = SUB32_ovflw(S_MUL(scratch[10].r,yb.i), S_MUL(scratch[9].r,ya.i));
+
+ C_ADD(*Fout2,scratch[11],scratch[12]);
+ C_SUB(*Fout3,scratch[11],scratch[12]);
+
+ ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+ }
+ }
+}
+#endif /* OVERRIDE_kf_bfly5 */
+
+
+#endif
+
+
+#ifdef CUSTOM_MODES
+
+static
+void compute_bitrev_table(
+ int Fout,
+ opus_int16 *f,
+ const size_t fstride,
+ int in_stride,
+ opus_int16 * factors,
+ const kiss_fft_state *st
+ )
+{
+ const int p=*factors++; /* the radix */
+ const int m=*factors++; /* stage's fft length/p */
+
+ /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
+ if (m==1)
+ {
+ int j;
+ for (j=0;j<p;j++)
+ {
+ *f = Fout+j;
+ f += fstride*in_stride;
+ }
+ } else {
+ int j;
+ for (j=0;j<p;j++)
+ {
+ compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
+ f += fstride*in_stride;
+ Fout += m;
+ }
+ }
+}
+
+/* facbuf is populated by p1,m1,p2,m2, ...
+ where
+ p[i] * m[i] = m[i-1]
+ m0 = n */
+static
+int kf_factor(int n,opus_int16 * facbuf)
+{
+ int p=4;
+ int i;
+ int stages=0;
+ int nbak = n;
+
+ /*factor out powers of 4, powers of 2, then any remaining primes */
+ do {
+ while (n % p) {
+ switch (p) {
+ case 4: p = 2; break;
+ case 2: p = 3; break;
+ default: p += 2; break;
+ }
+ if (p>32000 || (opus_int32)p*(opus_int32)p > n)
+ p = n; /* no more factors, skip to end */
+ }
+ n /= p;
+#ifdef RADIX_TWO_ONLY
+ if (p!=2 && p != 4)
+#else
+ if (p>5)
+#endif
+ {
+ return 0;
+ }
+ facbuf[2*stages] = p;
+ if (p==2 && stages > 1)
+ {
+ facbuf[2*stages] = 4;
+ facbuf[2] = 2;
+ }
+ stages++;
+ } while (n > 1);
+ n = nbak;
+ /* Reverse the order to get the radix 4 at the end, so we can use the
+ fast degenerate case. It turns out that reversing the order also
+ improves the noise behaviour. */
+ for (i=0;i<stages/2;i++)
+ {
+ int tmp;
+ tmp = facbuf[2*i];
+ facbuf[2*i] = facbuf[2*(stages-i-1)];
+ facbuf[2*(stages-i-1)] = tmp;
+ }
+ for (i=0;i<stages;i++)
+ {
+ n /= facbuf[2*i];
+ facbuf[2*i+1] = n;
+ }
+ return 1;
+}
+
+static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
+{
+ int i;
+#ifdef FIXED_POINT
+ for (i=0;i<nfft;++i) {
+ opus_val32 phase = -i;
+ kf_cexp2(twiddles+i, DIV32(SHL32(phase,17),nfft));
+ }
+#else
+ for (i=0;i<nfft;++i) {
+ const double pi=3.14159265358979323846264338327;
+ double phase = ( -2*pi /nfft ) * i;
+ kf_cexp(twiddles+i, phase );
+ }
+#endif
+}
+
+int opus_fft_alloc_arch_c(kiss_fft_state *st) {
+ (void)st;
+ return 0;
+}
+
+/*
+ *
+ * Allocates all necessary storage space for the fft and ifft.
+ * The return value is a contiguous block of memory. As such,
+ * It can be freed with free().
+ * */
+kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
+ const kiss_fft_state *base, int arch)
+{
+ kiss_fft_state *st=NULL;
+ size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/
+
+ if ( lenmem==NULL ) {
+ st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
+ }else{
+ if (mem != NULL && *lenmem >= memneeded)
+ st = (kiss_fft_state*)mem;
+ *lenmem = memneeded;
+ }
+ if (st) {
+ opus_int16 *bitrev;
+ kiss_twiddle_cpx *twiddles;
+
+ st->nfft=nfft;
+#ifdef FIXED_POINT
+ st->scale_shift = celt_ilog2(st->nfft);
+ if (st->nfft == 1<<st->scale_shift)
+ st->scale = Q15ONE;
+ else
+ st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
+#else
+ st->scale = 1.f/nfft;
+#endif
+ if (base != NULL)
+ {
+ st->twiddles = base->twiddles;
+ st->shift = 0;
+ while (st->shift < 32 && nfft<<st->shift != base->nfft)
+ st->shift++;
+ if (st->shift>=32)
+ goto fail;
+ } else {
+ st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
+ compute_twiddles(twiddles, nfft);
+ st->shift = -1;
+ }
+ if (!kf_factor(nfft,st->factors))
+ {
+ goto fail;
+ }
+
+ /* bitrev */
+ st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
+ if (st->bitrev==NULL)
+ goto fail;
+ compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
+
+ /* Initialize architecture specific fft parameters */
+ if (opus_fft_alloc_arch(st, arch))
+ goto fail;
+ }
+ return st;
+fail:
+ opus_fft_free(st, arch);
+ return NULL;
+}
+
+kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch)
+{
+ return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch);
+}
+
+void opus_fft_free_arch_c(kiss_fft_state *st) {
+ (void)st;
+}
+
+void opus_fft_free(const kiss_fft_state *cfg, int arch)
+{
+ if (cfg)
+ {
+ opus_fft_free_arch((kiss_fft_state *)cfg, arch);
+ free((opus_int16*)cfg->bitrev);
+ if (cfg->shift < 0)
+ free((kiss_twiddle_cpx*)cfg->twiddles);
+ free((kiss_fft_state*)cfg);
+ }
+}
+
+#endif /* CUSTOM_MODES */
+
+void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
+{
+ int m2, m;
+ int p;
+ int L;
+ int fstride[MAXFACTORS];
+ int i;
+ int shift;
+
+ /* st->shift can be -1 */
+ shift = st->shift>0 ? st->shift : 0;
+
+ fstride[0] = 1;
+ L=0;
+ do {
+ p = st->factors[2*L];
+ m = st->factors[2*L+1];
+ fstride[L+1] = fstride[L]*p;
+ L++;
+ } while(m!=1);
+ m = st->factors[2*L-1];
+ for (i=L-1;i>=0;i--)
+ {
+ if (i!=0)
+ m2 = st->factors[2*i-1];
+ else
+ m2 = 1;
+ switch (st->factors[2*i])
+ {
+ case 2:
+ kf_bfly2(fout, m, fstride[i]);
+ break;
+ case 4:
+ kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+ break;
+ #ifndef RADIX_TWO_ONLY
+ case 3:
+ kf_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+ break;
+ case 5:
+ kf_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+ break;
+ #endif
+ }
+ m = m2;
+ }
+}
+
+void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+{
+ int i;
+ opus_val16 scale;
+#ifdef FIXED_POINT
+ /* Allows us to scale with MULT16_32_Q16(), which is faster than
+ MULT16_32_Q15() on ARM. */
+ int scale_shift = st->scale_shift-1;
+#endif
+ scale = st->scale;
+
+ celt_assert2 (fin != fout, "In-place FFT not supported");
+ /* Bit-reverse the input */
+ for (i=0;i<st->nfft;i++)
+ {
+ kiss_fft_cpx x = fin[i];
+ fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift);
+ fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
+ }
+ opus_fft_impl(st, fout);
+}
+
+
+void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+{
+ int i;
+ celt_assert2 (fin != fout, "In-place FFT not supported");
+ /* Bit-reverse the input */
+ for (i=0;i<st->nfft;i++)
+ fout[st->bitrev[i]] = fin[i];
+ for (i=0;i<st->nfft;i++)
+ fout[i].i = -fout[i].i;
+ opus_fft_impl(st, fout);
+ for (i=0;i<st->nfft;i++)
+ fout[i].i = -fout[i].i;
+}
diff --git a/dnn/kiss_fft.h b/dnn/kiss_fft.h
new file mode 100644
index 00000000..acbc1927
--- /dev/null
+++ b/dnn/kiss_fft.h
@@ -0,0 +1,202 @@
+/*Copyright (c) 2003-2004, Mark Borgerding
+ Lots of modifications by Jean-Marc Valin
+ Copyright (c) 2005-2007, Xiph.Org Foundation
+ Copyright (c) 2008, Xiph.Org Foundation, CSIRO
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_H
+#define KISS_FFT_H
+
+#include <stdlib.h>
+#include <math.h>
+#include "arch.h"
+
+#include <stdlib.h>
+#define lpcnet_alloc(x) malloc(x)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef USE_SIMD
+# include <xmmintrin.h>
+# define kiss_fft_scalar __m128
+#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
+#else
+#define KISS_FFT_MALLOC lpcnet_alloc
+#endif
+
+#ifdef FIXED_POINT
+#include "arch.h"
+
+# define kiss_fft_scalar opus_int32
+# define kiss_twiddle_scalar opus_int16
+
+
+#else
+# ifndef kiss_fft_scalar
+/* default is float */
+# define kiss_fft_scalar float
+# define kiss_twiddle_scalar float
+# define KF_SUFFIX _celt_single
+# endif
+#endif
+
+typedef struct {
+ kiss_fft_scalar r;
+ kiss_fft_scalar i;
+}kiss_fft_cpx;
+
+typedef struct {
+ kiss_twiddle_scalar r;
+ kiss_twiddle_scalar i;
+}kiss_twiddle_cpx;
+
+#define MAXFACTORS 8
+/* e.g. an fft of length 128 has 4 factors
+ as far as kissfft is concerned
+ 4*4*4*2
+ */
+
+typedef struct arch_fft_state{
+ int is_supported;
+ void *priv;
+} arch_fft_state;
+
+typedef struct kiss_fft_state{
+ int nfft;
+ opus_val16 scale;
+#ifdef FIXED_POINT
+ int scale_shift;
+#endif
+ int shift;
+ opus_int16 factors[2*MAXFACTORS];
+ const opus_int16 *bitrev;
+ const kiss_twiddle_cpx *twiddles;
+ arch_fft_state *arch_fft;
+} kiss_fft_state;
+
+#if defined(HAVE_ARM_NE10)
+#include "arm/fft_arm.h"
+#endif
+
+/*typedef struct kiss_fft_state* kiss_fft_cfg;*/
+
+/**
+ * opus_fft_alloc
+ *
+ * Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
+ *
+ * typical usage: kiss_fft_cfg mycfg=opus_fft_alloc(1024,0,NULL,NULL);
+ *
+ * The return value from fft_alloc is a cfg buffer used internally
+ * by the fft routine or NULL.
+ *
+ * If lenmem is NULL, then opus_fft_alloc will allocate a cfg buffer using malloc.
+ * The returned value should be free()d when done to avoid memory leaks.
+ *
+ * The state can be placed in a user supplied buffer 'mem':
+ * If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
+ * then the function places the cfg in mem and the size used in *lenmem
+ * and returns mem.
+ *
+ * If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
+ * then the function returns NULL and places the minimum cfg
+ * buffer size in *lenmem.
+ * */
+
+kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base, int arch);
+
+kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch);
+
+/**
+ * opus_fft(cfg,in_out_buf)
+ *
+ * Perform an FFT on a complex input buffer.
+ * for a forward FFT,
+ * fin should be f[0] , f[1] , ... ,f[nfft-1]
+ * fout will be F[0] , F[1] , ... ,F[nfft-1]
+ * Note that each element is complex and can be accessed like
+ f[k].r and f[k].i
+ * */
+void opus_fft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+void opus_ifft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+
+void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
+void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
+
+void opus_fft_free(const kiss_fft_state *cfg, int arch);
+
+
+void opus_fft_free_arch_c(kiss_fft_state *st);
+int opus_fft_alloc_arch_c(kiss_fft_state *st);
+
+#if !defined(OVERRIDE_OPUS_FFT)
+/* Is run-time CPU detection enabled on this platform? */
+#if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10))
+
+extern int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(
+ kiss_fft_state *st);
+
+#define opus_fft_alloc_arch(_st, arch) \
+ ((*OPUS_FFT_ALLOC_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st))
+
+extern void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(
+ kiss_fft_state *st);
+#define opus_fft_free_arch(_st, arch) \
+ ((*OPUS_FFT_FREE_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st))
+
+extern void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+ const kiss_fft_cpx *fin, kiss_fft_cpx *fout);
+#define opus_fft(_cfg, _fin, _fout, arch) \
+ ((*OPUS_FFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout))
+
+extern void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+ const kiss_fft_cpx *fin, kiss_fft_cpx *fout);
+#define opus_ifft(_cfg, _fin, _fout, arch) \
+ ((*OPUS_IFFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout))
+
+#else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
+
+#define opus_fft_alloc_arch(_st, arch) \
+ ((void)(arch), opus_fft_alloc_arch_c(_st))
+
+#define opus_fft_free_arch(_st, arch) \
+ ((void)(arch), opus_fft_free_arch_c(_st))
+
+#define opus_fft(_cfg, _fin, _fout, arch) \
+ ((void)(arch), opus_fft_c(_cfg, _fin, _fout))
+
+#define opus_ifft(_cfg, _fin, _fout, arch) \
+ ((void)(arch), opus_ifft_c(_cfg, _fin, _fout))
+
+#endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
+#endif /* end if !defined(OVERRIDE_OPUS_FFT) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/dnn/lpcnet-uninstalled.pc.in b/dnn/lpcnet-uninstalled.pc.in
new file mode 100644
index 00000000..d315bf57
--- /dev/null
+++ b/dnn/lpcnet-uninstalled.pc.in
@@ -0,0 +1,13 @@
+# lpcnet uninstalled pkg-config file
+
+prefix=
+exec_prefix=
+libdir=${pcfiledir}/.libs
+includedir=${pcfiledir}/@top_srcdir@/include
+
+Name: lpcnet uninstalled
+Description: LPCNet neural vocoder / speech codec (not installed)
+Version: @PACKAGE_VERSION@
+Conflicts:
+Libs: ${libdir}/liblpcnet.la @lrintf_lib@
+Cflags: -I${includedir}
diff --git a/dnn/lpcnet.c b/dnn/lpcnet.c
new file mode 100644
index 00000000..eb179215
--- /dev/null
+++ b/dnn/lpcnet.c
@@ -0,0 +1,321 @@
+/* Copyright (c) 2018 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include "nnet_data.h"
+#include "nnet.h"
+#include "common.h"
+#include "arch.h"
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+
+#define PREEMPH 0.85f
+
+#define PDF_FLOOR 0.002
+
+#define FRAME_INPUT_SIZE (NB_FEATURES + EMBED_PITCH_OUT_SIZE)
+
+
+#if 0
+static void print_vector(float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++) printf("%f ", x[i]);
+ printf("\n");
+}
+#endif
+
+#ifdef END2END
+void rc2lpc(float *lpc, const float *rc)
+{
+ int i, j, k;
+ float tmp[LPC_ORDER];
+ float ntmp[LPC_ORDER] = {0.0};
+ RNN_COPY(tmp, rc, LPC_ORDER);
+ for(i = 0; i < LPC_ORDER ; i++)
+ {
+ for(j = 0; j <= i-1; j++)
+ {
+ ntmp[j] = tmp[j] + tmp[i]*tmp[i - j - 1];
+ }
+ for(k = 0; k <= i-1; k++)
+ {
+ tmp[k] = ntmp[k];
+ }
+ }
+ for(i = 0; i < LPC_ORDER ; i++)
+ {
+ lpc[i] = tmp[i];
+ }
+}
+
+#endif
+
+void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features)
+{
+ NNetState *net;
+ float condition[FEATURE_DENSE2_OUT_SIZE];
+ float in[FRAME_INPUT_SIZE];
+ float conv1_out[FEATURE_CONV1_OUT_SIZE];
+ float conv2_out[FEATURE_CONV2_OUT_SIZE];
+ float dense1_out[FEATURE_DENSE1_OUT_SIZE];
+ int pitch;
+ float rc[LPC_ORDER];
+ /* Matches the Python code -- the 0.1 avoids rounding issues. */
+ pitch = (int)floor(.1 + 50*features[NB_BANDS]+100);
+ pitch = IMIN(255, IMAX(33, pitch));
+ net = &lpcnet->nnet;
+ RNN_COPY(in, features, NB_FEATURES);
+ compute_embedding(&lpcnet->model.embed_pitch, &in[NB_FEATURES], pitch);
+ compute_conv1d(&lpcnet->model.feature_conv1, conv1_out, net->feature_conv1_state, in);
+ if (lpcnet->frame_count < FEATURE_CONV1_DELAY) RNN_CLEAR(conv1_out, FEATURE_CONV1_OUT_SIZE);
+ compute_conv1d(&lpcnet->model.feature_conv2, conv2_out, net->feature_conv2_state, conv1_out);
+ if (lpcnet->frame_count < FEATURES_DELAY) RNN_CLEAR(conv2_out, FEATURE_CONV2_OUT_SIZE);
+ _lpcnet_compute_dense(&lpcnet->model.feature_dense1, dense1_out, conv2_out);
+ _lpcnet_compute_dense(&lpcnet->model.feature_dense2, condition, dense1_out);
+ RNN_COPY(rc, condition, LPC_ORDER);
+ _lpcnet_compute_dense(&lpcnet->model.gru_a_dense_feature, gru_a_condition, condition);
+ _lpcnet_compute_dense(&lpcnet->model.gru_b_dense_feature, gru_b_condition, condition);
+#ifdef END2END
+ rc2lpc(lpc, rc);
+#elif FEATURES_DELAY>0
+ memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
+ memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
+ lpc_from_cepstrum(lpcnet->old_lpc[0], features);
+#else
+ lpc_from_cepstrum(lpc, features);
+#endif
+#ifdef LPC_GAMMA
+ lpc_weighting(lpc, LPC_GAMMA);
+#endif
+ if (lpcnet->frame_count < 1000) lpcnet->frame_count++;
+}
+
+void run_frame_network_deferred(LPCNetState *lpcnet, const float *features)
+{
+ int max_buffer_size = lpcnet->model.feature_conv1.kernel_size + lpcnet->model.feature_conv2.kernel_size - 2;
+ celt_assert(max_buffer_size <= MAX_FEATURE_BUFFER_SIZE);
+ if (lpcnet->feature_buffer_fill == max_buffer_size) {
+ RNN_MOVE(lpcnet->feature_buffer, &lpcnet->feature_buffer[NB_FEATURES], (max_buffer_size-1)*NB_FEATURES);
+ } else {
+ lpcnet->feature_buffer_fill++;
+ }
+ RNN_COPY(&lpcnet->feature_buffer[(lpcnet->feature_buffer_fill-1)*NB_FEATURES], features, NB_FEATURES);
+}
+
+void run_frame_network_flush(LPCNetState *lpcnet)
+{
+ int i;
+ for (i=0;i<lpcnet->feature_buffer_fill;i++) {
+ float lpc[LPC_ORDER];
+ float gru_a_condition[3*GRU_A_STATE_SIZE];
+ float gru_b_condition[3*GRU_B_STATE_SIZE];
+ run_frame_network(lpcnet, gru_a_condition, gru_b_condition, lpc, &lpcnet->feature_buffer[i*NB_FEATURES]);
+ }
+ lpcnet->feature_buffer_fill = 0;
+}
+
+int run_sample_network(LPCNetState *lpcnet, const float *gru_a_condition, const float *gru_b_condition, int last_exc, int last_sig, int pred, const float *sampling_logit_table, kiss99_ctx *rng)
+{
+ NNetState *net;
+ float gru_a_input[3*GRU_A_STATE_SIZE];
+ float in_b[GRU_A_STATE_SIZE+FEATURE_DENSE2_OUT_SIZE];
+ float gru_b_input[3*GRU_B_STATE_SIZE];
+ net = &lpcnet->nnet;
+#if 1
+ compute_gru_a_input(gru_a_input, gru_a_condition, GRU_A_STATE_SIZE, &lpcnet->model.gru_a_embed_sig, last_sig, &lpcnet->model.gru_a_embed_pred, pred, &lpcnet->model.gru_a_embed_exc, last_exc);
+#else
+ RNN_COPY(gru_a_input, gru_a_condition, 3*GRU_A_STATE_SIZE);
+ accum_embedding(&lpcnet->model.gru_a_embed_sig, gru_a_input, last_sig);
+ accum_embedding(&lpcnet->model.gru_a_embed_pred, gru_a_input, pred);
+ accum_embedding(&lpcnet->model.gru_a_embed_exc, gru_a_input, last_exc);
+#endif
+ /*compute_gru3(&gru_a, net->gru_a_state, gru_a_input);*/
+ compute_sparse_gru(&lpcnet->model.sparse_gru_a, net->gru_a_state, gru_a_input);
+ RNN_COPY(in_b, net->gru_a_state, GRU_A_STATE_SIZE);
+ RNN_COPY(gru_b_input, gru_b_condition, 3*GRU_B_STATE_SIZE);
+ compute_gruB(&lpcnet->model.gru_b, gru_b_input, net->gru_b_state, in_b);
+ return sample_mdense(&lpcnet->model.dual_fc, net->gru_b_state, sampling_logit_table, rng);
+}
+
+LPCNET_EXPORT int lpcnet_get_size()
+{
+ return sizeof(LPCNetState);
+}
+
+LPCNET_EXPORT void lpcnet_reset(LPCNetState *lpcnet)
+{
+ const char* rng_string="LPCNet";
+ RNN_CLEAR((char*)&lpcnet->LPCNET_RESET_START,
+ sizeof(LPCNetState)-
+ ((char*)&lpcnet->LPCNET_RESET_START - (char*)lpcnet));
+ lpcnet->last_exc = lin2ulaw(0.f);
+ kiss99_srand(&lpcnet->rng, (const unsigned char *)rng_string, strlen(rng_string));
+}
+
+LPCNET_EXPORT int lpcnet_init(LPCNetState *lpcnet)
+{
+ int i;
+ int ret;
+ for (i=0;i<256;i++) {
+ float prob = .025f+.95f*i/255.f;
+ lpcnet->sampling_logit_table[i] = -log((1-prob)/prob);
+ }
+#ifndef USE_WEIGHTS_FILE
+ ret = init_lpcnet_model(&lpcnet->model, lpcnet_arrays);
+#else
+ ret = 0;
+#endif
+ lpcnet_reset(lpcnet);
+ celt_assert(ret == 0);
+ return ret;
+}
+
+LPCNET_EXPORT int lpcnet_load_model(LPCNetState *st, const unsigned char *data, int len) {
+ WeightArray *list;
+ int ret;
+ parse_weights(&list, data, len);
+ ret = init_lpcnet_model(&st->model, list);
+ free(list);
+ if (ret == 0) return 0;
+ else return -1;
+}
+
+
+LPCNET_EXPORT LPCNetState *lpcnet_create()
+{
+ LPCNetState *lpcnet;
+ lpcnet = (LPCNetState *)calloc(lpcnet_get_size(), 1);
+ lpcnet_init(lpcnet);
+ return lpcnet;
+}
+
+LPCNET_EXPORT void lpcnet_destroy(LPCNetState *lpcnet)
+{
+ free(lpcnet);
+}
+
+void lpcnet_reset_signal(LPCNetState *lpcnet)
+{
+ lpcnet->deemph_mem = 0;
+ lpcnet->last_exc = lin2ulaw(0.f);
+ RNN_CLEAR(lpcnet->last_sig, LPC_ORDER);
+ RNN_CLEAR(lpcnet->nnet.gru_a_state, GRU_A_STATE_SIZE);
+ RNN_CLEAR(lpcnet->nnet.gru_b_state, GRU_B_STATE_SIZE);
+}
+
+void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, short *output, int N, int preload)
+{
+ int i;
+
+ if (lpcnet->frame_count <= FEATURES_DELAY)
+ {
+ RNN_CLEAR(output, N);
+ return;
+ }
+ for (i=0;i<N;i++)
+ {
+ int j;
+ float pcm;
+ int exc;
+ int last_sig_ulaw;
+ int pred_ulaw;
+ float pred = 0;
+ for (j=0;j<LPC_ORDER;j++) pred -= lpcnet->last_sig[j]*lpcnet->lpc[j];
+ last_sig_ulaw = lin2ulaw(lpcnet->last_sig[0]);
+ pred_ulaw = lin2ulaw(pred);
+ exc = run_sample_network(lpcnet, lpcnet->gru_a_condition, lpcnet->gru_b_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw, lpcnet->sampling_logit_table, &lpcnet->rng);
+ if (i < preload) {
+ exc = lin2ulaw(output[i]-PREEMPH*lpcnet->deemph_mem - pred);
+ pcm = output[i]-PREEMPH*lpcnet->deemph_mem;
+ } else {
+ pcm = pred + ulaw2lin(exc);
+ }
+ RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
+ lpcnet->last_sig[0] = pcm;
+ lpcnet->last_exc = exc;
+ pcm += PREEMPH*lpcnet->deemph_mem;
+ lpcnet->deemph_mem = pcm;
+ if (pcm<-32767) pcm = -32767;
+ if (pcm>32767) pcm = 32767;
+ if (i >= preload) output[i] = (int)floor(.5 + pcm);
+ }
+}
+
+void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, short *output, int N, int preload)
+{
+ run_frame_network(lpcnet, lpcnet->gru_a_condition, lpcnet->gru_b_condition, lpcnet->lpc, features);
+ lpcnet_synthesize_tail_impl(lpcnet, output, N, preload);
+}
+
+LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *lpcnet, const float *features, short *output, int N) {
+ lpcnet_synthesize_impl(lpcnet, features, output, N, 0);
+}
+
+#ifndef OPUS_BUILD
+
+LPCNET_EXPORT int lpcnet_decoder_get_size()
+{
+ return sizeof(LPCNetDecState);
+}
+
+LPCNET_EXPORT int lpcnet_decoder_init(LPCNetDecState *st)
+{
+ memset(st, 0, lpcnet_decoder_get_size());
+ lpcnet_init(&st->lpcnet_state);
+ return 0;
+}
+
+LPCNET_EXPORT LPCNetDecState *lpcnet_decoder_create()
+{
+ LPCNetDecState *st;
+ st = malloc(lpcnet_decoder_get_size());
+ lpcnet_decoder_init(st);
+ return st;
+}
+
+LPCNET_EXPORT void lpcnet_decoder_destroy(LPCNetDecState *st)
+{
+ free(st);
+}
+
+LPCNET_EXPORT int lpcnet_decode(LPCNetDecState *st, const unsigned char *buf, short *pcm)
+{
+ int k;
+ float features[4][NB_TOTAL_FEATURES];
+ decode_packet(features, st->vq_mem, buf);
+ for (k=0;k<4;k++) {
+ lpcnet_synthesize(&st->lpcnet_state, features[k], &pcm[k*FRAME_SIZE], FRAME_SIZE);
+ }
+ return 0;
+}
+
+#endif
diff --git a/dnn/lpcnet.pc.in b/dnn/lpcnet.pc.in
new file mode 100644
index 00000000..d348adc8
--- /dev/null
+++ b/dnn/lpcnet.pc.in
@@ -0,0 +1,14 @@
+# lpcnet installed pkg-config file
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: lpcnet
+Description: LPCNet neural vocoder / speech codec
+Version: @PACKAGE_VERSION@
+Conflicts:
+Libs: -L${libdir} -llpcnet
+Libs.private: @lrintf_lib@
+Cflags: -I${includedir}/
diff --git a/dnn/lpcnet_dec.c b/dnn/lpcnet_dec.c
new file mode 100644
index 00000000..d8df4d1c
--- /dev/null
+++ b/dnn/lpcnet_dec.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2017-2019 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "kiss_fft.h"
+#include "common.h"
+#include <math.h>
+#include "freq.h"
+#include "pitch.h"
+#include "arch.h"
+#include <assert.h>
+#include "lpcnet_private.h"
+#include "lpcnet.h"
+
+
+typedef struct {
+ int byte_pos;
+ int bit_pos;
+ int max_bytes;
+ const unsigned char *chars;
+} unpacker;
+
+void bits_unpacker_init(unpacker *bits, const unsigned char *buf, int size) {
+ bits->byte_pos = 0;
+ bits->bit_pos = 0;
+ bits->max_bytes = size;
+ bits->chars = buf;
+}
+
+unsigned int bits_unpack(unpacker *bits, int nb_bits) {
+ unsigned int d=0;
+ while(nb_bits)
+ {
+ if (bits->byte_pos == bits->max_bytes) {
+ fprintf(stderr, "something went horribly wrong\n");
+ return 0;
+ }
+ d<<=1;
+ d |= (bits->chars[bits->byte_pos]>>(BITS_PER_CHAR-1 - bits->bit_pos))&1;
+ bits->bit_pos++;
+ if (bits->bit_pos==BITS_PER_CHAR)
+ {
+ bits->bit_pos=0;
+ bits->byte_pos++;
+ }
+ nb_bits--;
+ }
+ return d;
+}
+
+#ifndef OPUS_BUILD
+void decode_packet(float features[4][NB_TOTAL_FEATURES], float *vq_mem, const unsigned char buf[8])
+{
+ int c0_id;
+ int main_pitch;
+ int modulation;
+ int corr_id;
+ int vq_end[3];
+ int vq_mid;
+ int interp_id;
+
+ int i;
+ int sub;
+ int voiced = 1;
+ float frame_corr;
+ float sign;
+ unpacker bits;
+
+ bits_unpacker_init(&bits, buf, 8);
+ c0_id = bits_unpack(&bits, 7);
+ main_pitch = bits_unpack(&bits, 6);
+ modulation = bits_unpack(&bits, 3);
+ corr_id = bits_unpack(&bits, 2);
+ vq_end[0] = bits_unpack(&bits, 10);
+ vq_end[1] = bits_unpack(&bits, 10);
+ vq_end[2] = bits_unpack(&bits, 10);
+ vq_mid = bits_unpack(&bits, 13);
+ interp_id = bits_unpack(&bits, 3);
+ /*fprintf(stdout, "%d %d %d %d %d %d %d %d %d\n", c0_id, main_pitch, modulation, corr_id, vq_end[0], vq_end[1], vq_end[2], vq_mid, interp_id);*/
+
+
+ for (i=0;i<4;i++) RNN_CLEAR(&features[i][0], NB_TOTAL_FEATURES);
+
+ modulation -= 4;
+ if (modulation==-4) {
+ voiced = 0;
+ modulation = 0;
+ }
+ if (voiced) {
+ frame_corr = 0.3875f + .175f*corr_id;
+ } else {
+ frame_corr = 0.0375f + .075f*corr_id;
+ }
+ for (sub=0;sub<4;sub++) {
+ float p = pow(2.f, main_pitch/21.)*PITCH_MIN_PERIOD;
+ p *= 1.f + modulation/16.f/7.f*(2*sub-3);
+ p = MIN16(255, MAX16(33, p));
+ features[sub][NB_BANDS] = .02f*(p-100.f);
+ features[sub][NB_BANDS + 1] = frame_corr-.5f;
+ }
+
+ features[3][0] = (c0_id-64)/4.f;
+ for (i=0;i<NB_BANDS_1;i++) {
+ features[3][i+1] = ceps_codebook1[vq_end[0]*NB_BANDS_1 + i] + ceps_codebook2[vq_end[1]*NB_BANDS_1 + i] + ceps_codebook3[vq_end[2]*NB_BANDS_1 + i];
+ }
+
+ sign = 1;
+ if (vq_mid >= 4096) {
+ vq_mid -= 4096;
+ sign = -1;
+ }
+ for (i=0;i<NB_BANDS;i++) {
+ features[1][i] = sign*ceps_codebook_diff4[vq_mid*NB_BANDS + i];
+ }
+ if ((vq_mid&MULTI_MASK) < 2) {
+ for (i=0;i<NB_BANDS;i++) features[1][i] += .5f*(vq_mem[i] + features[3][i]);
+ } else if ((vq_mid&MULTI_MASK) == 2) {
+ for (i=0;i<NB_BANDS;i++) features[1][i] += vq_mem[i];
+ } else {
+ for (i=0;i<NB_BANDS;i++) features[1][i] += features[3][i];
+ }
+
+ perform_double_interp(features, vq_mem, interp_id);
+
+ RNN_COPY(vq_mem, &features[3][0], NB_BANDS);
+}
+#endif
diff --git a/dnn/lpcnet_demo.c b/dnn/lpcnet_demo.c
new file mode 100644
index 00000000..cf62ad3d
--- /dev/null
+++ b/dnn/lpcnet_demo.c
@@ -0,0 +1,271 @@
+/* Copyright (c) 2018 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include "arch.h"
+#include "lpcnet.h"
+#include "freq.h"
+
+#ifdef USE_WEIGHTS_FILE
+# if __unix__
+# include <fcntl.h>
+# include <sys/mman.h>
+# include <unistd.h>
+# include <sys/stat.h>
+/* When available, mmap() is preferable to reading the file, as it leads to
+ better resource utilization, especially if multiple processes are using the same
+ file (mapping will be shared in cache). */
+unsigned char *load_blob(const char *filename, int *len) {
+ int fd;
+ unsigned char *data;
+ struct stat st;
+ stat(filename, &st);
+ *len = st.st_size;
+ fd = open(filename, O_RDONLY);
+ data = mmap(NULL, *len, PROT_READ, MAP_SHARED, fd, 0);
+ close(fd);
+ return data;
+}
+void free_blob(unsigned char *blob, int len) {
+ munmap(blob, len);
+}
+# else
+unsigned char *load_blob(const char *filename, int *len) {
+ FILE *file;
+ unsigned char *data;
+ file = fopen(filename, "r");
+ fseek(file, 0L, SEEK_END);
+ *len = ftell(file);
+ fseek(file, 0L, SEEK_SET);
+ if (*len <= 0) return NULL;
+ data = malloc(*len);
+ *len = fread(data, 1, *len, file);
+ return data;
+}
+void free_blob(unsigned char *blob, int len) {
+ free(blob);
+ (void)len;
+}
+# endif
+#endif
+
+#define MODE_ENCODE 0
+#define MODE_DECODE 1
+#define MODE_FEATURES 2
+#define MODE_SYNTHESIS 3
+#define MODE_PLC 4
+#define MODE_ADDLPC 5
+
+void usage(void) {
+ fprintf(stderr, "usage: lpcnet_demo -encode <input.pcm> <compressed.lpcnet>\n");
+ fprintf(stderr, " lpcnet_demo -decode <compressed.lpcnet> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -features <input.pcm> <features.f32>\n");
+ fprintf(stderr, " lpcnet_demo -synthesis <features.f32> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -plc <plc_options> <percent> <input.pcm> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -plc_file <plc_options> <percent> <input.pcm> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -addlpc <features_without_lpc.f32> <features_with_lpc.lpc>\n\n");
+ fprintf(stderr, " plc_options:\n");
+ fprintf(stderr, " causal: normal (causal) PLC\n");
+ fprintf(stderr, " causal_dc: normal (causal) PLC with DC offset compensation\n");
+ fprintf(stderr, " noncausal: non-causal PLC\n");
+ fprintf(stderr, " noncausal_dc: non-causal PLC with DC offset compensation\n");
+ exit(1);
+}
+
+int main(int argc, char **argv) {
+ int mode;
+ int plc_percent=0;
+ FILE *fin, *fout;
+ FILE *plc_file = NULL;
+ const char *plc_options;
+ int plc_flags=-1;
+#ifdef USE_WEIGHTS_FILE
+ int len;
+ unsigned char *data;
+ const char *filename = "weights_blob.bin";
+#endif
+ if (argc < 4) usage();
+ if (strcmp(argv[1], "-encode") == 0) mode=MODE_ENCODE;
+ else if (strcmp(argv[1], "-decode") == 0) mode=MODE_DECODE;
+ else if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
+ else if (strcmp(argv[1], "-synthesis") == 0) mode=MODE_SYNTHESIS;
+ else if (strcmp(argv[1], "-plc") == 0) {
+ mode=MODE_PLC;
+ plc_options = argv[2];
+ plc_percent = atoi(argv[3]);
+ argv+=2;
+ argc-=2;
+ } else if (strcmp(argv[1], "-plc_file") == 0) {
+ mode=MODE_PLC;
+ plc_options = argv[2];
+ plc_file = fopen(argv[3], "r");
+ if (!plc_file) {
+ fprintf(stderr, "Can't open %s\n", argv[3]);
+ exit(1);
+ }
+ argv+=2;
+ argc-=2;
+ } else if (strcmp(argv[1], "-addlpc") == 0){
+ mode=MODE_ADDLPC;
+ } else {
+ usage();
+ }
+ if (mode == MODE_PLC) {
+ if (strcmp(plc_options, "causal")==0) plc_flags = LPCNET_PLC_CAUSAL;
+ else if (strcmp(plc_options, "causal_dc")==0) plc_flags = LPCNET_PLC_CAUSAL | LPCNET_PLC_DC_FILTER;
+ else if (strcmp(plc_options, "noncausal")==0) plc_flags = LPCNET_PLC_NONCAUSAL;
+ else if (strcmp(plc_options, "noncausal_dc")==0) plc_flags = LPCNET_PLC_NONCAUSAL | LPCNET_PLC_DC_FILTER;
+ else usage();
+ }
+ if (argc != 4) usage();
+ fin = fopen(argv[2], "rb");
+ if (fin == NULL) {
+ fprintf(stderr, "Can't open %s\n", argv[2]);
+ exit(1);
+ }
+
+ fout = fopen(argv[3], "wb");
+ if (fout == NULL) {
+ fprintf(stderr, "Can't open %s\n", argv[3]);
+ exit(1);
+ }
+#ifdef USE_WEIGHTS_FILE
+ data = load_blob(filename, &len);
+#endif
+ if (mode == MODE_ENCODE) {
+ LPCNetEncState *net;
+ net = lpcnet_encoder_create();
+ while (1) {
+ unsigned char buf[LPCNET_COMPRESSED_SIZE];
+ short pcm[LPCNET_PACKET_SAMPLES];
+ size_t ret;
+ ret = fread(pcm, sizeof(pcm[0]), LPCNET_PACKET_SAMPLES, fin);
+ if (feof(fin) || ret != LPCNET_PACKET_SAMPLES) break;
+ lpcnet_encode(net, pcm, buf);
+ fwrite(buf, 1, LPCNET_COMPRESSED_SIZE, fout);
+ }
+ lpcnet_encoder_destroy(net);
+ } else if (mode == MODE_DECODE) {
+ LPCNetDecState *net;
+ net = lpcnet_decoder_create();
+ while (1) {
+ unsigned char buf[LPCNET_COMPRESSED_SIZE];
+ short pcm[LPCNET_PACKET_SAMPLES];
+ size_t ret;
+ ret = fread(buf, sizeof(buf[0]), LPCNET_COMPRESSED_SIZE, fin);
+ if (feof(fin) || ret != LPCNET_COMPRESSED_SIZE) break;
+ lpcnet_decode(net, buf, pcm);
+ fwrite(pcm, sizeof(pcm[0]), LPCNET_PACKET_SAMPLES, fout);
+ }
+ lpcnet_decoder_destroy(net);
+ } else if (mode == MODE_FEATURES) {
+ LPCNetEncState *net;
+ net = lpcnet_encoder_create();
+ while (1) {
+ float features[NB_TOTAL_FEATURES];
+ short pcm[LPCNET_FRAME_SIZE];
+ size_t ret;
+ ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin);
+ if (feof(fin) || ret != LPCNET_FRAME_SIZE) break;
+ lpcnet_compute_single_frame_features(net, pcm, features);
+ fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout);
+ }
+ lpcnet_encoder_destroy(net);
+ } else if (mode == MODE_SYNTHESIS) {
+ LPCNetState *net;
+ net = lpcnet_create();
+#ifdef USE_WEIGHTS_FILE
+ lpcnet_load_model(net, data, len);
+#endif
+ while (1) {
+ float in_features[NB_TOTAL_FEATURES];
+ float features[NB_FEATURES];
+ short pcm[LPCNET_FRAME_SIZE];
+ size_t ret;
+ ret = fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
+ if (feof(fin) || ret != NB_TOTAL_FEATURES) break;
+ RNN_COPY(features, in_features, NB_FEATURES);
+ lpcnet_synthesize(net, features, pcm, LPCNET_FRAME_SIZE);
+ fwrite(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fout);
+ }
+ lpcnet_destroy(net);
+ } else if (mode == MODE_PLC) {
+ short pcm[FRAME_SIZE];
+ int count=0;
+ int loss=0;
+ int skip=0, extra=0;
+ LPCNetPLCState *net;
+ if ((plc_flags&0x3) == LPCNET_PLC_NONCAUSAL) skip=extra=80;
+ net = lpcnet_plc_create(plc_flags);
+#ifdef USE_WEIGHTS_FILE
+ lpcnet_plc_load_model(net, data, len);
+#endif
+ while (1) {
+ size_t ret;
+ ret = fread(pcm, sizeof(pcm[0]), FRAME_SIZE, fin);
+ if (feof(fin) || ret != FRAME_SIZE) break;
+ if (count % 2 == 0) {
+ if (plc_file != NULL) fscanf(plc_file, "%d", &loss);
+ else loss = rand() < RAND_MAX*(float)plc_percent/100.f;
+ }
+ if (loss) lpcnet_plc_conceal(net, pcm);
+ else lpcnet_plc_update(net, pcm);
+ fwrite(&pcm[skip], sizeof(pcm[0]), FRAME_SIZE-skip, fout);
+ skip = 0;
+ count++;
+ }
+ if (extra) {
+ lpcnet_plc_conceal(net, pcm);
+ fwrite(pcm, sizeof(pcm[0]), extra, fout);
+ }
+ lpcnet_plc_destroy(net);
+ } else if (mode == MODE_ADDLPC) {
+ float features[36];
+ size_t ret;
+
+ while (1) {
+ ret = fread(features, sizeof(features[0]), 36, fin);
+ if (ret != 36 || feof(fin)) break;
+ lpc_from_cepstrum(&features[20], &features[0]);
+ fwrite(features, sizeof(features[0]), 36, fout);
+ }
+
+ } else {
+ fprintf(stderr, "unknown action\n");
+ }
+ fclose(fin);
+ fclose(fout);
+ if (plc_file) fclose(plc_file);
+#ifdef USE_WEIGHTS_FILE
+ free_blob(data, len);
+#endif
+ return 0;
+}
diff --git a/dnn/lpcnet_enc.c b/dnn/lpcnet_enc.c
new file mode 100644
index 00000000..3854faf0
--- /dev/null
+++ b/dnn/lpcnet_enc.c
@@ -0,0 +1,933 @@
+/* Copyright (c) 2017-2019 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef OPUS_BUILD
+#define celt_pitch_xcorr celt_pitch_xcorr_c
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "kiss_fft.h"
+#include "common.h"
+#include <math.h>
+#include "freq.h"
+#include "pitch.h"
+#include "arch.h"
+#include <assert.h>
+#include "lpcnet_private.h"
+#include "lpcnet.h"
+
+#ifndef OPUS_BUILD
+
+#define SURVIVORS 5
+
+
+void vq_quantize_mbest(const float *codebook, int nb_entries, const float *x, int ndim, int mbest, float *dist, int *index)
+{
+ int i, j;
+ for (i=0;i<mbest;i++) dist[i] = 1e15f;
+
+ for (i=0;i<nb_entries;i++)
+ {
+ float d=0;
+ for (j=0;j<ndim;j++)
+ d += (x[j]-codebook[i*ndim+j])*(x[j]-codebook[i*ndim+j]);
+ if (d<dist[mbest-1])
+ {
+ int pos;
+ for (j=0;j<mbest-1;j++) {
+ if (d < dist[j]) break;
+ }
+ pos = j;
+ for (j=mbest-1;j>=pos+1;j--) {
+ dist[j] = dist[j-1];
+ index[j] = index[j-1];
+ }
+ dist[pos] = d;
+ index[pos] = i;
+ }
+ }
+}
+
+
+int vq_quantize(const float *codebook, int nb_entries, const float *x, int ndim, float *dist_out)
+{
+ int i, j;
+ float min_dist = 1e15f;
+ int nearest = 0;
+
+ for (i=0;i<nb_entries;i++)
+ {
+ float dist=0;
+ for (j=0;j<ndim;j++)
+ dist += (x[j]-codebook[i*ndim+j])*(x[j]-codebook[i*ndim+j]);
+ if (dist<min_dist)
+ {
+ min_dist = dist;
+ nearest = i;
+ }
+ }
+ if (dist_out)
+ *dist_out = min_dist;
+ return nearest;
+}
+
+int quantize_2stage(float *x)
+{
+ int i;
+ int id, id2, id3;
+ float ref[NB_BANDS_1];
+ RNN_COPY(ref, x, NB_BANDS_1);
+ id = vq_quantize(ceps_codebook1, 1024, x, NB_BANDS_1, NULL);
+ for (i=0;i<NB_BANDS_1;i++) {
+ x[i] -= ceps_codebook1[id*NB_BANDS_1 + i];
+ }
+ id2 = vq_quantize(ceps_codebook2, 1024, x, NB_BANDS_1, NULL);
+ for (i=0;i<NB_BANDS_1;i++) {
+ x[i] -= ceps_codebook2[id2*NB_BANDS_1 + i];
+ }
+ id3 = vq_quantize(ceps_codebook3, 1024, x, NB_BANDS_1, NULL);
+ for (i=0;i<NB_BANDS_1;i++) {
+ x[i] = ceps_codebook1[id*NB_BANDS_1 + i] + ceps_codebook2[id2*NB_BANDS_1 + i] + ceps_codebook3[id3*NB_BANDS_1 + i];
+ }
+ if (0) {
+ float err = 0;
+ for (i=0;i<NB_BANDS_1;i++) {
+ err += (x[i]-ref[i])*(x[i]-ref[i]);
+ }
+ printf("%f\n", sqrt(err/NB_BANDS));
+ }
+
+ return id;
+}
+
+
+int quantize_3stage_mbest(float *x, int entry[3])
+{
+ int i, k;
+ int id, id2, id3;
+ float ref[NB_BANDS_1];
+ int curr_index[SURVIVORS];
+ int index1[SURVIVORS][3];
+ int index2[SURVIVORS][3];
+ int index3[SURVIVORS][3];
+ float curr_dist[SURVIVORS];
+ float glob_dist[SURVIVORS];
+ RNN_COPY(ref, x, NB_BANDS_1);
+ vq_quantize_mbest(ceps_codebook1, 1024, x, NB_BANDS_1, SURVIVORS, curr_dist, curr_index);
+ for (k=0;k<SURVIVORS;k++) {
+ index1[k][0] = curr_index[k];
+ }
+ for (k=0;k<SURVIVORS;k++) {
+ int m;
+ float diff[NB_BANDS_1];
+ for (i=0;i<NB_BANDS_1;i++) {
+ diff[i] = x[i] - ceps_codebook1[index1[k][0]*NB_BANDS_1 + i];
+ }
+ vq_quantize_mbest(ceps_codebook2, 1024, diff, NB_BANDS_1, SURVIVORS, curr_dist, curr_index);
+ if (k==0) {
+ for (m=0;m<SURVIVORS;m++) {
+ index2[m][0] = index1[k][0];
+ index2[m][1] = curr_index[m];
+ glob_dist[m] = curr_dist[m];
+ }
+ /*printf("%f ", glob_dist[0]);*/
+ } else if (curr_dist[0] < glob_dist[SURVIVORS-1]) {
+ int pos;
+ m=0;
+ for (pos=0;pos<SURVIVORS;pos++) {
+ if (curr_dist[m] < glob_dist[pos]) {
+ int j;
+ for (j=SURVIVORS-1;j>=pos+1;j--) {
+ glob_dist[j] = glob_dist[j-1];
+ index2[j][0] = index2[j-1][0];
+ index2[j][1] = index2[j-1][1];
+ }
+ glob_dist[pos] = curr_dist[m];
+ index2[pos][0] = index1[k][0];
+ index2[pos][1] = curr_index[m];
+ m++;
+ }
+ }
+ }
+ }
+ for (k=0;k<SURVIVORS;k++) {
+ int m;
+ float diff[NB_BANDS_1];
+ for (i=0;i<NB_BANDS_1;i++) {
+ diff[i] = x[i] - ceps_codebook1[index2[k][0]*NB_BANDS_1 + i] - ceps_codebook2[index2[k][1]*NB_BANDS_1 + i];
+ }
+ vq_quantize_mbest(ceps_codebook3, 1024, diff, NB_BANDS_1, SURVIVORS, curr_dist, curr_index);
+ if (k==0) {
+ for (m=0;m<SURVIVORS;m++) {
+ index3[m][0] = index2[k][0];
+ index3[m][1] = index2[k][1];
+ index3[m][2] = curr_index[m];
+ glob_dist[m] = curr_dist[m];
+ }
+ /*printf("%f ", glob_dist[0]);*/
+ } else if (curr_dist[0] < glob_dist[SURVIVORS-1]) {
+ int pos;
+ m=0;
+ for (pos=0;pos<SURVIVORS;pos++) {
+ if (curr_dist[m] < glob_dist[pos]) {
+ int j;
+ for (j=SURVIVORS-1;j>=pos+1;j--) {
+ glob_dist[j] = glob_dist[j-1];
+ index3[j][0] = index3[j-1][0];
+ index3[j][1] = index3[j-1][1];
+ index3[j][2] = index3[j-1][2];
+ }
+ glob_dist[pos] = curr_dist[m];
+ index3[pos][0] = index2[k][0];
+ index3[pos][1] = index2[k][1];
+ index3[pos][2] = curr_index[m];
+ m++;
+ }
+ }
+ }
+ }
+ entry[0] = id = index3[0][0];
+ entry[1] = id2 = index3[0][1];
+ entry[2] = id3 = index3[0][2];
+ /*printf("%f ", glob_dist[0]);*/
+ for (i=0;i<NB_BANDS_1;i++) {
+ x[i] -= ceps_codebook1[id*NB_BANDS_1 + i];
+ }
+ for (i=0;i<NB_BANDS_1;i++) {
+ x[i] -= ceps_codebook2[id2*NB_BANDS_1 + i];
+ }
+ /*id3 = vq_quantize(ceps_codebook3, 1024, x, NB_BANDS_1, NULL);*/
+ for (i=0;i<NB_BANDS_1;i++) {
+ x[i] = ceps_codebook1[id*NB_BANDS_1 + i] + ceps_codebook2[id2*NB_BANDS_1 + i] + ceps_codebook3[id3*NB_BANDS_1 + i];
+ }
+ if (0) {
+ float err = 0;
+ for (i=0;i<NB_BANDS_1;i++) {
+ err += (x[i]-ref[i])*(x[i]-ref[i]);
+ }
+ printf("%f\n", sqrt(err/NB_BANDS));
+ }
+
+ return id;
+}
+
+/* Find the codebook entry nearest to the target x.
+ Each entry i is compared against the slice of x selected by (i&MULTI_MASK),
+ so the returned index jointly encodes the codevector and which prediction
+ slice it applies to. If sign is non-zero, the negated codevectors are
+ searched as well and a negated match is reported as i+nb_entries.
+ The winning squared distance is optionally written to *dist_out. */
+static int find_nearest_multi(const float *codebook, int nb_entries, const float *x, int ndim, float *dist_out, int sign)
+{
+ int i, j;
+ float min_dist = 1e15f;
+ int nearest = 0;
+
+ /* Pass 1: positive codevectors. */
+ for (i=0;i<nb_entries;i++)
+ {
+ int offset;
+ float dist=0;
+ offset = (i&MULTI_MASK)*ndim;
+ for (j=0;j<ndim;j++)
+ dist += (x[offset+j]-codebook[i*ndim+j])*(x[offset+j]-codebook[i*ndim+j]);
+ if (dist<min_dist)
+ {
+ min_dist = dist;
+ nearest = i;
+ }
+ }
+ /* Pass 2 (optional): negated codevectors, indices offset by nb_entries. */
+ if (sign) {
+ for (i=0;i<nb_entries;i++)
+ {
+ int offset;
+ float dist=0;
+ offset = (i&MULTI_MASK)*ndim;
+ for (j=0;j<ndim;j++)
+ dist += (x[offset+j]+codebook[i*ndim+j])*(x[offset+j]+codebook[i*ndim+j]);
+ if (dist<min_dist)
+ {
+ min_dist = dist;
+ nearest = i+nb_entries;
+ }
+ }
+ }
+ if (dist_out)
+ *dist_out = min_dist;
+ return nearest;
+}
+
+
+/* Quantize the cepstrum x relative to four candidate predictors built from
+ the left/right neighbour frames (average twice, left, right), using a
+ codebook searched with find_nearest_multi (low index bits select the
+ predictor). On return x holds the reconstructed (predictor + signed
+ codevector) value; *entry receives the raw index including the sign bit;
+ the return value is the index with the sign bit removed. */
+int quantize_diff(float *x, float *left, float *right, float *codebook, int bits, int sign, int *entry)
+{
+ int i;
+ int nb_entries;
+ int id;
+ float ref[NB_BANDS];
+ float pred[4*NB_BANDS];
+ float target[4*NB_BANDS];
+ float s = 1;
+ nb_entries = 1<<bits;
+ RNN_COPY(ref, x, NB_BANDS);
+ /* Four stacked predictors; slots 0 and 1 are both the L/R average. */
+ for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5f*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = left[i];
+ for (i=0;i<NB_BANDS;i++) pred[3*NB_BANDS+i] = right[i];
+ for (i=0;i<4*NB_BANDS;i++) target[i] = x[i%NB_BANDS] - pred[i];
+
+ id = find_nearest_multi(codebook, nb_entries, target, NB_BANDS, NULL, sign);
+ *entry = id;
+ /* Indices >= 1<<bits mean the negated codevector was chosen. */
+ if (id >= 1<<bits) {
+ s = -1;
+ id -= (1<<bits);
+ }
+ for (i=0;i<NB_BANDS;i++) {
+ x[i] = pred[(id&MULTI_MASK)*NB_BANDS + i] + s*codebook[id*NB_BANDS + i];
+ }
+ /*printf("%d %f ", id&MULTI_MASK, s);*/
+ if (0) {
+ /* Debug: RMS reconstruction error against the unquantized input. */
+ float err = 0;
+ for (i=0;i<NB_BANDS;i++) {
+ err += (x[i]-ref[i])*(x[i]-ref[i]);
+ }
+ printf("%f\n", sqrt(err/NB_BANDS));
+ }
+
+ return id;
+}
+
+/* Evaluate the three non-trivial interpolation predictors (average of
+ left/right, left, right) against x. Writes the three squared distances to
+ dist_out[0..2] and returns the index (0..2) of the best predictor. */
+int interp_search(const float *x, const float *left, const float *right, float *dist_out)
+{
+ int i, k;
+ float min_dist = 1e15f;
+ int best_pred = 0;
+ float pred[4*NB_BANDS];
+ for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5f*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = left[i];
+ for (i=0;i<NB_BANDS;i++) pred[3*NB_BANDS+i] = right[i];
+
+ /* Slot 0 duplicates slot 1, so only predictors 1..3 are scored. */
+ for (k=1;k<4;k++) {
+ float dist = 0;
+ for (i=0;i<NB_BANDS;i++) dist += (x[i] - pred[k*NB_BANDS+i])*(x[i] - pred[k*NB_BANDS+i]);
+ dist_out[k-1] = dist;
+ if (dist < min_dist) {
+ min_dist = dist;
+ best_pred = k;
+ }
+ }
+ return best_pred - 1;
+}
+
+
+/* Replace x with the best of the three interpolation predictors derived
+ from left/right (average, left, right). The codebook/bits/sign arguments
+ are unused and kept only for signature compatibility. */
+void interp_diff(float *x, float *left, float *right, float *codebook, int bits, int sign)
+{
+ int i, k;
+ float min_dist = 1e15f;
+ int best_pred = 0;
+ float ref[NB_BANDS];
+ float pred[4*NB_BANDS];
+ (void)sign;
+ (void)codebook;
+ (void)bits;
+ RNN_COPY(ref, x, NB_BANDS);
+ for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5f*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = left[i];
+ for (i=0;i<NB_BANDS;i++) pred[3*NB_BANDS+i] = right[i];
+
+ for (k=1;k<4;k++) {
+ float dist = 0;
+ for (i=0;i<NB_BANDS;i++) dist += (x[i] - pred[k*NB_BANDS+i])*(x[i] - pred[k*NB_BANDS+i]);
+ if (dist < min_dist) {
+ min_dist = dist;
+ best_pred = k;
+ }
+ }
+ /*printf("%d ", best_pred);*/
+ for (i=0;i<NB_BANDS;i++) {
+ x[i] = pred[best_pred*NB_BANDS + i];
+ }
+ if (0) {
+ /* Debug: RMS error of the chosen predictor. */
+ float err = 0;
+ for (i=0;i<NB_BANDS;i++) {
+ err += (x[i]-ref[i])*(x[i]-ref[i]);
+ }
+ printf("%f\n", sqrt(err/NB_BANDS));
+ }
+}
+
+/* Jointly pick interpolation modes for frames 1 and 2 of a 4-frame
+ superframe (frame 1 between mem and features[1], frame 2 between
+ features[1] and features[3]). The 3x3 combinations are scored by summed
+ distance; the FORBIDDEN_INTERP combination is excluded and the returned id
+ is compacted past it so the result fits the 3-bit interp field. */
+int double_interp_search(float features[4][NB_TOTAL_FEATURES], const float *mem) {
+ int i, j;
+ int best_id=0;
+ float min_dist = 1e15f;
+ float dist[2][3];
+ interp_search(features[0], mem, features[1], dist[0]);
+ interp_search(features[2], features[1], features[3], dist[1]);
+ for (i=0;i<3;i++) {
+ for (j=0;j<3;j++) {
+ float d;
+ int id;
+ id = 3*i + j;
+ d = dist[0][i] + dist[1][j];
+ if (d < min_dist && id != FORBIDDEN_INTERP) {
+ min_dist = d;
+ best_id = id;
+ }
+ }
+ }
+ /*printf("%d %d %f %d %f\n", id0, id1, dist[0][id0] + dist[1][id1], best_id, min_dist);*/
+ /* Remove the forbidden slot so ids stay contiguous. */
+ return best_id - (best_id >= FORBIDDEN_INTERP);
+}
+
+
+/* Relax features[1] toward the interpolation the decoder will use:
+ re-derive the chosen interpolation mode pair, then replace frame 1 by a
+ weighted average of itself and the neighbours implied by that mode
+ (weight .5 for "average" modes, 1 for direct copies). */
+void perform_interp_relaxation(float features[4][NB_TOTAL_FEATURES], const float *mem) {
+ int id0, id1;
+ int best_id;
+ int i;
+ float count, count_1;
+ best_id = double_interp_search(features, mem);
+ /* Undo the compaction applied by double_interp_search. */
+ best_id += (best_id >= FORBIDDEN_INTERP);
+ id0 = best_id / 3;
+ id1 = best_id % 3;
+ count = 1;
+ if (id0 != 1) {
+ float t = (id0==0) ? .5f : 1.f;
+ for (i=0;i<NB_BANDS;i++) features[1][i] += t*features[0][i];
+ count += t;
+ }
+ if (id1 != 2) {
+ float t = (id1==0) ? .5f : 1.f;
+ for (i=0;i<NB_BANDS;i++) features[1][i] += t*features[2][i];
+ count += t;
+ }
+ count_1 = 1.f/count;
+ for (i=0;i<NB_BANDS;i++) features[1][i] *= count_1;
+}
+
+/* Minimal MSB-first bit packer used to build the 64-bit encoded frame. */
+typedef struct {
+ int byte_pos; /* index of the byte currently being filled */
+ int bit_pos; /* bits already written into that byte */
+ int max_bytes; /* capacity of the output buffer */
+ unsigned char *chars; /* output buffer (not owned) */
+} packer;
+
+
+/* Initialize a packer over the caller-provided buffer and zero it so that
+ bits_pack() can OR bits in without clearing first. */
+void bits_packer_init(packer *bits, unsigned char *buf, int size) {
+ bits->byte_pos = 0;
+ bits->bit_pos = 0;
+ bits->max_bytes = size;
+ bits->chars = buf;
+ RNN_CLEAR(buf, size);
+}
+
+/* Append the nb_bits least-significant bits of data, MSB first. On buffer
+ overflow the remaining bits are dropped with a diagnostic rather than
+ writing out of bounds. */
+void bits_pack(packer *bits, unsigned int data, int nb_bits) {
+ while(nb_bits)
+ {
+ int bit;
+ if (bits->byte_pos == bits->max_bytes) {
+ fprintf(stderr, "something went horribly wrong\n");
+ return;
+ }
+ bit = (data>>(nb_bits-1))&1;
+ bits->chars[bits->byte_pos] |= bit<<(BITS_PER_CHAR-1-bits->bit_pos);
+ bits->bit_pos++;
+
+ if (bits->bit_pos==BITS_PER_CHAR)
+ {
+ bits->bit_pos=0;
+ bits->byte_pos++;
+ /* Redundant with the init-time clear, but keeps the next byte safe
+ even if the buffer was reused without re-initialization. */
+ if (bits->byte_pos < bits->max_bytes) bits->chars[bits->byte_pos] = 0;
+ }
+ nb_bits--;
+ }
+}
+
+#endif
+
+/* Size in bytes of an LPCNetEncState, for callers allocating it themselves. */
+LPCNET_EXPORT int lpcnet_encoder_get_size() {
+ return sizeof(LPCNetEncState);
+}
+
+/* Reset an encoder state to all-zeros, then seed the excitation memory with
+ the mu-law code for silence. Always returns 0. */
+LPCNET_EXPORT int lpcnet_encoder_init(LPCNetEncState *st) {
+ memset(st, 0, sizeof(*st));
+ st->exc_mem = lin2ulaw(0.f);
+ return 0;
+}
+
+/* Allocate and initialize a fresh encoder state.
+ Returns NULL if the allocation fails (previously the NULL pointer was
+ passed straight to lpcnet_encoder_init, which would dereference it). */
+LPCNET_EXPORT LPCNetEncState *lpcnet_encoder_create() {
+ LPCNetEncState *st;
+ st = malloc(lpcnet_encoder_get_size());
+ if (st == NULL) return NULL;
+ lpcnet_encoder_init(st);
+ return st;
+}
+
+/* Free a state obtained from lpcnet_encoder_create(). NULL is safe. */
+LPCNET_EXPORT void lpcnet_encoder_destroy(LPCNetEncState *st) {
+ free(st);
+}
+
+/* Windowed FFT analysis of one frame: prepend the saved overlap, window,
+ transform, and compute per-band energies. Updates st->analysis_mem with
+ the tail of the input for the next call. */
+static void frame_analysis(LPCNetEncState *st, kiss_fft_cpx *X, float *Ex, const float *in) {
+ float x[WINDOW_SIZE];
+ RNN_COPY(x, st->analysis_mem, OVERLAP_SIZE);
+ RNN_COPY(&x[OVERLAP_SIZE], in, FRAME_SIZE);
+ RNN_COPY(st->analysis_mem, &in[FRAME_SIZE-OVERLAP_SIZE], OVERLAP_SIZE);
+ apply_window(x);
+ forward_transform(X, x);
+ lpcn_compute_band_energy(Ex, X);
+}
+
+/* Per-frame feature extraction: band log-energies -> DCT cepstrum (stored in
+ st->features[st->pcount]), LPC derived from the cepstrum, an LPC residual
+ ("excitation") buffer for pitch analysis, and normalized half-frame
+ cross-correlations written into st->xc. */
+void compute_frame_features(LPCNetEncState *st, const float *in) {
+ float aligned_in[FRAME_SIZE];
+ int i;
+ float E = 0;
+ float Ly[NB_BANDS];
+ float follow, logMax;
+ kiss_fft_cpx X[FREQ_SIZE];
+ float Ex[NB_BANDS];
+ float xcorr[PITCH_MAX_PERIOD];
+ float ener0;
+ int sub;
+ float ener;
+ /* Rebuild the TRAINING_OFFSET-shifted frame from the analysis overlap. */
+ RNN_COPY(aligned_in, &st->analysis_mem[OVERLAP_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
+ frame_analysis(st, X, Ex, in);
+ logMax = -2;
+ follow = -2;
+ /* Log band energies with a floor that limits downward jumps. */
+ for (i=0;i<NB_BANDS;i++) {
+ Ly[i] = log10(1e-2+Ex[i]);
+ Ly[i] = MAX16(logMax-8, MAX16(follow-2.5f, Ly[i]));
+ logMax = MAX16(logMax, Ly[i]);
+ follow = MAX16(follow-2.5f, Ly[i]);
+ E += Ex[i];
+ }
+ dct(st->features[st->pcount], Ly);
+ st->features[st->pcount][0] -= 4;
+ lpc_from_cepstrum(st->lpc, st->features[st->pcount]);
+ for (i=0;i<LPC_ORDER;i++) st->features[st->pcount][NB_BANDS+2+i] = st->lpc[i];
+ RNN_MOVE(st->exc_buf, &st->exc_buf[FRAME_SIZE], PITCH_MAX_PERIOD);
+ RNN_COPY(&aligned_in[TRAINING_OFFSET], in, FRAME_SIZE-TRAINING_OFFSET);
+ /* LPC analysis filter plus a mild .7 comb to build the excitation signal. */
+ for (i=0;i<FRAME_SIZE;i++) {
+ int j;
+ float sum = aligned_in[i];
+ for (j=0;j<LPC_ORDER;j++)
+ sum += st->lpc[j]*st->pitch_mem[j];
+ RNN_MOVE(st->pitch_mem+1, st->pitch_mem, LPC_ORDER-1);
+ st->pitch_mem[0] = aligned_in[i];
+ st->exc_buf[PITCH_MAX_PERIOD+i] = sum + .7f*st->pitch_filt;
+ st->pitch_filt = sum;
+ /*printf("%f\n", st->exc_buf[PITCH_MAX_PERIOD+i]);*/
+ }
+ /* Cross-correlation on half-frames. */
+ for (sub=0;sub<2;sub++) {
+ int off = sub*FRAME_SIZE/2;
+ double ener1;
+ celt_pitch_xcorr(&st->exc_buf[PITCH_MAX_PERIOD+off], st->exc_buf+off, xcorr, FRAME_SIZE/2, PITCH_MAX_PERIOD);
+ ener0 = celt_inner_prod(&st->exc_buf[PITCH_MAX_PERIOD+off], &st->exc_buf[PITCH_MAX_PERIOD+off], FRAME_SIZE/2);
+ ener1 = celt_inner_prod(&st->exc_buf[off], &st->exc_buf[off], FRAME_SIZE/2-1);
+ st->frame_weight[2+2*st->pcount+sub] = ener0;
+ /*printf("%f\n", st->frame_weight[2+2*st->pcount+sub]);*/
+ /* ener1 is maintained as a sliding-window energy for each lag. */
+ for (i=0;i<PITCH_MAX_PERIOD;i++) {
+ ener1 += st->exc_buf[i+off+FRAME_SIZE/2-1]*st->exc_buf[i+off+FRAME_SIZE/2-1];
+ ener = 1 + ener0 + ener1;
+ st->xc[2+2*st->pcount+sub][i] = 2*xcorr[i] / ener;
+ ener1 -= st->exc_buf[i+off]*st->exc_buf[i+off];
+ }
+ if (1) {
+ /* Upsample correlation by 3x and keep the max. */
+ float interpolated[PITCH_MAX_PERIOD]={0};
+ /* interp=sinc([-3:3]+1/3).*(.5+.5*cos(pi*[-3:3]/4.5)); interp=interp/sum(interp); */
+ static const float interp[7] = {0.026184f, -0.098339f, 0.369938f, 0.837891f, -0.184969f, 0.070242f, -0.020947f};
+ for (i=4;i<PITCH_MAX_PERIOD-4;i++) {
+ float val1=0, val2=0;
+ int j;
+ for (j=0;j<7;j++) {
+ val1 += st->xc[2+2*st->pcount+sub][i-3+j]*interp[j];
+ val2 += st->xc[2+2*st->pcount+sub][i+3-j]*interp[j];
+ /* NOTE(review): this store sits inside the j loop; only the final
+ (j==6) iteration uses the complete sums, so the result is correct
+ but it could be hoisted below the loop. */
+ interpolated[i] = MAX16(st->xc[2+2*st->pcount+sub][i], MAX16(val1, val2));
+ }
+ }
+ for (i=4;i<PITCH_MAX_PERIOD-4;i++) {
+ st->xc[2+2*st->pcount+sub][i] = interpolated[i];
+ }
+ }
+#if 0
+ for (i=0;i<PITCH_MAX_PERIOD;i++)
+ printf("%f ", st->xc[2*st->pcount+sub][i]);
+ printf("\n");
+#endif
+ }
+}
+
+/* Process a 4-frame (8 half-frame) superframe: Viterbi pitch tracking over
+ the stored correlations, linear fit of the pitch contour, derivation of
+ per-frame pitch/correlation features, and optionally quantization plus
+ 64-bit packing (encode) or raw feature dumping (ffeat). */
+void process_superframe(LPCNetEncState *st, unsigned char *buf, FILE *ffeat, int encode, int quantize) {
+ int i;
+ int sub;
+ int best_i;
+ int best[10];
+ int pitch_prev[8][PITCH_MAX_PERIOD];
+ float best_a=0;
+ float best_b=0;
+ float w;
+ float sx=0, sxx=0, sxy=0, sy=0, sw=0;
+ float frame_corr;
+ int voiced;
+ float frame_weight_sum = 1e-15f;
+ float center_pitch;
+ int main_pitch;
+ int modulation;
+ int corr_id = 0;
+#ifndef OPUS_BUILD
+ int c0_id=0;
+ int vq_end[3]={0};
+ int vq_mid=0;
+ int interp_id=0;
+#endif
+ /* Normalize the per-half-frame weights so they sum to 8. */
+ for(sub=0;sub<8;sub++) frame_weight_sum += st->frame_weight[2+sub];
+ for(sub=0;sub<8;sub++) st->frame_weight[2+sub] *= (8.f/frame_weight_sum);
+ /* Forward Viterbi pass over the 8 half-frames. */
+ for(sub=0;sub<8;sub++) {
+ float max_path_all = -1e15f;
+ best_i = 0;
+ /* Penalize lags whose half-lag correlates nearly as well (octave errors). */
+ for (i=0;i<PITCH_MAX_PERIOD-2*PITCH_MIN_PERIOD;i++) {
+ float xc_half = MAX16(MAX16(st->xc[2+sub][(PITCH_MAX_PERIOD+i)/2], st->xc[2+sub][(PITCH_MAX_PERIOD+i+2)/2]), st->xc[2+sub][(PITCH_MAX_PERIOD+i-1)/2]);
+ if (st->xc[2+sub][i] < xc_half*1.1f) st->xc[2+sub][i] *= .8f;
+ }
+ for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) {
+ int j;
+ float max_prev;
+ /* Allow a jump from anywhere at a fixed cost of 6. */
+ max_prev = st->pitch_max_path_all - 6.f;
+ pitch_prev[sub][i] = st->best_i;
+ /* Transitions of up to +/-4 lags with a quadratic cost. */
+ for (j=IMAX(-4, -i);j<=4 && i+j<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;j++) {
+ if (st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j) > max_prev) {
+ max_prev = st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j);
+ pitch_prev[sub][i] = i+j;
+ }
+ }
+ st->pitch_max_path[1][i] = max_prev + st->frame_weight[2+sub]*st->xc[2+sub][i];
+ if (st->pitch_max_path[1][i] > max_path_all) {
+ max_path_all = st->pitch_max_path[1][i];
+ best_i = i;
+ }
+ }
+ /* Renormalize. */
+ for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) st->pitch_max_path[1][i] -= max_path_all;
+ /*for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);
+ printf("\n");*/
+ RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD);
+ st->pitch_max_path_all = max_path_all;
+ st->best_i = best_i;
+ }
+ best_i = st->best_i;
+ frame_corr = 0;
+ /* Backward pass. */
+ for (sub=7;sub>=0;sub--) {
+ best[2+sub] = PITCH_MAX_PERIOD-best_i;
+ frame_corr += st->frame_weight[2+sub]*st->xc[2+sub][best_i];
+ best_i = pitch_prev[sub][best_i];
+ }
+ frame_corr /= 8;
+ if (quantize && frame_corr < 0) frame_corr = 0;
+ for (sub=0;sub<8;sub++) {
+ /*printf("%d %f\n", best[2+sub], frame_corr);*/
+ }
+ /*printf("\n");*/
+ /* Accumulate weighted sums for the least-squares pitch-contour fit. */
+ for (sub=2;sub<10;sub++) {
+ w = st->frame_weight[sub];
+ sw += w;
+ sx += w*sub;
+ sxx += w*sub*sub;
+ sxy += w*sub*best[sub];
+ sy += w*best[sub];
+ }
+ voiced = frame_corr >= .3;
+ /* Linear regression to figure out the pitch contour. */
+ best_a = (sw*sxy - sx*sy)/(sw*sxx - sx*sx);
+ if (voiced) {
+ float max_a;
+ float mean_pitch = sy/sw;
+ /* Allow a relative variation of up to 1/4 over 8 sub-frames. */
+ max_a = mean_pitch/32;
+ best_a = MIN16(max_a, MAX16(-max_a, best_a));
+ /* NOTE(review): at frame_corr exactly 1.0 this gives corr_id==4, which
+ does not fit the 2-bit field packed below -- confirm frame_corr < 1. */
+ corr_id = (int)floor((frame_corr-.3f)/.175f);
+ if (quantize) frame_corr = 0.3875f + .175f*corr_id;
+ } else {
+ best_a = 0;
+ corr_id = (int)floor(frame_corr/.075f);
+ if (quantize) frame_corr = 0.0375f + .075f*corr_id;
+ }
+ /*best_b = (sxx*sy - sx*sxy)/(sw*sxx - sx*sx);*/
+ best_b = (sy - best_a*sx)/sw;
+ /* Quantizing the pitch as "main" pitch + slope. */
+ center_pitch = best_b+5.5f*best_a;
+ /* Log-domain pitch: 21 steps per octave (1.442695041 = 1/ln(2)). */
+ main_pitch = (int)floor(.5 + 21.*1.442695041*log(center_pitch/PITCH_MIN_PERIOD));
+ main_pitch = IMAX(0, IMIN(63, main_pitch));
+ modulation = (int)floor(.5 + 16*7*best_a/center_pitch);
+ modulation = IMAX(-3, IMIN(3, modulation));
+ /*printf("%d %d\n", main_pitch, modulation);*/
+ /*printf("%f %f\n", best_a/center_pitch, best_corr);*/
+ /*for (sub=2;sub<10;sub++) printf("%f %d %f\n", best_b + sub*best_a, best[sub], best_corr);*/
+ /* Per-frame pitch/correlation features (quantized or raw). */
+ for (sub=0;sub<4;sub++) {
+ if (quantize) {
+ float p = pow(2.f, main_pitch/21.)*PITCH_MIN_PERIOD;
+ p *= 1.f + modulation/16.f/7.f*(2*sub-3);
+ p = MIN16(255, MAX16(33, p));
+ st->features[sub][NB_BANDS] = .02f*(p-100);
+ st->features[sub][NB_BANDS + 1] = frame_corr-.5f;
+ } else {
+ st->features[sub][NB_BANDS] = .01f*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);
+ st->features[sub][NB_BANDS + 1] = frame_corr-.5f;
+ }
+ /*printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);*/
+ }
+ /*printf("%d %f %f %f\n", best_period, best_a, best_b, best_corr);*/
+ /* Shift the two look-ahead correlation rows down for the next superframe. */
+ RNN_COPY(&st->xc[0][0], &st->xc[8][0], PITCH_MAX_PERIOD);
+ RNN_COPY(&st->xc[1][0], &st->xc[9][0], PITCH_MAX_PERIOD);
+#ifndef OPUS_BUILD
+ if (quantize) {
+ /*printf("%f\n", st->features[3][0]);*/
+ c0_id = (int)floor(.5 + st->features[3][0]*4);
+ c0_id = IMAX(-64, IMIN(63, c0_id));
+ st->features[3][0] = c0_id/4.f;
+ quantize_3stage_mbest(&st->features[3][1], vq_end);
+ /*perform_interp_relaxation(st->features, st->vq_mem);*/
+ quantize_diff(&st->features[1][0], st->vq_mem, &st->features[3][0], ceps_codebook_diff4, 12, 1, &vq_mid);
+ interp_id = double_interp_search(st->features, st->vq_mem);
+ perform_double_interp(st->features, st->vq_mem, interp_id);
+ }
+#endif
+ for (sub=0;sub<4;sub++) {
+ lpc_from_cepstrum(st->lpc, st->features[sub]);
+ for (i=0;i<LPC_ORDER;i++) st->features[sub][NB_BANDS+2+i] = st->lpc[i];
+ }
+ /*printf("\n");*/
+ RNN_COPY(st->vq_mem, &st->features[3][0], NB_BANDS);
+ if (encode) {
+#ifndef OPUS_BUILD
+ packer bits;
+ /*fprintf(stdout, "%d %d %d %d %d %d %d %d %d\n", c0_id+64, main_pitch, voiced ? modulation+4 : 0, corr_id, vq_end[0], vq_end[1], vq_end[2], vq_mid, interp_id);*/
+ /* 7+6+3+2+10+10+10+13+3 = 64 bits = 8 bytes. */
+ bits_packer_init(&bits, buf, 8);
+ bits_pack(&bits, c0_id+64, 7);
+ bits_pack(&bits, main_pitch, 6);
+ bits_pack(&bits, voiced ? modulation+4 : 0, 3);
+ bits_pack(&bits, corr_id, 2);
+ bits_pack(&bits, vq_end[0], 10);
+ bits_pack(&bits, vq_end[1], 10);
+ bits_pack(&bits, vq_end[2], 10);
+ bits_pack(&bits, vq_mid, 13);
+ bits_pack(&bits, interp_id, 3);
+ if (ffeat) fwrite(buf, 1, 8, ffeat);
+#else
+ (void)buf;
+#endif
+ } else if (ffeat) {
+ for (i=0;i<4;i++) {
+ fwrite(st->features[i], sizeof(float), NB_TOTAL_FEATURES, ffeat);
+ }
+ }
+}
+
+
+/* Unquantized variant of process_superframe: same Viterbi pitch tracking
+ over 8 half-frames, but no contour fit or quantization -- just per-frame
+ pitch/correlation features, optionally dumped to ffeat. */
+void process_multi_frame(LPCNetEncState *st, FILE *ffeat) {
+ int i;
+ int sub;
+ int best_i;
+ int best[10];
+ int pitch_prev[8][PITCH_MAX_PERIOD];
+ float frame_corr;
+ float frame_weight_sum = 1e-15f;
+ for(sub=0;sub<8;sub++) frame_weight_sum += st->frame_weight[2+sub];
+ for(sub=0;sub<8;sub++) st->frame_weight[2+sub] *= (8.f/frame_weight_sum);
+ /* Forward Viterbi pass (see process_superframe). */
+ for(sub=0;sub<8;sub++) {
+ float max_path_all = -1e15f;
+ best_i = 0;
+ for (i=0;i<PITCH_MAX_PERIOD-2*PITCH_MIN_PERIOD;i++) {
+ float xc_half = MAX16(MAX16(st->xc[2+sub][(PITCH_MAX_PERIOD+i)/2], st->xc[2+sub][(PITCH_MAX_PERIOD+i+2)/2]), st->xc[2+sub][(PITCH_MAX_PERIOD+i-1)/2]);
+ if (st->xc[2+sub][i] < xc_half*1.1) st->xc[2+sub][i] *= .8f;
+ }
+ for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) {
+ int j;
+ float max_prev;
+ max_prev = st->pitch_max_path_all - 6.f;
+ pitch_prev[sub][i] = st->best_i;
+ for (j=IMAX(-4, -i);j<=4 && i+j<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;j++) {
+ if (st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j) > max_prev) {
+ max_prev = st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j);
+ pitch_prev[sub][i] = i+j;
+ }
+ }
+ st->pitch_max_path[1][i] = max_prev + st->frame_weight[2+sub]*st->xc[2+sub][i];
+ if (st->pitch_max_path[1][i] > max_path_all) {
+ max_path_all = st->pitch_max_path[1][i];
+ best_i = i;
+ }
+ }
+ /* Renormalize. */
+ for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) st->pitch_max_path[1][i] -= max_path_all;
+ /*for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);
+ printf("\n");*/
+ RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD);
+ st->pitch_max_path_all = max_path_all;
+ st->best_i = best_i;
+ }
+ best_i = st->best_i;
+ frame_corr = 0;
+ /* Backward pass. */
+ for (sub=7;sub>=0;sub--) {
+ best[2+sub] = PITCH_MAX_PERIOD-best_i;
+ frame_corr += st->frame_weight[2+sub]*st->xc[2+sub][best_i];
+ best_i = pitch_prev[sub][best_i];
+ }
+ frame_corr /= 8;
+ for (sub=0;sub<4;sub++) {
+ st->features[sub][NB_BANDS] = .01f*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);
+ st->features[sub][NB_BANDS + 1] = frame_corr-.5f;
+ /*printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);*/
+ }
+ /*printf("%d %f %f %f\n", best_period, best_a, best_b, best_corr);*/
+ RNN_COPY(&st->xc[0][0], &st->xc[8][0], PITCH_MAX_PERIOD);
+ RNN_COPY(&st->xc[1][0], &st->xc[9][0], PITCH_MAX_PERIOD);
+ /*printf("\n");*/
+ RNN_COPY(st->vq_mem, &st->features[3][0], NB_BANDS);
+ if (ffeat) {
+ for (i=0;i<4;i++) {
+ fwrite(st->features[i], sizeof(float), NB_TOTAL_FEATURES, ffeat);
+ }
+ }
+}
+
+/* Single-frame (2 half-frame) version of the pitch tracker, used by the
+ streaming single-frame API and the PLC. Fills the pitch and correlation
+ features of st->features[st->pcount]; optionally dumps them to ffeat. */
+void process_single_frame(LPCNetEncState *st, FILE *ffeat) {
+ int i;
+ int sub;
+ int best_i;
+ int best[4];
+ int pitch_prev[2][PITCH_MAX_PERIOD];
+ float frame_corr;
+ float frame_weight_sum = 1e-15f;
+ for(sub=0;sub<2;sub++) frame_weight_sum += st->frame_weight[2+2*st->pcount+sub];
+ for(sub=0;sub<2;sub++) st->frame_weight[2+2*st->pcount+sub] *= (2.f/frame_weight_sum);
+ /* Forward Viterbi pass over the two half-frames (see process_superframe). */
+ for(sub=0;sub<2;sub++) {
+ float max_path_all = -1e15f;
+ best_i = 0;
+ for (i=0;i<PITCH_MAX_PERIOD-2*PITCH_MIN_PERIOD;i++) {
+ float xc_half = MAX16(MAX16(st->xc[2+2*st->pcount+sub][(PITCH_MAX_PERIOD+i)/2], st->xc[2+2*st->pcount+sub][(PITCH_MAX_PERIOD+i+2)/2]), st->xc[2+2*st->pcount+sub][(PITCH_MAX_PERIOD+i-1)/2]);
+ if (st->xc[2+2*st->pcount+sub][i] < xc_half*1.1f) st->xc[2+2*st->pcount+sub][i] *= .8f;
+ }
+ for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) {
+ int j;
+ float max_prev;
+ max_prev = st->pitch_max_path_all - 6.f;
+ pitch_prev[sub][i] = st->best_i;
+ for (j=IMAX(-4, -i);j<=4 && i+j<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;j++) {
+ if (st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j) > max_prev) {
+ max_prev = st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j);
+ pitch_prev[sub][i] = i+j;
+ }
+ }
+ st->pitch_max_path[1][i] = max_prev + st->frame_weight[2+2*st->pcount+sub]*st->xc[2+2*st->pcount+sub][i];
+ if (st->pitch_max_path[1][i] > max_path_all) {
+ max_path_all = st->pitch_max_path[1][i];
+ best_i = i;
+ }
+ }
+ /* Renormalize. */
+ for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) st->pitch_max_path[1][i] -= max_path_all;
+ /*for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);
+ printf("\n");*/
+ RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD);
+ st->pitch_max_path_all = max_path_all;
+ st->best_i = best_i;
+ }
+ best_i = st->best_i;
+ frame_corr = 0;
+ /* Backward pass. */
+ for (sub=1;sub>=0;sub--) {
+ best[2+sub] = PITCH_MAX_PERIOD-best_i;
+ frame_corr += st->frame_weight[2+2*st->pcount+sub]*st->xc[2+2*st->pcount+sub][best_i];
+ best_i = pitch_prev[sub][best_i];
+ }
+ frame_corr /= 2;
+ st->features[st->pcount][NB_BANDS] = .01f*(IMAX(66, IMIN(510, best[2]+best[3]))-200);
+ st->features[st->pcount][NB_BANDS + 1] = frame_corr-.5f;
+ if (ffeat) {
+ fwrite(st->features[st->pcount], sizeof(float), NB_TOTAL_FEATURES, ffeat);
+ }
+}
+
+/* First-order pre-emphasis filter: y[n] = x[n] - coef*x[n-1], where *mem
+ carries -coef * (last sample of the previous call) across frames.
+ Operating in place (y == x) is safe because x[n] is read before y[n]
+ is written. */
+void preemphasis(float *y, float *mem, const float *x, float coef, int N) {
+ int n;
+ for (n=0;n<N;n++) {
+ float out = x[n] + *mem;
+ *mem = -coef*x[n];
+ y[n] = out;
+ }
+}
+
+/* Encode 4 frames of 16-bit PCM into one 8-byte packet: pre-emphasize and
+ extract features frame by frame, then quantize/pack the superframe.
+ Always returns 0. */
+LPCNET_EXPORT int lpcnet_encode(LPCNetEncState *st, const short *pcm, unsigned char *buf) {
+ int i, k;
+ for (k=0;k<4;k++) {
+ float x[FRAME_SIZE];
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[k*FRAME_SIZE + i];
+ preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ st->pcount = k;
+ compute_frame_features(st, x);
+ }
+ process_superframe(st, buf, NULL, 1, 1);
+ return 0;
+}
+
+/* Compute unquantized features for 4 frames of PCM (same pipeline as
+ lpcnet_encode but without quantization or packing) and copy them into the
+ caller's array. Always returns 0. */
+LPCNET_EXPORT int lpcnet_compute_features(LPCNetEncState *st, const short *pcm, float features[4][NB_TOTAL_FEATURES]) {
+ int i, k;
+ for (k=0;k<4;k++) {
+ float x[FRAME_SIZE];
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[k*FRAME_SIZE + i];
+ preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ st->pcount = k;
+ compute_frame_features(st, x);
+ }
+ process_superframe(st, NULL, NULL, 0, 0);
+ for (k=0;k<4;k++) {
+ RNN_COPY(&features[k][0], &st->features[k][0], NB_TOTAL_FEATURES);
+ }
+ return 0;
+}
+
+/* Shared body of the single-frame feature APIs. Pre-emphasizes x in place
+ (x is modified), extracts features for one frame and copies them out.
+ Assumes st->pcount is 0 so features land in st->features[0]. */
+static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) {
+ preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(st, x);
+ process_single_frame(st, NULL);
+ RNN_COPY(features, &st->features[0][0], NB_TOTAL_FEATURES);
+ return 0;
+}
+
+/* Single-frame feature extraction from 16-bit PCM. Always returns 0. */
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const short *pcm, float features[NB_TOTAL_FEATURES]) {
+ int i;
+ float x[FRAME_SIZE];
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ lpcnet_compute_single_frame_features_impl(st, x, features);
+ return 0;
+}
+
+/* Same as lpcnet_compute_single_frame_features but for float input samples
+ (expected in the same 16-bit range). Always returns 0. */
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]) {
+ int i;
+ float x[FRAME_SIZE];
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ lpcnet_compute_single_frame_features_impl(st, x, features);
+ return 0;
+}
diff --git a/dnn/lpcnet_plc.c b/dnn/lpcnet_plc.c
new file mode 100644
index 00000000..a104c1d9
--- /dev/null
+++ b/dnn/lpcnet_plc.c
@@ -0,0 +1,503 @@
+/* Copyright (c) 2021 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "lpcnet_private.h"
+#include "lpcnet.h"
+#include "plc_data.h"
+
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+/* Comment this out to have LPCNet update its state on every good packet (slow). */
+#define PLC_SKIP_UPDATES
+
+/* Size in bytes of an LPCNetPLCState, for callers allocating it themselves. */
+LPCNET_EXPORT int lpcnet_plc_get_size() {
+ return sizeof(LPCNetPLCState);
+}
+
+/* Reset the dynamic portion of the PLC state (everything at and after the
+ LPCNET_PLC_RESET_START marker field), leaving configuration and loaded
+ model weights intact, then re-initialize the embedded synthesizer and
+ encoder states. */
+LPCNET_EXPORT void lpcnet_plc_reset(LPCNetPLCState *st) {
+ RNN_CLEAR((char*)&st->LPCNET_PLC_RESET_START,
+ sizeof(LPCNetPLCState)-
+ ((char*)&st->LPCNET_PLC_RESET_START - (char*)st));
+ lpcnet_reset(&st->lpcnet);
+ lpcnet_encoder_init(&st->enc);
+ RNN_CLEAR(st->pcm, PLC_BUF_SIZE);
+ st->pcm_fill = PLC_BUF_SIZE;
+ st->skip_analysis = 0;
+ st->blend = 0;
+ st->loss_count = 0;
+ st->dc_mem = 0;
+ st->queued_update = 0;
+}
+
+/* Initialize a PLC state. The low 2 bits of options select the mode
+ (causal / non-causal / codec); LPCNET_PLC_DC_FILTER enables DC removal.
+ Returns 0 on success, -1 on an invalid mode. Unless USE_WEIGHTS_FILE is
+ set, the built-in PLC model weights are loaded here. */
+LPCNET_EXPORT int lpcnet_plc_init(LPCNetPLCState *st, int options) {
+ int ret;
+ lpcnet_init(&st->lpcnet);
+ lpcnet_encoder_init(&st->enc);
+ if ((options&0x3) == LPCNET_PLC_CAUSAL) {
+ st->enable_blending = 1;
+ st->non_causal = 0;
+ } else if ((options&0x3) == LPCNET_PLC_NONCAUSAL) {
+ st->enable_blending = 1;
+ st->non_causal = 1;
+ } else if ((options&0x3) == LPCNET_PLC_CODEC) {
+ st->enable_blending = 0;
+ st->non_causal = 0;
+ } else {
+ return -1;
+ }
+ st->remove_dc = !!(options&LPCNET_PLC_DC_FILTER);
+#ifndef USE_WEIGHTS_FILE
+ ret = init_plc_model(&st->model, lpcnet_plc_arrays);
+#else
+ ret = 0;
+#endif
+ celt_assert(ret == 0);
+ lpcnet_plc_reset(st);
+ return ret;
+}
+
+/* Load PLC model weights from an external blob: parse the weight list, load
+ the PLC network, then (on success) the underlying LPCNet model from the
+ same blob. Returns 0 on success, -1 on failure. */
+LPCNET_EXPORT int lpcnet_plc_load_model(LPCNetPLCState *st, const unsigned char *data, int len) {
+ WeightArray *list;
+ int ret;
+ parse_weights(&list, data, len);
+ ret = init_plc_model(&st->model, list);
+ free(list);
+ if (ret == 0) {
+ return lpcnet_load_model(&st->lpcnet, data, len);
+ }
+ else return -1;
+}
+
+/* Allocate and initialize a PLC state for the given options.
+ Returns NULL if allocation fails or the options are invalid (previously a
+ failed calloc was dereferenced and an init failure was silently ignored). */
+LPCNET_EXPORT LPCNetPLCState *lpcnet_plc_create(int options) {
+ LPCNetPLCState *st;
+ st = calloc(sizeof(*st), 1);
+ if (st == NULL) return NULL;
+ if (lpcnet_plc_init(st, options) != 0) {
+ free(st);
+ return NULL;
+ }
+ return st;
+}
+
+/* Free a state obtained from lpcnet_plc_create(). NULL is safe. */
+LPCNET_EXPORT void lpcnet_plc_destroy(LPCNetPLCState *st) {
+ free(st);
+}
+
+/* Append one FEC feature vector to the FIFO. A NULL argument means "one
+ frame of FEC is missing" and bumps fec_skip instead. When the buffer is
+ full, already-consumed entries before fec_keep_pos are compacted away;
+ if nothing can be discarded the new vector is dropped with a warning. */
+void lpcnet_plc_fec_add(LPCNetPLCState *st, const float *features) {
+ if (features == NULL) {
+ st->fec_skip++;
+ return;
+ }
+ if (st->fec_fill_pos == PLC_MAX_FEC) {
+ if (st->fec_keep_pos == 0) {
+ fprintf(stderr, "FEC buffer full\n");
+ return;
+ }
+ RNN_MOVE(&st->fec[0][0], &st->fec[st->fec_keep_pos][0], (st->fec_fill_pos-st->fec_keep_pos)*NB_FEATURES);
+ st->fec_fill_pos = st->fec_fill_pos-st->fec_keep_pos;
+ st->fec_read_pos -= st->fec_keep_pos;
+ st->fec_keep_pos = 0;
+ }
+ RNN_COPY(&st->fec[st->fec_fill_pos][0], features, NB_FEATURES);
+ st->fec_fill_pos++;
+}
+
+/* Discard all queued FEC data and reset the FIFO cursors. */
+void lpcnet_plc_fec_clear(LPCNetPLCState *st) {
+ st->fec_keep_pos = st->fec_read_pos = st->fec_fill_pos = st-> fec_skip = 0;
+}
+
+
+/* Run the PLC prediction network (dense -> GRU1 -> GRU2 -> dense) on one
+ input vector, producing a predicted feature vector in out and advancing
+ the recurrent state in st->plc_net. */
+static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) {
+ float zeros[3*PLC_MAX_RNN_NEURONS] = {0};
+ float dense_out[PLC_DENSE1_OUT_SIZE];
+ PLCNetState *net = &st->plc_net;
+ _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in);
+ compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out);
+ compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
+ _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state);
+ /* Artificially boost the correlation to make harmonics cleaner. */
+ out[19] = MIN16(.5f, out[19]+.1f);
+}
+
+/* Produce features for a lost frame: prefer a queued FEC vector (returns 1),
+ otherwise fall back to the PLC network's prediction (returns 0). In the
+ FEC case the network state is still advanced, with the FEC features as
+ input (no Burg features), so the prediction stays in sync. */
+static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
+ if (st->fec_read_pos != st->fec_fill_pos && st->fec_skip==0) {
+ float plc_features[2*NB_BANDS+NB_FEATURES+1] = {0};
+ float discard[NB_FEATURES];
+ RNN_COPY(out, &st->fec[st->fec_read_pos][0], NB_FEATURES);
+ st->fec_read_pos++;
+ /* Make sure we can rewind a few frames back at resync time. */
+ st->fec_keep_pos = IMAX(0, IMAX(st->fec_keep_pos, st->fec_read_pos-FEATURES_DELAY-1));
+ /* Update PLC state using FEC, so without Burg features. */
+ RNN_COPY(&plc_features[2*NB_BANDS], out, NB_FEATURES);
+ plc_features[2*NB_BANDS+NB_FEATURES] = -1;
+ compute_plc_pred(st, discard, plc_features);
+ return 1;
+ } else {
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
+ compute_plc_pred(st, out, zeros);
+ if (st->fec_skip > 0) st->fec_skip--;
+ return 0;
+ }
+}
+
+/* Step the FEC read cursor back by offset frames, clamped so it never moves
+ before the oldest retained entry. */
+static void fec_rewind(LPCNetPLCState *st, int offset) {
+ st->fec_read_pos -= offset;
+ if (st->fec_read_pos < st->fec_keep_pos) {
+ st->fec_read_pos = st->fec_keep_pos;
+ }
+}
+
+/* Zero the LPCNet synthesizer's signal-dependent state (sample history,
+ excitation, de-emphasis memory and GRU states) without touching the PLC
+ network or configuration. */
+void clear_state(LPCNetPLCState *st) {
+ RNN_CLEAR(st->lpcnet.last_sig, LPC_ORDER);
+ st->lpcnet.last_exc = lin2ulaw(0.f);
+ st->lpcnet.deemph_mem = 0;
+ RNN_CLEAR(st->lpcnet.nnet.gru_a_state, GRU_A_STATE_SIZE);
+ RNN_CLEAR(st->lpcnet.nnet.gru_b_state, GRU_B_STATE_SIZE);
+}
+
+#define DC_CONST 0.003
+
+/* In this causal version of the code, the DNN model implemented by compute_plc_pred()
+ needs to generate two feature vectors to conceal the first lost packet.*/
+
+/* Feed one good received frame to the causal PLC. Removes DC if enabled,
+ computes Burg features, and -- if the previous frame was concealed --
+ cross-fades the synthesized signal into the received one before updating
+ the encoder/synthesizer state. Modifies pcm in place; returns 0. */
+static int lpcnet_plc_update_causal(LPCNetPLCState *st, short *pcm) {
+ int i;
+ float x[FRAME_SIZE];
+ short output[FRAME_SIZE];
+ float plc_features[2*NB_BANDS+NB_FEATURES+1];
+ short lp[FRAME_SIZE]={0};
+ int delta = 0;
+ if (st->remove_dc) {
+ /* Fold the DC accumulated during concealment into the tracker, then
+ subtract the slowly-varying DC estimate from the input. */
+ st->dc_mem += st->syn_dc;
+ delta = st->syn_dc;
+ st->syn_dc = 0;
+ for (i=0;i<FRAME_SIZE;i++) {
+ lp[i] = (int)floor(.5 + st->dc_mem);
+ st->dc_mem += DC_CONST*(pcm[i] - st->dc_mem);
+ pcm[i] -= lp[i];
+ }
+ }
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ burg_cepstral_analysis(plc_features, x);
+ st->enc.pcount = 0;
+ if (st->skip_analysis) {
+ /*fprintf(stderr, "skip update\n");*/
+ if (st->blend) {
+ /* First good frame after a loss: blend synthesis with the real signal. */
+ short tmp[FRAME_SIZE-TRAINING_OFFSET];
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
+ RNN_COPY(zeros, plc_features, 2*NB_BANDS);
+ zeros[2*NB_BANDS+NB_FEATURES] = 1;
+ if (st->enable_blending) {
+ LPCNetState copy;
+ st->plc_net = st->plc_copy[FEATURES_DELAY];
+ compute_plc_pred(st, st->features, zeros);
+ for (i=0;i<FEATURES_DELAY;i++) {
+ /* FIXME: backtrack state, replace features. */
+ run_frame_network_deferred(&st->lpcnet, st->features);
+ }
+ copy = st->lpcnet;
+ lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], tmp, FRAME_SIZE-TRAINING_OFFSET, 0);
+ /* Raised-cosine cross-fade from synthesized to received samples. */
+ for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) {
+ float w;
+ w = .5 - .5*cos(M_PI*i/(FRAME_SIZE-TRAINING_OFFSET));
+ pcm[i] = (int)floor(.5 + w*pcm[i] + (1-w)*(tmp[i]-delta));
+ }
+ st->lpcnet = copy;
+ lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+ } else {
+ if (FEATURES_DELAY > 0) st->plc_net = st->plc_copy[FEATURES_DELAY-1];
+ fec_rewind(st, FEATURES_DELAY);
+#ifdef PLC_SKIP_UPDATES
+ lpcnet_reset_signal(&st->lpcnet);
+#else
+ RNN_COPY(tmp, pcm, FRAME_SIZE-TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, tmp, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+#endif
+ }
+ RNN_COPY(st->pcm, &pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
+ st->pcm_fill = TRAINING_OFFSET;
+ } else {
+ RNN_COPY(&st->pcm[st->pcm_fill], pcm, FRAME_SIZE);
+ st->pcm_fill += FRAME_SIZE;
+ }
+ }
+ /* Update state. */
+ /*fprintf(stderr, "update state\n");*/
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+ if (!st->blend) {
+ RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);
+ plc_features[2*NB_BANDS+NB_FEATURES] = 1;
+ compute_plc_pred(st, st->features, plc_features);
+ /* Discard an FEC frame that we know we will no longer need. */
+ if (st->fec_skip) st->fec_skip--;
+ else if (st->fec_read_pos < st->fec_fill_pos) st->fec_read_pos++;
+ st->fec_keep_pos = IMAX(0, IMAX(st->fec_keep_pos, st->fec_read_pos-FEATURES_DELAY-1));
+ }
+ if (st->skip_analysis) {
+ if (st->enable_blending) {
+ /* FIXME: backtrack state, replace features. */
+ run_frame_network_deferred(&st->lpcnet, st->enc.features[0]);
+ }
+ st->skip_analysis--;
+ } else {
+ for (i=0;i<FRAME_SIZE;i++) st->pcm[PLC_BUF_SIZE+i] = pcm[i];
+ RNN_COPY(output, &st->pcm[0], FRAME_SIZE);
+#ifdef PLC_SKIP_UPDATES
+ {
+ run_frame_network_deferred(&st->lpcnet, st->enc.features[0]);
+ }
+#else
+ lpcnet_synthesize_impl(&st->lpcnet, st->enc.features[0], output, FRAME_SIZE, FRAME_SIZE);
+#endif
+ RNN_MOVE(st->pcm, &st->pcm[FRAME_SIZE], PLC_BUF_SIZE);
+ }
+ st->loss_count = 0;
+ if (st->remove_dc) {
+ /* Re-insert the DC estimate removed at the top. */
+ for (i=0;i<FRAME_SIZE;i++) {
+ pcm[i] += lp[i];
+ }
+ }
+ st->blend = 0;
+ return 0;
+}
+
+/* Energy attenuation (in cepstral c0 units) applied per consecutive lost
+ frame; losses beyond index 9 fall off faster below. */
+static const float att_table[10] = {0, 0, -.2, -.2, -.4, -.4, -.8, -.8, -1.6, -1.6};
+/* Conceal one lost frame in causal mode: flush pending synthesis from the
+ buffered PCM, then synthesize a new frame from FEC features when available
+ or the PLC network's prediction otherwise, attenuating with loss count.
+ Writes FRAME_SIZE samples to pcm; returns 0. */
+static int lpcnet_plc_conceal_causal(LPCNetPLCState *st, short *pcm) {
+ int i;
+ short output[FRAME_SIZE];
+ run_frame_network_flush(&st->lpcnet);
+ st->enc.pcount = 0;
+ /* If we concealed the previous frame, finish synthesizing the rest of the samples. */
+ /* FIXME: Copy/predict features. */
+ while (st->pcm_fill > 0) {
+ /*fprintf(stderr, "update state for PLC %d\n", st->pcm_fill);*/
+ int update_count;
+ update_count = IMIN(st->pcm_fill, FRAME_SIZE);
+ RNN_COPY(output, &st->pcm[0], update_count);
+ RNN_MOVE(&st->plc_copy[1], &st->plc_copy[0], FEATURES_DELAY);
+ st->plc_copy[0] = st->plc_net;
+ get_fec_or_pred(st, st->features);
+ lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], output, update_count, update_count);
+ RNN_MOVE(st->pcm, &st->pcm[FRAME_SIZE], PLC_BUF_SIZE);
+ st->pcm_fill -= update_count;
+ st->skip_analysis++;
+ }
+ /* Snapshot the PLC network state so blending can rewind at resync. */
+ RNN_MOVE(&st->plc_copy[1], &st->plc_copy[0], FEATURES_DELAY);
+ st->plc_copy[0] = st->plc_net;
+ lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, 0);
+ if (get_fec_or_pred(st, st->features)) st->loss_count = 0;
+ else st->loss_count++;
+ /* Fade out progressively on long loss bursts. */
+ if (st->loss_count >= 10) st->features[0] = MAX16(-10, st->features[0]+att_table[9] - 2*(st->loss_count-9));
+ else st->features[0] = MAX16(-10, st->features[0]+att_table[st->loss_count]);
+ lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], &pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, 0);
+ {
+ float x[FRAME_SIZE];
+ /* FIXME: Can we do better? */
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+ }
+ st->blend = 1;
+ if (st->remove_dc) {
+ /* Track the DC of the synthesized signal and add back the estimate. */
+ for (i=0;i<FRAME_SIZE;i++) {
+ st->syn_dc += DC_CONST*(pcm[i] - st->syn_dc);
+ pcm[i] += (int)floor(.5 + st->dc_mem);
+ }
+ }
+ return 0;
+}
+
+/* In this non-causal version of the code, the DNN model implemented by compute_plc_pred()
+ is always called once per frame. We process audio up to the current position minus TRAINING_OFFSET. */
+
+void process_queued_update(LPCNetPLCState *st) {
+ if (st->queued_update) {
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, st->queued_samples, FRAME_SIZE, FRAME_SIZE);
+ st->queued_update=0;
+ }
+}
+
+static int lpcnet_plc_update_non_causal(LPCNetPLCState *st, short *pcm) {
+ int i;
+ float x[FRAME_SIZE];
+ short pcm_save[FRAME_SIZE];
+ float plc_features[2*NB_BANDS+NB_FEATURES+1];
+ short lp[FRAME_SIZE]={0};
+ double mem_bak=0;
+ int delta = st->syn_dc;
+ if (FEATURES_DELAY != 0) {
+ fprintf(stderr, "Non-causal PLC cannot work with non-zero FEATURES_DELAY\n");
+ fprintf(stderr, "Recompile with a no-lookahead model (see README.md)\n");
+ exit(1);
+ }
+ process_queued_update(st);
+ if (st->remove_dc) {
+ st->dc_mem += st->syn_dc;
+ st->syn_dc = 0;
+ mem_bak = st->dc_mem;
+ for (i=0;i<FRAME_SIZE;i++) {
+ lp[i] = (int)floor(.5 + st->dc_mem);
+ st->dc_mem += DC_CONST*(pcm[i] - st->dc_mem);
+ pcm[i] -= lp[i];
+ }
+ }
+ RNN_COPY(pcm_save, pcm, FRAME_SIZE);
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ burg_cepstral_analysis(plc_features, x);
+ st->enc.pcount = 0;
+ if (st->loss_count > 0) {
+ LPCNetState copy;
+ /* Handle blending. */
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
+ RNN_COPY(zeros, plc_features, 2*NB_BANDS);
+ zeros[2*NB_BANDS+NB_FEATURES] = 1;
+ compute_plc_pred(st, st->features, zeros);
+ copy = st->lpcnet;
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, 0);
+ /* Undo initial DC offset removal so that we can take into account the last 5ms of synthesis. */
+ if (st->remove_dc) {
+ for (i=0;i<FRAME_SIZE;i++) pcm[i] += lp[i];
+ st->dc_mem = mem_bak;
+ for (i=0;i<TRAINING_OFFSET;i++) st->syn_dc += DC_CONST*(st->pcm[FRAME_SIZE-TRAINING_OFFSET+i] - st->syn_dc);
+ st->dc_mem += st->syn_dc;
+ delta += st->syn_dc;
+ st->syn_dc = 0;
+ for (i=0;i<FRAME_SIZE;i++) {
+ lp[i] = (int)floor(.5 + st->dc_mem);
+ st->dc_mem += DC_CONST*(pcm[i] - st->dc_mem);
+ pcm[i] -= lp[i];
+ }
+ RNN_COPY(pcm_save, pcm, FRAME_SIZE);
+ }
+ {
+ short rev[FRAME_SIZE];
+ for (i=0;i<FRAME_SIZE;i++) rev[i] = pcm[FRAME_SIZE-i-1];
+ clear_state(st);
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, rev, FRAME_SIZE, FRAME_SIZE);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, rev, TRAINING_OFFSET, 0);
+ for (i=0;i<TRAINING_OFFSET;i++) {
+ float w;
+ w = .5 - .5*cos(M_PI*i/(TRAINING_OFFSET));
+ st->pcm[FRAME_SIZE-1-i] = (int)floor(.5 + w*st->pcm[FRAME_SIZE-1-i] + (1-w)*(rev[i]+delta));
+ }
+
+ }
+ st->lpcnet = copy;
+#if 1
+ st->queued_update = 1;
+ RNN_COPY(&st->queued_samples[0], &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
+ RNN_COPY(&st->queued_samples[TRAINING_OFFSET], pcm, FRAME_SIZE-TRAINING_OFFSET);
+#else
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+#endif
+ for (i=0;i<FRAME_SIZE;i++) x[i] = st->pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+
+ }
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+ if (st->loss_count == 0) {
+ RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);
+ plc_features[2*NB_BANDS+NB_FEATURES] = 1;
+ compute_plc_pred(st, st->features, plc_features);
+ lpcnet_synthesize_impl(&st->lpcnet, st->enc.features[0], &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+ }
+ RNN_COPY(&pcm[FRAME_SIZE-TRAINING_OFFSET], pcm, TRAINING_OFFSET);
+ RNN_COPY(pcm, &st->pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET);
+ RNN_COPY(st->pcm, pcm_save, FRAME_SIZE);
+ st->loss_count = 0;
+ if (st->remove_dc) {
+ for (i=0;i<TRAINING_OFFSET;i++) pcm[i] += st->dc_buf[i];
+ for (;i<FRAME_SIZE;i++) pcm[i] += lp[i-TRAINING_OFFSET];
+ for (i=0;i<TRAINING_OFFSET;i++) st->dc_buf[i] = lp[FRAME_SIZE-TRAINING_OFFSET+i];
+ }
+ return 0;
+}
+
+static int lpcnet_plc_conceal_non_causal(LPCNetPLCState *st, short *pcm) {
+ int i;
+ float x[FRAME_SIZE];
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
+ process_queued_update(st);
+ st->enc.pcount = 0;
+
+ compute_plc_pred(st, st->features, zeros);
+ if (st->loss_count >= 10) st->features[0] = MAX16(-10, st->features[0]+att_table[9] - 2*(st->loss_count-9));
+ else st->features[0] = MAX16(-10, st->features[0]+att_table[st->loss_count]);
+
+ if (st->loss_count == 0) {
+ RNN_COPY(pcm, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, &pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET, 0);
+ } else {
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, pcm, TRAINING_OFFSET, 0);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, &pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET, 0);
+
+ RNN_COPY(&st->pcm[FRAME_SIZE-TRAINING_OFFSET], pcm, TRAINING_OFFSET);
+ for (i=0;i<FRAME_SIZE;i++) x[i] = st->pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+ }
+ RNN_COPY(st->pcm, &pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET);
+
+ if (st->remove_dc) {
+ int dc = (int)floor(.5 + st->dc_mem);
+ if (st->loss_count == 0) {
+ for (i=TRAINING_OFFSET;i<FRAME_SIZE;i++) st->syn_dc += DC_CONST*(pcm[i] - st->syn_dc);
+ } else {
+ for (i=0;i<FRAME_SIZE;i++) st->syn_dc += DC_CONST*(pcm[i] - st->syn_dc);
+ }
+ for (i=0;i<TRAINING_OFFSET;i++) pcm[i] += st->dc_buf[i];
+ for (;i<FRAME_SIZE;i++) pcm[i] += dc;
+ for (i=0;i<TRAINING_OFFSET;i++) st->dc_buf[i] = dc;
+ }
+ st->loss_count++;
+ return 0;
+}
+
+
+LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm) {
+ if (st->non_causal) return lpcnet_plc_update_non_causal(st, pcm);
+ else return lpcnet_plc_update_causal(st, pcm);
+}
+
+LPCNET_EXPORT int lpcnet_plc_conceal(LPCNetPLCState *st, short *pcm) {
+ if (st->non_causal) return lpcnet_plc_conceal_non_causal(st, pcm);
+ else return lpcnet_plc_conceal_causal(st, pcm);
+}
diff --git a/dnn/lpcnet_private.h b/dnn/lpcnet_private.h
new file mode 100644
index 00000000..3a85847e
--- /dev/null
+++ b/dnn/lpcnet_private.h
@@ -0,0 +1,142 @@
+#ifndef LPCNET_PRIVATE_H
+#define LPCNET_PRIVATE_H
+
+#include <stdio.h>
+#include "common.h"
+#include "freq.h"
+#include "lpcnet.h"
+#include "nnet_data.h"
+#include "plc_data.h"
+#include "kiss99.h"
+
+#define BITS_PER_CHAR 8
+
+#define PITCH_MIN_PERIOD 32
+#define PITCH_MAX_PERIOD 256
+
+#define PITCH_FRAME_SIZE 320
+#define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)
+
+#define MULTI 4
+#define MULTI_MASK (MULTI-1)
+
+#define FORBIDDEN_INTERP 7
+
+#define PLC_MAX_FEC 100
+#define MAX_FEATURE_BUFFER_SIZE 4
+
+struct LPCNetState {
+ LPCNetModel model;
+ float sampling_logit_table[256];
+ kiss99_ctx rng;
+
+#define LPCNET_RESET_START nnet
+ NNetState nnet;
+ int last_exc;
+ float last_sig[LPC_ORDER];
+ float feature_buffer[NB_FEATURES*MAX_FEATURE_BUFFER_SIZE];
+ int feature_buffer_fill;
+ float last_features[NB_FEATURES];
+#if FEATURES_DELAY>0
+ float old_lpc[FEATURES_DELAY][LPC_ORDER];
+#endif
+ float gru_a_condition[3*GRU_A_STATE_SIZE];
+ float gru_b_condition[3*GRU_B_STATE_SIZE];
+ int frame_count;
+ float deemph_mem;
+ float lpc[LPC_ORDER];
+};
+
+struct LPCNetDecState {
+ LPCNetState lpcnet_state;
+ float vq_mem[NB_BANDS];
+};
+
+struct LPCNetEncState{
+ float analysis_mem[OVERLAP_SIZE];
+ float mem_preemph;
+ int pcount;
+ float pitch_mem[LPC_ORDER];
+ float pitch_filt;
+ float xc[10][PITCH_MAX_PERIOD+1];
+ float frame_weight[10];
+ float exc_buf[PITCH_BUF_SIZE];
+ float pitch_max_path[2][PITCH_MAX_PERIOD];
+ float pitch_max_path_all;
+ int best_i;
+ float last_gain;
+ int last_period;
+ float lpc[LPC_ORDER];
+ float vq_mem[NB_BANDS];
+ float features[4][NB_TOTAL_FEATURES];
+ float sig_mem[LPC_ORDER];
+ int exc_mem;
+ float burg_cepstrum[2*NB_BANDS];
+};
+
+#define PLC_BUF_SIZE (FEATURES_DELAY*FRAME_SIZE + TRAINING_OFFSET)
+struct LPCNetPLCState {
+ PLCModel model;
+ LPCNetState lpcnet;
+ LPCNetEncState enc;
+ int enable_blending;
+ int non_causal;
+ int remove_dc;
+
+#define LPCNET_PLC_RESET_START fec
+ float fec[PLC_MAX_FEC][NB_FEATURES];
+ int fec_keep_pos;
+ int fec_read_pos;
+ int fec_fill_pos;
+ int fec_skip;
+ short pcm[PLC_BUF_SIZE+FRAME_SIZE];
+ int pcm_fill;
+ int skip_analysis;
+ int blend;
+ float features[NB_TOTAL_FEATURES];
+ int loss_count;
+ PLCNetState plc_net;
+ PLCNetState plc_copy[FEATURES_DELAY+1];
+ double dc_mem;
+ double syn_dc;
+
+ short dc_buf[TRAINING_OFFSET];
+ int queued_update;
+ short queued_samples[FRAME_SIZE];
+};
+
+#ifndef OPUS_BUILD
+extern float ceps_codebook1[];
+extern float ceps_codebook2[];
+extern float ceps_codebook3[];
+extern float ceps_codebook_diff4[];
+#endif
+
+void preemphasis(float *y, float *mem, const float *x, float coef, int N);
+
+void perform_double_interp(float features[4][NB_TOTAL_FEATURES], const float *mem, int best_id);
+
+void process_superframe(LPCNetEncState *st, unsigned char *buf, FILE *ffeat, int encode, int quantize);
+
+void compute_frame_features(LPCNetEncState *st, const float *in);
+
+void decode_packet(float features[4][NB_TOTAL_FEATURES], float *vq_mem, const unsigned char buf[8]);
+
+void lpcnet_reset_signal(LPCNetState *lpcnet);
+void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
+void run_frame_network_deferred(LPCNetState *lpcnet, const float *features);
+void run_frame_network_flush(LPCNetState *lpcnet);
+
+
+void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, short *output, int N, int preload);
+void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, short *output, int N, int preload);
+void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const short *pcm_in, short *output, int N);
+void process_single_frame(LPCNetEncState *st, FILE *ffeat);
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const short *pcm, float features[NB_TOTAL_FEATURES]);
+
+void process_single_frame(LPCNetEncState *st, FILE *ffeat);
+
+void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
+
+int parse_weights(WeightArray **list, const unsigned char *data, int len);
+#endif
diff --git a/dnn/lpcnet_tables.c b/dnn/lpcnet_tables.c
new file mode 100644
index 00000000..467cb473
--- /dev/null
+++ b/dnn/lpcnet_tables.c
@@ -0,0 +1,307 @@
+/* The contents of this file was automatically generated by dump_lpcnet_tables.c*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include "kiss_fft.h"
+
+static const arch_fft_state arch_fft = {0, NULL};
+
+static const opus_int16 fft_bitrev[320] = {
+0, 64, 128, 192, 256, 16, 80, 144, 208, 272, 32, 96, 160, 224, 288,
+48, 112, 176, 240, 304, 4, 68, 132, 196, 260, 20, 84, 148, 212, 276,
+36, 100, 164, 228, 292, 52, 116, 180, 244, 308, 8, 72, 136, 200, 264,
+24, 88, 152, 216, 280, 40, 104, 168, 232, 296, 56, 120, 184, 248, 312,
+12, 76, 140, 204, 268, 28, 92, 156, 220, 284, 44, 108, 172, 236, 300,
+60, 124, 188, 252, 316, 1, 65, 129, 193, 257, 17, 81, 145, 209, 273,
+33, 97, 161, 225, 289, 49, 113, 177, 241, 305, 5, 69, 133, 197, 261,
+21, 85, 149, 213, 277, 37, 101, 165, 229, 293, 53, 117, 181, 245, 309,
+9, 73, 137, 201, 265, 25, 89, 153, 217, 281, 41, 105, 169, 233, 297,
+57, 121, 185, 249, 313, 13, 77, 141, 205, 269, 29, 93, 157, 221, 285,
+45, 109, 173, 237, 301, 61, 125, 189, 253, 317, 2, 66, 130, 194, 258,
+18, 82, 146, 210, 274, 34, 98, 162, 226, 290, 50, 114, 178, 242, 306,
+6, 70, 134, 198, 262, 22, 86, 150, 214, 278, 38, 102, 166, 230, 294,
+54, 118, 182, 246, 310, 10, 74, 138, 202, 266, 26, 90, 154, 218, 282,
+42, 106, 170, 234, 298, 58, 122, 186, 250, 314, 14, 78, 142, 206, 270,
+30, 94, 158, 222, 286, 46, 110, 174, 238, 302, 62, 126, 190, 254, 318,
+3, 67, 131, 195, 259, 19, 83, 147, 211, 275, 35, 99, 163, 227, 291,
+51, 115, 179, 243, 307, 7, 71, 135, 199, 263, 23, 87, 151, 215, 279,
+39, 103, 167, 231, 295, 55, 119, 183, 247, 311, 11, 75, 139, 203, 267,
+27, 91, 155, 219, 283, 43, 107, 171, 235, 299, 59, 123, 187, 251, 315,
+15, 79, 143, 207, 271, 31, 95, 159, 223, 287, 47, 111, 175, 239, 303,
+63, 127, 191, 255, 319, };
+
+static const kiss_twiddle_cpx fft_twiddles[320] = {
+{1.00000000f, -0.00000000f}, {0.999807239f, -0.0196336918f},
+{0.999229014f, -0.0392598175f}, {0.998265624f, -0.0588708036f},
+{0.996917307f, -0.0784590989f}, {0.995184720f, -0.0980171412f},
+{0.993068457f, -0.117537394f}, {0.990569353f, -0.137012348f},
+{0.987688363f, -0.156434461f}, {0.984426558f, -0.175796285f},
+{0.980785251f, -0.195090324f}, {0.976765871f, -0.214309156f},
+{0.972369909f, -0.233445361f}, {0.967599094f, -0.252491564f},
+{0.962455213f, -0.271440446f}, {0.956940353f, -0.290284663f},
+{0.951056540f, -0.309017003f}, {0.944806039f, -0.327630192f},
+{0.938191354f, -0.346117049f}, {0.931214929f, -0.364470512f},
+{0.923879504f, -0.382683426f}, {0.916187942f, -0.400748819f},
+{0.908143163f, -0.418659747f}, {0.899748266f, -0.436409235f},
+{0.891006529f, -0.453990489f}, {0.881921291f, -0.471396744f},
+{0.872496009f, -0.488621235f}, {0.862734377f, -0.505657375f},
+{0.852640152f, -0.522498548f}, {0.842217207f, -0.539138317f},
+{0.831469595f, -0.555570245f}, {0.820401430f, -0.571787953f},
+{0.809017003f, -0.587785244f}, {0.797320664f, -0.603555918f},
+{0.785316944f, -0.619093955f}, {0.773010433f, -0.634393275f},
+{0.760405958f, -0.649448037f}, {0.747508347f, -0.664252460f},
+{0.734322488f, -0.678800762f}, {0.720853567f, -0.693087339f},
+{0.707106769f, -0.707106769f}, {0.693087339f, -0.720853567f},
+{0.678800762f, -0.734322488f}, {0.664252460f, -0.747508347f},
+{0.649448037f, -0.760405958f}, {0.634393275f, -0.773010433f},
+{0.619093955f, -0.785316944f}, {0.603555918f, -0.797320664f},
+{0.587785244f, -0.809017003f}, {0.571787953f, -0.820401430f},
+{0.555570245f, -0.831469595f}, {0.539138317f, -0.842217207f},
+{0.522498548f, -0.852640152f}, {0.505657375f, -0.862734377f},
+{0.488621235f, -0.872496009f}, {0.471396744f, -0.881921291f},
+{0.453990489f, -0.891006529f}, {0.436409235f, -0.899748266f},
+{0.418659747f, -0.908143163f}, {0.400748819f, -0.916187942f},
+{0.382683426f, -0.923879504f}, {0.364470512f, -0.931214929f},
+{0.346117049f, -0.938191354f}, {0.327630192f, -0.944806039f},
+{0.309017003f, -0.951056540f}, {0.290284663f, -0.956940353f},
+{0.271440446f, -0.962455213f}, {0.252491564f, -0.967599094f},
+{0.233445361f, -0.972369909f}, {0.214309156f, -0.976765871f},
+{0.195090324f, -0.980785251f}, {0.175796285f, -0.984426558f},
+{0.156434461f, -0.987688363f}, {0.137012348f, -0.990569353f},
+{0.117537394f, -0.993068457f}, {0.0980171412f, -0.995184720f},
+{0.0784590989f, -0.996917307f}, {0.0588708036f, -0.998265624f},
+{0.0392598175f, -0.999229014f}, {0.0196336918f, -0.999807239f},
+{6.12323426e-17f, -1.00000000f}, {-0.0196336918f, -0.999807239f},
+{-0.0392598175f, -0.999229014f}, {-0.0588708036f, -0.998265624f},
+{-0.0784590989f, -0.996917307f}, {-0.0980171412f, -0.995184720f},
+{-0.117537394f, -0.993068457f}, {-0.137012348f, -0.990569353f},
+{-0.156434461f, -0.987688363f}, {-0.175796285f, -0.984426558f},
+{-0.195090324f, -0.980785251f}, {-0.214309156f, -0.976765871f},
+{-0.233445361f, -0.972369909f}, {-0.252491564f, -0.967599094f},
+{-0.271440446f, -0.962455213f}, {-0.290284663f, -0.956940353f},
+{-0.309017003f, -0.951056540f}, {-0.327630192f, -0.944806039f},
+{-0.346117049f, -0.938191354f}, {-0.364470512f, -0.931214929f},
+{-0.382683426f, -0.923879504f}, {-0.400748819f, -0.916187942f},
+{-0.418659747f, -0.908143163f}, {-0.436409235f, -0.899748266f},
+{-0.453990489f, -0.891006529f}, {-0.471396744f, -0.881921291f},
+{-0.488621235f, -0.872496009f}, {-0.505657375f, -0.862734377f},
+{-0.522498548f, -0.852640152f}, {-0.539138317f, -0.842217207f},
+{-0.555570245f, -0.831469595f}, {-0.571787953f, -0.820401430f},
+{-0.587785244f, -0.809017003f}, {-0.603555918f, -0.797320664f},
+{-0.619093955f, -0.785316944f}, {-0.634393275f, -0.773010433f},
+{-0.649448037f, -0.760405958f}, {-0.664252460f, -0.747508347f},
+{-0.678800762f, -0.734322488f}, {-0.693087339f, -0.720853567f},
+{-0.707106769f, -0.707106769f}, {-0.720853567f, -0.693087339f},
+{-0.734322488f, -0.678800762f}, {-0.747508347f, -0.664252460f},
+{-0.760405958f, -0.649448037f}, {-0.773010433f, -0.634393275f},
+{-0.785316944f, -0.619093955f}, {-0.797320664f, -0.603555918f},
+{-0.809017003f, -0.587785244f}, {-0.820401430f, -0.571787953f},
+{-0.831469595f, -0.555570245f}, {-0.842217207f, -0.539138317f},
+{-0.852640152f, -0.522498548f}, {-0.862734377f, -0.505657375f},
+{-0.872496009f, -0.488621235f}, {-0.881921291f, -0.471396744f},
+{-0.891006529f, -0.453990489f}, {-0.899748266f, -0.436409235f},
+{-0.908143163f, -0.418659747f}, {-0.916187942f, -0.400748819f},
+{-0.923879504f, -0.382683426f}, {-0.931214929f, -0.364470512f},
+{-0.938191354f, -0.346117049f}, {-0.944806039f, -0.327630192f},
+{-0.951056540f, -0.309017003f}, {-0.956940353f, -0.290284663f},
+{-0.962455213f, -0.271440446f}, {-0.967599094f, -0.252491564f},
+{-0.972369909f, -0.233445361f}, {-0.976765871f, -0.214309156f},
+{-0.980785251f, -0.195090324f}, {-0.984426558f, -0.175796285f},
+{-0.987688363f, -0.156434461f}, {-0.990569353f, -0.137012348f},
+{-0.993068457f, -0.117537394f}, {-0.995184720f, -0.0980171412f},
+{-0.996917307f, -0.0784590989f}, {-0.998265624f, -0.0588708036f},
+{-0.999229014f, -0.0392598175f}, {-0.999807239f, -0.0196336918f},
+{-1.00000000f, -1.22464685e-16f}, {-0.999807239f, 0.0196336918f},
+{-0.999229014f, 0.0392598175f}, {-0.998265624f, 0.0588708036f},
+{-0.996917307f, 0.0784590989f}, {-0.995184720f, 0.0980171412f},
+{-0.993068457f, 0.117537394f}, {-0.990569353f, 0.137012348f},
+{-0.987688363f, 0.156434461f}, {-0.984426558f, 0.175796285f},
+{-0.980785251f, 0.195090324f}, {-0.976765871f, 0.214309156f},
+{-0.972369909f, 0.233445361f}, {-0.967599094f, 0.252491564f},
+{-0.962455213f, 0.271440446f}, {-0.956940353f, 0.290284663f},
+{-0.951056540f, 0.309017003f}, {-0.944806039f, 0.327630192f},
+{-0.938191354f, 0.346117049f}, {-0.931214929f, 0.364470512f},
+{-0.923879504f, 0.382683426f}, {-0.916187942f, 0.400748819f},
+{-0.908143163f, 0.418659747f}, {-0.899748266f, 0.436409235f},
+{-0.891006529f, 0.453990489f}, {-0.881921291f, 0.471396744f},
+{-0.872496009f, 0.488621235f}, {-0.862734377f, 0.505657375f},
+{-0.852640152f, 0.522498548f}, {-0.842217207f, 0.539138317f},
+{-0.831469595f, 0.555570245f}, {-0.820401430f, 0.571787953f},
+{-0.809017003f, 0.587785244f}, {-0.797320664f, 0.603555918f},
+{-0.785316944f, 0.619093955f}, {-0.773010433f, 0.634393275f},
+{-0.760405958f, 0.649448037f}, {-0.747508347f, 0.664252460f},
+{-0.734322488f, 0.678800762f}, {-0.720853567f, 0.693087339f},
+{-0.707106769f, 0.707106769f}, {-0.693087339f, 0.720853567f},
+{-0.678800762f, 0.734322488f}, {-0.664252460f, 0.747508347f},
+{-0.649448037f, 0.760405958f}, {-0.634393275f, 0.773010433f},
+{-0.619093955f, 0.785316944f}, {-0.603555918f, 0.797320664f},
+{-0.587785244f, 0.809017003f}, {-0.571787953f, 0.820401430f},
+{-0.555570245f, 0.831469595f}, {-0.539138317f, 0.842217207f},
+{-0.522498548f, 0.852640152f}, {-0.505657375f, 0.862734377f},
+{-0.488621235f, 0.872496009f}, {-0.471396744f, 0.881921291f},
+{-0.453990489f, 0.891006529f}, {-0.436409235f, 0.899748266f},
+{-0.418659747f, 0.908143163f}, {-0.400748819f, 0.916187942f},
+{-0.382683426f, 0.923879504f}, {-0.364470512f, 0.931214929f},
+{-0.346117049f, 0.938191354f}, {-0.327630192f, 0.944806039f},
+{-0.309017003f, 0.951056540f}, {-0.290284663f, 0.956940353f},
+{-0.271440446f, 0.962455213f}, {-0.252491564f, 0.967599094f},
+{-0.233445361f, 0.972369909f}, {-0.214309156f, 0.976765871f},
+{-0.195090324f, 0.980785251f}, {-0.175796285f, 0.984426558f},
+{-0.156434461f, 0.987688363f}, {-0.137012348f, 0.990569353f},
+{-0.117537394f, 0.993068457f}, {-0.0980171412f, 0.995184720f},
+{-0.0784590989f, 0.996917307f}, {-0.0588708036f, 0.998265624f},
+{-0.0392598175f, 0.999229014f}, {-0.0196336918f, 0.999807239f},
+{-1.83697015e-16f, 1.00000000f}, {0.0196336918f, 0.999807239f},
+{0.0392598175f, 0.999229014f}, {0.0588708036f, 0.998265624f},
+{0.0784590989f, 0.996917307f}, {0.0980171412f, 0.995184720f},
+{0.117537394f, 0.993068457f}, {0.137012348f, 0.990569353f},
+{0.156434461f, 0.987688363f}, {0.175796285f, 0.984426558f},
+{0.195090324f, 0.980785251f}, {0.214309156f, 0.976765871f},
+{0.233445361f, 0.972369909f}, {0.252491564f, 0.967599094f},
+{0.271440446f, 0.962455213f}, {0.290284663f, 0.956940353f},
+{0.309017003f, 0.951056540f}, {0.327630192f, 0.944806039f},
+{0.346117049f, 0.938191354f}, {0.364470512f, 0.931214929f},
+{0.382683426f, 0.923879504f}, {0.400748819f, 0.916187942f},
+{0.418659747f, 0.908143163f}, {0.436409235f, 0.899748266f},
+{0.453990489f, 0.891006529f}, {0.471396744f, 0.881921291f},
+{0.488621235f, 0.872496009f}, {0.505657375f, 0.862734377f},
+{0.522498548f, 0.852640152f}, {0.539138317f, 0.842217207f},
+{0.555570245f, 0.831469595f}, {0.571787953f, 0.820401430f},
+{0.587785244f, 0.809017003f}, {0.603555918f, 0.797320664f},
+{0.619093955f, 0.785316944f}, {0.634393275f, 0.773010433f},
+{0.649448037f, 0.760405958f}, {0.664252460f, 0.747508347f},
+{0.678800762f, 0.734322488f}, {0.693087339f, 0.720853567f},
+{0.707106769f, 0.707106769f}, {0.720853567f, 0.693087339f},
+{0.734322488f, 0.678800762f}, {0.747508347f, 0.664252460f},
+{0.760405958f, 0.649448037f}, {0.773010433f, 0.634393275f},
+{0.785316944f, 0.619093955f}, {0.797320664f, 0.603555918f},
+{0.809017003f, 0.587785244f}, {0.820401430f, 0.571787953f},
+{0.831469595f, 0.555570245f}, {0.842217207f, 0.539138317f},
+{0.852640152f, 0.522498548f}, {0.862734377f, 0.505657375f},
+{0.872496009f, 0.488621235f}, {0.881921291f, 0.471396744f},
+{0.891006529f, 0.453990489f}, {0.899748266f, 0.436409235f},
+{0.908143163f, 0.418659747f}, {0.916187942f, 0.400748819f},
+{0.923879504f, 0.382683426f}, {0.931214929f, 0.364470512f},
+{0.938191354f, 0.346117049f}, {0.944806039f, 0.327630192f},
+{0.951056540f, 0.309017003f}, {0.956940353f, 0.290284663f},
+{0.962455213f, 0.271440446f}, {0.967599094f, 0.252491564f},
+{0.972369909f, 0.233445361f}, {0.976765871f, 0.214309156f},
+{0.980785251f, 0.195090324f}, {0.984426558f, 0.175796285f},
+{0.987688363f, 0.156434461f}, {0.990569353f, 0.137012348f},
+{0.993068457f, 0.117537394f}, {0.995184720f, 0.0980171412f},
+{0.996917307f, 0.0784590989f}, {0.998265624f, 0.0588708036f},
+{0.999229014f, 0.0392598175f}, {0.999807239f, 0.0196336918f},
+};
+
+const kiss_fft_state kfft = {
+320, /* nfft */
+0.0031250000f, /* scale */
+-1, /* shift */
+{5, 64, 4, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev, /* bitrev*/
+fft_twiddles, /* twiddles*/
+(arch_fft_state *)&arch_fft, /* arch_fft*/
+};
+
+const float half_window[] = {
+3.78491532e-05f, 0.000340620492f, 0.000946046319f, 0.00185389258f, 0.00306380726f,
+0.00457531959f, 0.00638783723f, 0.00850064680f, 0.0109129101f, 0.0136236614f,
+0.0166318044f, 0.0199361145f, 0.0235352255f, 0.0274276342f, 0.0316116922f,
+0.0360856056f, 0.0408474281f, 0.0458950549f, 0.0512262285f, 0.0568385124f,
+0.0627293140f, 0.0688958541f, 0.0753351897f, 0.0820441842f, 0.0890194997f,
+0.0962576419f, 0.103754878f, 0.111507311f, 0.119510807f, 0.127761051f,
+0.136253506f, 0.144983411f, 0.153945804f, 0.163135484f, 0.172547072f,
+0.182174906f, 0.192013159f, 0.202055752f, 0.212296382f, 0.222728521f,
+0.233345464f, 0.244140238f, 0.255105674f, 0.266234398f, 0.277518868f,
+0.288951218f, 0.300523549f, 0.312227666f, 0.324055225f, 0.335997701f,
+0.348046392f, 0.360192508f, 0.372427016f, 0.384740859f, 0.397124738f,
+0.409569323f, 0.422065198f, 0.434602767f, 0.447172493f, 0.459764689f,
+0.472369671f, 0.484977663f, 0.497579008f, 0.510163903f, 0.522722721f,
+0.535245717f, 0.547723293f, 0.560145974f, 0.572504222f, 0.584788740f,
+0.596990347f, 0.609099925f, 0.621108532f, 0.633007407f, 0.644788086f,
+0.656442165f, 0.667961538f, 0.679338276f, 0.690564752f, 0.701633692f,
+0.712537885f, 0.723270535f, 0.733825266f, 0.744195819f, 0.754376352f,
+0.764361382f, 0.774145722f, 0.783724606f, 0.793093503f, 0.802248418f,
+0.811185598f, 0.819901764f, 0.828393936f, 0.836659551f, 0.844696403f,
+0.852502763f, 0.860077202f, 0.867418647f, 0.874526560f, 0.881400526f,
+0.888040781f, 0.894447744f, 0.900622249f, 0.906565487f, 0.912279010f,
+0.917764664f, 0.923024654f, 0.928061485f, 0.932878017f, 0.937477291f,
+0.941862822f, 0.946038187f, 0.950007319f, 0.953774393f, 0.957343817f,
+0.960720181f, 0.963908315f, 0.966913164f, 0.969739914f, 0.972393870f,
+0.974880517f, 0.977205336f, 0.979374051f, 0.981392324f, 0.983266115f,
+0.985001266f, 0.986603677f, 0.988079309f, 0.989434063f, 0.990674019f,
+0.991804957f, 0.992832899f, 0.993763626f, 0.994602919f, 0.995356441f,
+0.996029854f, 0.996628702f, 0.997158289f, 0.997623861f, 0.998030603f,
+0.998383403f, 0.998687088f, 0.998946249f, 0.999165416f, 0.999348700f,
+0.999500215f, 0.999623775f, 0.999723017f, 0.999801278f, 0.999861658f,
+0.999907196f, 0.999940455f, 0.999963880f, 0.999979615f, 0.999989510f,
+0.999995291f, 0.999998271f, 0.999999523f, 0.999999940f, 1.00000000f,
+};
+
+const float dct_table[] = {
+0.707106769f, 0.996194720f, 0.984807730f, 0.965925813f, 0.939692616f,
+0.906307817f, 0.866025388f, 0.819152057f, 0.766044438f, 0.707106769f,
+0.642787635f, 0.573576450f, 0.500000000f, 0.422618270f, 0.342020154f,
+0.258819044f, 0.173648179f, 0.0871557444f, 0.707106769f, 0.965925813f,
+0.866025388f, 0.707106769f, 0.500000000f, 0.258819044f, 6.12323426e-17f,
+-0.258819044f, -0.500000000f, -0.707106769f, -0.866025388f, -0.965925813f,
+-1.00000000f, -0.965925813f, -0.866025388f, -0.707106769f, -0.500000000f,
+-0.258819044f, 0.707106769f, 0.906307817f, 0.642787635f, 0.258819044f,
+-0.173648179f, -0.573576450f, -0.866025388f, -0.996194720f, -0.939692616f,
+-0.707106769f, -0.342020154f, 0.0871557444f, 0.500000000f, 0.819152057f,
+0.984807730f, 0.965925813f, 0.766044438f, 0.422618270f, 0.707106769f,
+0.819152057f, 0.342020154f, -0.258819044f, -0.766044438f, -0.996194720f,
+-0.866025388f, -0.422618270f, 0.173648179f, 0.707106769f, 0.984807730f,
+0.906307817f, 0.500000000f, -0.0871557444f, -0.642787635f, -0.965925813f,
+-0.939692616f, -0.573576450f, 0.707106769f, 0.707106769f, 6.12323426e-17f,
+-0.707106769f, -1.00000000f, -0.707106769f, -1.83697015e-16f, 0.707106769f,
+1.00000000f, 0.707106769f, 3.06161700e-16f, -0.707106769f, -1.00000000f,
+-0.707106769f, -4.28626385e-16f, 0.707106769f, 1.00000000f, 0.707106769f,
+0.707106769f, 0.573576450f, -0.342020154f, -0.965925813f, -0.766044438f,
+0.0871557444f, 0.866025388f, 0.906307817f, 0.173648179f, -0.707106769f,
+-0.984807730f, -0.422618270f, 0.500000000f, 0.996194720f, 0.642787635f,
+-0.258819044f, -0.939692616f, -0.819152057f, 0.707106769f, 0.422618270f,
+-0.642787635f, -0.965925813f, -0.173648179f, 0.819152057f, 0.866025388f,
+-0.0871557444f, -0.939692616f, -0.707106769f, 0.342020154f, 0.996194720f,
+0.500000000f, -0.573576450f, -0.984807730f, -0.258819044f, 0.766044438f,
+0.906307817f, 0.707106769f, 0.258819044f, -0.866025388f, -0.707106769f,
+0.500000000f, 0.965925813f, 3.06161700e-16f, -0.965925813f, -0.500000000f,
+0.707106769f, 0.866025388f, -0.258819044f, -1.00000000f, -0.258819044f,
+0.866025388f, 0.707106769f, -0.500000000f, -0.965925813f, 0.707106769f,
+0.0871557444f, -0.984807730f, -0.258819044f, 0.939692616f, 0.422618270f,
+-0.866025388f, -0.573576450f, 0.766044438f, 0.707106769f, -0.642787635f,
+-0.819152057f, 0.500000000f, 0.906307817f, -0.342020154f, -0.965925813f,
+0.173648179f, 0.996194720f, 0.707106769f, -0.0871557444f, -0.984807730f,
+0.258819044f, 0.939692616f, -0.422618270f, -0.866025388f, 0.573576450f,
+0.766044438f, -0.707106769f, -0.642787635f, 0.819152057f, 0.500000000f,
+-0.906307817f, -0.342020154f, 0.965925813f, 0.173648179f, -0.996194720f,
+0.707106769f, -0.258819044f, -0.866025388f, 0.707106769f, 0.500000000f,
+-0.965925813f, -4.28626385e-16f, 0.965925813f, -0.500000000f, -0.707106769f,
+0.866025388f, 0.258819044f, -1.00000000f, 0.258819044f, 0.866025388f,
+-0.707106769f, -0.500000000f, 0.965925813f, 0.707106769f, -0.422618270f,
+-0.642787635f, 0.965925813f, -0.173648179f, -0.819152057f, 0.866025388f,
+0.0871557444f, -0.939692616f, 0.707106769f, 0.342020154f, -0.996194720f,
+0.500000000f, 0.573576450f, -0.984807730f, 0.258819044f, 0.766044438f,
+-0.906307817f, 0.707106769f, -0.573576450f, -0.342020154f, 0.965925813f,
+-0.766044438f, -0.0871557444f, 0.866025388f, -0.906307817f, 0.173648179f,
+0.707106769f, -0.984807730f, 0.422618270f, 0.500000000f, -0.996194720f,
+0.642787635f, 0.258819044f, -0.939692616f, 0.819152057f, 0.707106769f,
+-0.707106769f, -1.83697015e-16f, 0.707106769f, -1.00000000f, 0.707106769f,
+5.51091070e-16f, -0.707106769f, 1.00000000f, -0.707106769f, -2.69484189e-15f,
+0.707106769f, -1.00000000f, 0.707106769f, -4.90477710e-16f, -0.707106769f,
+1.00000000f, -0.707106769f, 0.707106769f, -0.819152057f, 0.342020154f,
+0.258819044f, -0.766044438f, 0.996194720f, -0.866025388f, 0.422618270f,
+0.173648179f, -0.707106769f, 0.984807730f, -0.906307817f, 0.500000000f,
+0.0871557444f, -0.642787635f, 0.965925813f, -0.939692616f, 0.573576450f,
+0.707106769f, -0.906307817f, 0.642787635f, -0.258819044f, -0.173648179f,
+0.573576450f, -0.866025388f, 0.996194720f, -0.939692616f, 0.707106769f,
+-0.342020154f, -0.0871557444f, 0.500000000f, -0.819152057f, 0.984807730f,
+-0.965925813f, 0.766044438f, -0.422618270f, 0.707106769f, -0.965925813f,
+0.866025388f, -0.707106769f, 0.500000000f, -0.258819044f, 1.10280111e-15f,
+0.258819044f, -0.500000000f, 0.707106769f, -0.866025388f, 0.965925813f,
+-1.00000000f, 0.965925813f, -0.866025388f, 0.707106769f, -0.500000000f,
+0.258819044f, 0.707106769f, -0.996194720f, 0.984807730f, -0.965925813f,
+0.939692616f, -0.906307817f, 0.866025388f, -0.819152057f, 0.766044438f,
+-0.707106769f, 0.642787635f, -0.573576450f, 0.500000000f, -0.422618270f,
+0.342020154f, -0.258819044f, 0.173648179f, -0.0871557444f, };
diff --git a/dnn/m4/attributes.m4 b/dnn/m4/attributes.m4
new file mode 100644
index 00000000..ebc7347b
--- /dev/null
+++ b/dnn/m4/attributes.m4
@@ -0,0 +1,321 @@
+dnl Macros to check the presence of generic (non-typed) symbols.
+dnl Copyright (c) 2006-2007 Diego Pettenò <flameeyes@gmail.com>
+dnl Copyright (c) 2006-2007 xine project
+dnl
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 2, or (at your option)
+dnl any later version.
+dnl
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+dnl GNU General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software
+dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+dnl 02110-1301, USA.
+dnl
+dnl As a special exception, the copyright owners of the
+dnl macro gives unlimited permission to copy, distribute and modify the
+dnl configure scripts that are the output of Autoconf when processing the
+dnl Macro. You need not follow the terms of the GNU General Public
+dnl License when using or distributing such scripts, even though portions
+dnl of the text of the Macro appear in them. The GNU General Public
+dnl License (GPL) does govern all other use of the material that
+dnl constitutes the Autoconf Macro.
+dnl
+dnl This special exception to the GPL applies to versions of the
+dnl Autoconf Macro released by this project. When you make and
+dnl distribute a modified version of the Autoconf Macro, you may extend
+dnl this special exception to the GPL to apply to your modified version as
+dnl well.
+
dnl Check if the flag is supported by compiler
dnl CC_CHECK_CFLAGS_SILENT([FLAG], [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND])
dnl Caches the result in cc_cv_cflags_<flag>. A full link test is used so
dnl flags that are only rejected at link time are caught too. Prints no
dnl "checking..." output (hence SILENT); see CC_CHECK_CFLAGS for that.

AC_DEFUN([CC_CHECK_CFLAGS_SILENT], [
  AC_CACHE_VAL(AS_TR_SH([cc_cv_cflags_$1]),
    [ac_save_CFLAGS="$CFLAGS"
     CFLAGS="$CFLAGS $1"
     AC_LINK_IFELSE([AC_LANG_SOURCE([int main() { return 0; }])],
       [eval "AS_TR_SH([cc_cv_cflags_$1])='yes'"],
       [eval "AS_TR_SH([cc_cv_cflags_$1])='no'"])
     CFLAGS="$ac_save_CFLAGS"
    ])

  AS_IF([eval test x$]AS_TR_SH([cc_cv_cflags_$1])[ = xyes],
    [$2], [$3])
])
+
dnl Check if the flag is supported by compiler (cacheable)
dnl CC_CHECK_CFLAGS([FLAG], [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND])
dnl Same test as CC_CHECK_CFLAGS_SILENT, but reports the result through
dnl AC_CACHE_CHECK's "checking ..." output.

AC_DEFUN([CC_CHECK_CFLAGS], [
  AC_CACHE_CHECK([if $CC supports $1 flag],
    AS_TR_SH([cc_cv_cflags_$1]),
    CC_CHECK_CFLAGS_SILENT([$1]) dnl Don't execute actions here!
  )

  AS_IF([eval test x$]AS_TR_SH([cc_cv_cflags_$1])[ = xyes],
    [$2], [$3])
])

dnl CC_CHECK_CFLAG_APPEND(FLAG, [action-if-found], [action-if-not-found])
dnl Check for CFLAG and appends them to CFLAGS if supported
AC_DEFUN([CC_CHECK_CFLAG_APPEND], [
  AC_CACHE_CHECK([if $CC supports $1 flag],
    AS_TR_SH([cc_cv_cflags_$1]),
    CC_CHECK_CFLAGS_SILENT([$1]) dnl Don't execute actions here!
  )

  AS_IF([eval test x$]AS_TR_SH([cc_cv_cflags_$1])[ = xyes],
    [CFLAGS="$CFLAGS $1"; $2], [$3])
])

dnl CC_CHECK_CFLAGS_APPEND([FLAG1 FLAG2], [action-if-found], [action-if-not])
dnl Tests and appends each whitespace-separated flag independently, so one
dnl unsupported flag does not prevent the others from being added.
AC_DEFUN([CC_CHECK_CFLAGS_APPEND], [
  for flag in $1; do
    CC_CHECK_CFLAG_APPEND($flag, [$2], [$3])
  done
])
+
dnl Check if the flag is supported by linker (cacheable)
dnl CC_CHECK_LDFLAGS([FLAG], [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND])
dnl Result is cached in cc_cv_ldflags_<flag> ('yes'/'no'). Fix: the
dnl failure branch previously left the cache variable empty, which made
dnl AC_CACHE_CHECK print a blank result and rerun the link test on every
dnl configure invocation; it now stores 'no' like the CFLAGS checks do.

AC_DEFUN([CC_CHECK_LDFLAGS], [
  AC_CACHE_CHECK([if $CC supports $1 flag],
    AS_TR_SH([cc_cv_ldflags_$1]),
    [ac_save_LDFLAGS="$LDFLAGS"
     LDFLAGS="$LDFLAGS $1"
     AC_LINK_IFELSE([AC_LANG_SOURCE([int main() { return 1; }])],
       [eval "AS_TR_SH([cc_cv_ldflags_$1])='yes'"],
       [eval "AS_TR_SH([cc_cv_ldflags_$1])='no'"])
     LDFLAGS="$ac_save_LDFLAGS"
    ])

  AS_IF([eval test x$]AS_TR_SH([cc_cv_ldflags_$1])[ = xyes],
    [$2], [$3])
])

dnl define the LDFLAGS_NOUNDEFINED variable with the correct value for
dnl the current linker to avoid undefined references in a shared object.
AC_DEFUN([CC_NOUNDEFINED], [
  dnl We check $host for which systems to enable this for.
  AC_REQUIRE([AC_CANONICAL_HOST])

  case $host in
    dnl FreeBSD (et al.) does not complete linking for shared objects when pthreads
    dnl are requested, as different implementations are present; to avoid problems
    dnl use -Wl,-z,defs only for those platform not behaving this way.
    *-freebsd* | *-openbsd*) ;;
    *)
      dnl First of all check for the --no-undefined variant of GNU ld. This allows
      dnl for a much more readable commandline, so that people can understand what
      dnl it does without going to look for what the heck -z defs does.
      dnl Fix: the loop previously executed `break` unconditionally after the
      dnl first iteration, so the -Wl,-z,defs fallback was never tried. The
      dnl break now runs only in the success action of CC_CHECK_LDFLAGS.
      for possible_flags in "-Wl,--no-undefined" "-Wl,-z,defs"; do
        CC_CHECK_LDFLAGS([$possible_flags],
          [LDFLAGS_NOUNDEFINED="$possible_flags"; break])
      done
      ;;
  esac

  AC_SUBST([LDFLAGS_NOUNDEFINED])
])
+
dnl Check for a -Werror flag or equivalent. -Werror is the GCC
dnl and ICC flag that tells the compiler to treat all the warnings
dnl as fatal. We usually need this option to make sure that some
dnl constructs (like attributes) are not simply ignored.
dnl
dnl Other compilers don't support -Werror per se, but they support
dnl an equivalent flag:
dnl - Sun Studio compiler supports -errwarn=%all
dnl On success the flag itself is stored in $cc_cv_werror, which the
dnl attribute checks below add to CFLAGS while compiling their probes.
AC_DEFUN([CC_CHECK_WERROR], [
  AC_CACHE_CHECK(
    [for $CC way to treat warnings as errors],
    [cc_cv_werror],
    [CC_CHECK_CFLAGS_SILENT([-Werror], [cc_cv_werror=-Werror],
      [CC_CHECK_CFLAGS_SILENT([-errwarn=%all], [cc_cv_werror=-errwarn=%all])])
  ])
])
+
dnl CC_CHECK_ATTRIBUTE([NAME], [PRETTY-NAME], [TEST-PROGRAM], [IF-YES], [IF-NO])
dnl Compile TEST-PROGRAM with warnings treated as errors (so an ignored
dnl attribute fails the probe); on success define SUPPORT_ATTRIBUTE_<NAME>
dnl and execute IF-YES, otherwise IF-NO.
AC_DEFUN([CC_CHECK_ATTRIBUTE], [
  AC_REQUIRE([CC_CHECK_WERROR])
  AC_CACHE_CHECK([if $CC supports __attribute__(( ifelse([$2], , [$1], [$2]) ))],
    AS_TR_SH([cc_cv_attribute_$1]),
    [ac_save_CFLAGS="$CFLAGS"
     CFLAGS="$CFLAGS $cc_cv_werror"
     AC_COMPILE_IFELSE([AC_LANG_SOURCE([$3])],
       [eval "AS_TR_SH([cc_cv_attribute_$1])='yes'"],
       [eval "AS_TR_SH([cc_cv_attribute_$1])='no'"])
     CFLAGS="$ac_save_CFLAGS"
    ])

  AS_IF([eval test x$]AS_TR_SH([cc_cv_attribute_$1])[ = xyes],
    [AC_DEFINE(
       AS_TR_CPP([SUPPORT_ATTRIBUTE_$1]), 1,
       [Define this if the compiler supports __attribute__(( ifelse([$2], , [$1], [$2]) ))]
     )
     $4],
    [$5])
])
+
dnl Convenience wrappers around CC_CHECK_ATTRIBUTE, one per GCC-style
dnl __attribute__ of interest. Each takes [action-if-found] and
dnl [action-if-not-found]; CC_ATTRIBUTE_VISIBILITY additionally takes the
dnl visibility name ("default", "hidden", ...) as its first argument.

AC_DEFUN([CC_ATTRIBUTE_CONSTRUCTOR], [
  CC_CHECK_ATTRIBUTE(
    [constructor],,
    [extern void foo();
     void __attribute__((constructor)) ctor() { foo(); }],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_DESTRUCTOR], [
  CC_CHECK_ATTRIBUTE(
    [destructor],,
    [extern void foo();
     void __attribute__((destructor)) dtor() { foo(); }],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_FORMAT], [
  CC_CHECK_ATTRIBUTE(
    [format], [format(printf, n, n)],
    [void __attribute__((format(printf, 1, 2))) printflike(const char *fmt, ...) { fmt = (void *)0; }],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_FORMAT_ARG], [
  CC_CHECK_ATTRIBUTE(
    [format_arg], [format_arg(printf)],
    [char *__attribute__((format_arg(1))) gettextlike(const char *fmt) { fmt = (void *)0; }],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_VISIBILITY], [
  CC_CHECK_ATTRIBUTE(
    [visibility_$1], [visibility("$1")],
    [void __attribute__((visibility("$1"))) $1_function() { }],
    [$2], [$3])
])

AC_DEFUN([CC_ATTRIBUTE_NONNULL], [
  CC_CHECK_ATTRIBUTE(
    [nonnull], [nonnull()],
    [void __attribute__((nonnull())) some_function(void *foo, void *bar) { foo = (void*)0; bar = (void*)0; }],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_UNUSED], [
  CC_CHECK_ATTRIBUTE(
    [unused], ,
    [void some_function(void *foo, __attribute__((unused)) void *bar);],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_SENTINEL], [
  CC_CHECK_ATTRIBUTE(
    [sentinel], ,
    [void some_function(void *foo, ...) __attribute__((sentinel));],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_DEPRECATED], [
  CC_CHECK_ATTRIBUTE(
    [deprecated], ,
    [void some_function(void *foo, ...) __attribute__((deprecated));],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_ALIAS], [
  CC_CHECK_ATTRIBUTE(
    [alias], [weak, alias],
    [void other_function(void *foo) { }
     void some_function(void *foo) __attribute__((weak, alias("other_function")));],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_MALLOC], [
  CC_CHECK_ATTRIBUTE(
    [malloc], ,
    [void * __attribute__((malloc)) my_alloc(int n);],
    [$1], [$2])
])

dnl The packed probe also verifies the attribute actually removes padding
dnl via a negative-size-array compile-time assertion.
AC_DEFUN([CC_ATTRIBUTE_PACKED], [
  CC_CHECK_ATTRIBUTE(
    [packed], ,
    [struct astructure { char a; int b; long c; void *d; } __attribute__((packed));
     char assert@<:@(sizeof(struct astructure) == (sizeof(char)+sizeof(int)+sizeof(long)+sizeof(void*)))-1@:>@;],
    [$1], [$2])
])

AC_DEFUN([CC_ATTRIBUTE_CONST], [
  CC_CHECK_ATTRIBUTE(
    [const], ,
    [int __attribute__((const)) twopow(int n) { return 1 << n; } ],
    [$1], [$2])
])
+
dnl CC_FLAG_VISIBILITY([action-if-found], [action-if-not-found])
dnl Check whether the compiler accepts -fvisibility=hidden; if so, define
dnl SUPPORT_FLAG_VISIBILITY. Probed with warnings-as-errors so silently
dnl ignored flags count as unsupported.
AC_DEFUN([CC_FLAG_VISIBILITY], [
  AC_REQUIRE([CC_CHECK_WERROR])
  AC_CACHE_CHECK([if $CC supports -fvisibility=hidden],
    [cc_cv_flag_visibility],
    [cc_flag_visibility_save_CFLAGS="$CFLAGS"
     CFLAGS="$CFLAGS $cc_cv_werror"
     CC_CHECK_CFLAGS_SILENT([-fvisibility=hidden],
       cc_cv_flag_visibility='yes',
       cc_cv_flag_visibility='no')
     CFLAGS="$cc_flag_visibility_save_CFLAGS"])

  AS_IF([test "x$cc_cv_flag_visibility" = "xyes"],
    [AC_DEFINE([SUPPORT_FLAG_VISIBILITY], 1,
       [Define this if the compiler supports the -fvisibility flag])
     $1],
    [$2])
])
+
dnl CC_FUNC_EXPECT([action-if-found], [action-if-not-found])
dnl Check whether the compiler provides the __builtin_expect() branch
dnl prediction hint; if so, define SUPPORT__BUILTIN_EXPECT.
AC_DEFUN([CC_FUNC_EXPECT], [
  AC_REQUIRE([CC_CHECK_WERROR])
  AC_CACHE_CHECK([if compiler has __builtin_expect function],
    [cc_cv_func_expect],
    [ac_save_CFLAGS="$CFLAGS"
     CFLAGS="$CFLAGS $cc_cv_werror"
     AC_COMPILE_IFELSE([AC_LANG_SOURCE(
       [int some_function() {
        int a = 3;
        return (int)__builtin_expect(a, 3);
       }])],
       [cc_cv_func_expect=yes],
       [cc_cv_func_expect=no])
     CFLAGS="$ac_save_CFLAGS"
    ])

  AS_IF([test "x$cc_cv_func_expect" = "xyes"],
    [AC_DEFINE([SUPPORT__BUILTIN_EXPECT], 1,
       [Define this if the compiler supports __builtin_expect() function])
     $1],
    [$2])
])
+
dnl CC_ATTRIBUTE_ALIGNED
dnl Find the highest supported __attribute__((aligned(n))) by trying
dnl 64 down to 2; define ATTRIBUTE_ALIGNED_MAX to the first that compiles.
dnl If none compiles the cache variable stays empty and nothing is defined.
AC_DEFUN([CC_ATTRIBUTE_ALIGNED], [
  AC_REQUIRE([CC_CHECK_WERROR])
  AC_CACHE_CHECK([highest __attribute__ ((aligned ())) supported],
    [cc_cv_attribute_aligned],
    [ac_save_CFLAGS="$CFLAGS"
     CFLAGS="$CFLAGS $cc_cv_werror"
     for cc_attribute_align_try in 64 32 16 8 4 2; do
       AC_COMPILE_IFELSE([AC_LANG_SOURCE([
         int main() {
           static char c __attribute__ ((aligned($cc_attribute_align_try))) = 0;
           return c;
         }])], [cc_cv_attribute_aligned=$cc_attribute_align_try; break])
     done
     CFLAGS="$ac_save_CFLAGS"
    ])

  if test "x$cc_cv_attribute_aligned" != "x"; then
    AC_DEFINE_UNQUOTED([ATTRIBUTE_ALIGNED_MAX], [$cc_cv_attribute_aligned],
      [Define the highest alignment supported])
  fi
])
diff --git a/dnn/nnet.c b/dnn/nnet.c
new file mode 100644
index 00000000..580496ee
--- /dev/null
+++ b/dnn/nnet.c
@@ -0,0 +1,503 @@
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "opus_types.h"
+#include "arch.h"
+#include "common.h"
+#include "tansig_table.h"
+#include "nnet.h"
+#include "nnet_data.h"
+#include "dred_rdovae_constants.h"
+#include "plc_data.h"
+
+#ifdef NO_OPTIMIZATIONS
+#if defined(_MSC_VER)
+#pragma message ("Compiling without any vectorization. This code will be very slow")
+#else
+#warning Compiling without any vectorization. This code will be very slow
+#endif
+#endif
+
+
+#define SOFTMAX_HACK
+
+#define MAX_ACTIVATIONS (4096)
+
+static OPUS_INLINE void vec_swish(float *y, const float *x, int N)
+{
+ int i;
+ float tmp[MAX_ACTIVATIONS];
+ celt_assert(N <= MAX_ACTIVATIONS);
+ vec_sigmoid(tmp, x, N);
+ for (i=0;i<N;i++)
+ y[i] = x[i]*tmp[i];
+}
+
+static OPUS_INLINE float relu(float x)
+{
+ return x < 0 ? 0 : x;
+}
+
+
/* Accumulating matrix-vector product: out += weights * x.
   weights is stored column-major with the given column stride. The
   vectorized kernel is used whenever the row count is a multiple of 16;
   otherwise a scalar fallback runs. */
static void sgemv_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
   int i, j;
   if (rows % 16 != 0)
   {
      /* Scalar fallback: per output row, additions happen in column order,
         matching the summation order of the vectorized path per element. */
      for (j=0;j<cols;j++)
      {
         const float xj = x[j];
         for (i=0;i<rows;i++)
            out[i] += weights[j*col_stride + i]*xj;
      }
      return;
   }
   sgemv_accum16(out, weights, rows, cols, col_stride, x);
}
+
/* Apply an element-wise activation function to a length-N vector.
   output and input may alias (several callers pass the same buffer). */
void compute_activation(float *output, const float *input, int N, int activation)
{
   int i;
   if (activation == ACTIVATION_SIGMOID) {
      vec_sigmoid(output, input, N);
   } else if (activation == ACTIVATION_TANH) {
      vec_tanh(output, input, N);
   } else if (activation == ACTIVATION_SWISH) {
      vec_swish(output, input, N);
   } else if (activation == ACTIVATION_RELU) {
      for (i=0;i<N;i++)
         output[i] = relu(input[i]);
   } else if (activation == ACTIVATION_SOFTMAX) {
#ifdef SOFTMAX_HACK
      /* "Softmax hack": leave the values as raw logits (plain copy).
         Downstream sampling works directly in the logit domain, so the
         exp/normalize steps are skipped entirely. */
      RNN_COPY(output, input, N);
      /*for (i=0;i<N;i++)
         output[i] = input[i];*/
#else
      /* Full softmax: softmax() presumably exponentiates element-wise
         (see vec.h -- confirm), then we normalize by the sum. */
      float sum = 0;
      softmax(output, input, N);
      for (i=0;i<N;i++) {
         sum += output[i];
      }
      sum = 1.f/(sum+1e-30);  /* epsilon guards against division by zero */
      for (i=0;i<N;i++)
         output[i] = sum*output[i];
#endif
   } else {
      /* Any other id must be linear: identity copy. */
      celt_assert(activation == ACTIVATION_LINEAR);
      for (i=0;i<N;i++)
         output[i] = input[i];
   }
}
+
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input)
+{
+ int i;
+ int N, M;
+ int stride;
+ M = layer->nb_inputs;
+ N = layer->nb_neurons;
+ stride = N;
+ celt_assert(input != output);
+ for (i=0;i<N;i++)
+ output[i] = layer->bias[i];
+ sgemv_accum(output, layer->input_weights, N, M, stride, input);
+ compute_activation(output, output, N, layer->activation);
+}
+
+void compute_mdense(const MDenseLayer *layer, float *output, const float *input)
+{
+ int i, c;
+ int N, M, C;
+ int stride;
+ float tmp[MAX_MDENSE_TMP];
+ celt_assert(input != output);
+ M = layer->nb_inputs;
+ N = layer->nb_neurons;
+ C = layer->nb_channels;
+ celt_assert(N*C <= MAX_MDENSE_TMP);
+ stride = N*C;
+ for (i=0;i<N*C;i++)
+ tmp[i] = layer->bias[i];
+ sgemv_accum(tmp, layer->input_weights, N*C, M, stride, input);
+ compute_activation(tmp, tmp, N*C, ACTIVATION_TANH);
+ for (i=0;i<N;i++)
+ output[i] = 0;
+ for (c=0;c<C;c++)
+ {
+ for (i=0;i<N;i++)
+ output[i] += tmp[c*N + i]*layer->factor[c*N + i];
+ }
+ compute_activation(output, output, N, layer->activation);
+}
+
/* Sample an 8-bit value from a dual-FC MDense output layer without
   evaluating all 256 logits: the result is built one bit at a time,
   computing only the logit of each node visited in the implicit binary
   tree (hierarchical sampling). sampling_logit_table maps a uniform
   random byte to a logit-domain threshold so no sigmoid is needed;
   rng is the KISS99 generator supplying random bits.
   Returns the sampled value in [0, 255]. */
int sample_mdense(const MDenseLayer *layer, const float *input, const float *sampling_logit_table, kiss99_ctx *rng)
{
   int b, j, N, M, C, stride;
   int val=0;
   float thresholds[8];   /* one logit-domain threshold per output bit */
   M = layer->nb_inputs;
   N = layer->nb_neurons;
   C = layer->nb_channels;
   celt_assert(N*C <= MAX_MDENSE_TMP);
   /* NOTE(review): the sums below read exactly two channel blocks of M
      weights per row (offsets j and j+M), so this path assumes
      nb_channels == 2 -- confirm against the trained model layout. */
   stride = M*C;

   celt_assert(N <= DUAL_FC_OUT_SIZE);

   /* Computing all the random thresholds in advance. These thresholds are directly
      based on the logit to avoid computing the sigmoid. Each 32-bit random
      value yields four 8-bit table indices. */
   for (b=0;b<8;b+=4) {
       uint32_t r = kiss99_rand(rng);
       thresholds[b] = sampling_logit_table[r&0xFF];
       thresholds[b+1] = sampling_logit_table[(r>>8)&0xFF];
       thresholds[b+2] = sampling_logit_table[(r>>16)&0xFF];
       thresholds[b+3] = sampling_logit_table[(r>>24)&0xFF];
   }

   for (b=0;b<8;b++)
   {
      int bit;
      int i;
      float sum1, sum2;

      /* Heap-style index of the tree node selected by the bits so far. */
      i = (1<<b) | val;

      /* Dual fully-connected logit: factor-weighted tanh of two channels. */
      sum1 = layer->bias[i];
      sum2 = layer->bias[i + N];
      for (j=0;j<M;j++) {
         sum1 += layer->input_weights[i*stride + j]*input[j];
         sum2 += layer->input_weights[i*stride + j + M]*input[j];
      }
      sum1 = layer->factor[i]*tanh_approx(sum1);
      sum2 = layer->factor[N + i]*tanh_approx(sum2);
      sum1 += sum2;
      /*sum1 = 1.f/(1 + exp(-sum1));*/
#if 1 /* Sample the decision based on the logit. */
      bit = thresholds[b] < sum1;
#else
      /* Disabled reference path: sigmoid + clamped uniform sampling. */
      sum1 = sigmoid_approx(sum1);
      bit = .025+.95*((rand()+.5f)/(RAND_MAX+1.f)) < sum1;
#endif
      val = (val << 1) | bit;
   }
   return val;

}
+
+
#if 0
/* Reference (unquantized, non-vectorized) GRU step supporting both the
   reset-before and reset-after formulations. Compiled out; kept as
   readable documentation for the quantized variants below.
   state is updated in place; input must not alias state. */
void compute_gru(const GRULayer *gru, float *state, const float *input)
{
   int i;
   int N, M;
   int stride;
   float tmp[MAX_RNN_NEURONS];
   float z[MAX_RNN_NEURONS];    /* update gate */
   float r[MAX_RNN_NEURONS];    /* reset gate */
   float h[MAX_RNN_NEURONS];    /* candidate state */
   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
   celt_assert(input != state);
   M = gru->nb_inputs;
   N = gru->nb_neurons;
   stride = 3*N;
   /* Compute update gate: z = sigmoid(b_z [+ b_rec_z] + W_z*x + U_z*h). */
   for (i=0;i<N;i++)
      z[i] = gru->bias[i];
   if (gru->reset_after)
   {
      for (i=0;i<N;i++)
         z[i] += gru->bias[3*N + i];
   }
   sgemv_accum(z, gru->input_weights, N, M, stride, input);
   sgemv_accum(z, gru->recurrent_weights, N, N, stride, state);
   compute_activation(z, z, N, ACTIVATION_SIGMOID);

   /* Compute reset gate: r = sigmoid(b_r [+ b_rec_r] + W_r*x + U_r*h). */
   for (i=0;i<N;i++)
      r[i] = gru->bias[N + i];
   if (gru->reset_after)
   {
      for (i=0;i<N;i++)
         r[i] += gru->bias[4*N + i];
   }
   sgemv_accum(r, &gru->input_weights[N], N, M, stride, input);
   sgemv_accum(r, &gru->recurrent_weights[N], N, N, stride, state);
   compute_activation(r, r, N, ACTIVATION_SIGMOID);

   /* Compute candidate output. reset_after gates the recurrent product by
      r; reset_before gates the state itself before the product. */
   for (i=0;i<N;i++)
      h[i] = gru->bias[2*N + i];
   if (gru->reset_after)
   {
      for (i=0;i<N;i++)
         tmp[i] = gru->bias[5*N + i];
      sgemv_accum(tmp, &gru->recurrent_weights[2*N], N, N, stride, state);
      for (i=0;i<N;i++)
         h[i] += tmp[i] * r[i];
      sgemv_accum(h, &gru->input_weights[2*N], N, M, stride, input);
   } else {
      for (i=0;i<N;i++)
         tmp[i] = state[i] * r[i];
      sgemv_accum(h, &gru->input_weights[2*N], N, M, stride, input);
      sgemv_accum(h, &gru->recurrent_weights[2*N], N, N, stride, tmp);
   }
   compute_activation(h, h, N, gru->activation);
   /* Blend old state and candidate with the update gate. */
   for (i=0;i<N;i++)
      h[i] = z[i]*state[i] + (1-z[i])*h[i];
   for (i=0;i<N;i++)
      state[i] = h[i];
}
#endif
+
/* GRU step using the quantized 8x4 GEMV kernels (reset-after variant only):
     state <- z*state + (1-z)*act(in_h + r*(U_h*state + b_rec_h)).
   state is updated in place; input must not alias state. */
void compute_gru2(const GRULayer *gru, float *state, const float *input)
{
   int i;
   int N, M;
   int stride;
   float zrh[3*MAX_RNN_NEURONS];    /* packed [z | r | h] pre-activations */
   float recur[3*MAX_RNN_NEURONS];  /* recurrent contribution, all gates */
   float *z;
   float *r;
   float *h;
   M = gru->nb_inputs;
   N = gru->nb_neurons;
   z = zrh;
   r = &zrh[N];
   h = &zrh[2*N];
   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
   celt_assert(input != state);
   celt_assert(gru->reset_after);
   stride = 3*N;
   /* Input contribution: bias + W*x for all three gates. With USE_SU_BIAS
      the bias is the one pre-adjusted for the signed-unsigned trick used
      by the quantized GEMV kernel. */
#ifdef USE_SU_BIAS
   for (i=0;i<3*N;i++)
      zrh[i] = gru->subias[i];
#else
   for (i=0;i<3*N;i++)
      zrh[i] = gru->bias[i];
#endif
   sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, stride, input);
   /* Recurrent contribution: b_rec + U*state for all three gates. */
   for (i=0;i<3*N;i++)
      recur[i] = gru->bias[3*N + i];
   sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, N, stride, state);
   /* z and r gates sum both contributions, then sigmoid. */
   for (i=0;i<2*N;i++)
      zrh[i] += recur[i];
   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
   /* reset-after: candidate adds the r-gated recurrent term. */
   for (i=0;i<N;i++)
      h[i] += recur[2*N+i]*r[i];
   compute_activation(h, h, N, gru->activation);
   /* Blend old state and candidate with the update gate. */
   for (i=0;i<N;i++)
      h[i] = z[i]*state[i] + (1-z[i])*h[i];
   for (i=0;i<N;i++)
      state[i] = h[i];
}
+
+#define MAX_RNN_NEURONS_ALL IMAX(IMAX(MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
+
/* GRU B step: same reset-after math as compute_gru2, with two differences:
   the input weights are sparse (indexed by gru->input_weights_idx) and a
   caller-precomputed conditioning vector gru_b_condition (length 3*N) is
   folded directly into the gate biases. state is updated in place. */
void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input)
{
   int i;
   int N, M;
   int stride;
   float zrh[3*MAX_RNN_NEURONS_ALL];    /* packed [z | r | h] pre-activations */
   float recur[3*MAX_RNN_NEURONS_ALL];  /* recurrent contribution, all gates */
   float *z;
   float *r;
   float *h;
   M = gru->nb_inputs;
   N = gru->nb_neurons;
   z = zrh;
   r = &zrh[N];
   h = &zrh[2*N];
   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS_ALL);
   celt_assert(input != state);
   celt_assert(gru->reset_after);
   stride = 3*N;
   /* Input contribution: (su)bias + conditioning + sparse W*x. */
#ifdef USE_SU_BIAS
   for (i=0;i<3*N;i++)
      zrh[i] = gru->subias[i] + gru_b_condition[i];
#else
   for (i=0;i<3*N;i++)
      zrh[i] = gru->bias[i] + gru_b_condition[i];
#endif
   sparse_sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, gru->input_weights_idx, input);
   /* Recurrent contribution: recurrent (su)bias + U*state. */
#ifdef USE_SU_BIAS
   for (i=0;i<3*N;i++)
      recur[i] = gru->subias[3*N + i];
#else
   for (i=0;i<3*N;i++)
      recur[i] = gru->bias[3*N + i];
#endif
   sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, N, stride, state);
   /* z and r gates sum both contributions, then sigmoid. */
   for (i=0;i<2*N;i++)
      zrh[i] += recur[i];
   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
   /* reset-after: candidate adds the r-gated recurrent term. */
   for (i=0;i<N;i++)
      h[i] += recur[2*N+i]*r[i];
   compute_activation(h, h, N, gru->activation);
   /* Blend old state and candidate with the update gate. */
   for (i=0;i<N;i++)
      h[i] = z[i]*state[i] + (1-z[i])*h[i];
   for (i=0;i<N;i++)
      state[i] = h[i];
}
+
+
/* GRU step whose input-matrix product is precomputed by the caller:
   `input` already holds the 3*N gate pre-activations (input contribution,
   e.g. assembled by compute_gru_a_input), so only the recurrent half is
   evaluated here. Reset-after variant; state is updated in place. */
void compute_gru3(const GRULayer *gru, float *state, const float *input)
{
   int i;
   int N;
   int stride;
   float zrh[3*MAX_RNN_NEURONS_ALL];    /* packed [z | r | h] pre-activations */
   float recur[3*MAX_RNN_NEURONS_ALL];  /* recurrent contribution, all gates */
   float *z;
   float *r;
   float *h;
   N = gru->nb_neurons;
   z = zrh;
   r = &zrh[N];
   h = &zrh[2*N];
   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS_ALL);
   celt_assert(input != state);
   celt_assert(gru->reset_after);
   stride = 3*N;
   /* Input contribution arrives fully formed. */
   RNN_COPY(zrh, input, 3*N);
   /* Recurrent contribution: b_rec + U*state. */
   for (i=0;i<3*N;i++)
      recur[i] = gru->bias[3*N + i];
   sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, N, stride, state);
   for (i=0;i<2*N;i++)
      zrh[i] += recur[i];
   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
   /* reset-after: candidate adds the r-gated recurrent term. */
   for (i=0;i<N;i++)
      h[i] += recur[2*N+i]*r[i];
   compute_activation(h, h, N, gru->activation);
   /* Blend old state and candidate with the update gate. */
   for (i=0;i<N;i++)
      h[i] = z[i]*state[i] + (1-z[i])*h[i];
   for (i=0;i<N;i++)
      state[i] = h[i];
}
+
/* The input of this GRU is after the input matrix multiply. */
/* GRU step with sparse recurrent weights plus separate per-gate diagonal
   terms. `input` holds the precomputed 3*N input contribution. Note the
   z/r/h gate views alias the single recur buffer. state is updated
   in place; reset-after variant only. */
void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input)
{
   int i, k;
   int N;
   float recur[3*MAX_RNN_NEURONS_ALL];
   float *z;
   float *r;
   float *h;
   const float *bias;
   N = gru->nb_neurons;
   z = recur;
   r = &recur[N];
   h = &recur[2*N];
   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS_ALL);
   celt_assert(input != state);
   celt_assert(gru->reset_after);
#ifdef USE_SU_BIAS
   bias = &gru->subias[3*N];
#else
   bias = &gru->bias[3*N];
#endif
   /* z and r gates: recurrent bias + diagonal term + input contribution. */
   for (k=0;k<2;k++)
   {
      for (i=0;i<N;i++)
         recur[k*N + i] = bias[k*N + i] + gru->diag_weights[k*N + i]*state[i] + input[k*N + i];
   }
   /* h gate: its input contribution is added later, after r-gating. */
   for (;k<3;k++)
   {
      for (i=0;i<N;i++)
         recur[k*N + i] = bias[k*N + i] + gru->diag_weights[k*N + i]*state[i];
   }
   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, N, gru->idx, state);
   compute_activation(recur, recur, 2*N, ACTIVATION_SIGMOID);
   /* reset-after: gate the recurrent candidate by r, then add input term. */
   for (i=0;i<N;i++)
      h[i] = h[i]*r[i] + input[2*N+i];
   compute_activation(h, h, N, gru->activation);
   /* Blend old state and candidate with the update gate. */
   for (i=0;i<N;i++)
      state[i] = z[i]*state[i] + (1-z[i])*h[i];
}
+
+#define MAX_CONV_INPUTS_ALL IMAX(MAX_CONV_INPUTS, DRED_MAX_CONV_INPUTS)
+
+void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input)
+{
+ int i;
+ int N, M;
+ int stride;
+ float tmp[MAX_CONV_INPUTS_ALL];
+ celt_assert(input != output);
+ celt_assert(layer->nb_inputs*layer->kernel_size <= MAX_CONV_INPUTS_ALL);
+ RNN_COPY(tmp, mem, layer->nb_inputs*(layer->kernel_size-1));
+ RNN_COPY(&tmp[layer->nb_inputs*(layer->kernel_size-1)], input, layer->nb_inputs);
+ M = layer->nb_inputs*layer->kernel_size;
+ N = layer->nb_neurons;
+ stride = N;
+ for (i=0;i<N;i++)
+ output[i] = layer->bias[i];
+ sgemv_accum(output, layer->input_weights, N, M, stride, tmp);
+ compute_activation(output, output, N, layer->activation);
+ RNN_COPY(mem, &tmp[layer->nb_inputs], layer->nb_inputs*(layer->kernel_size-1));
+}
+
+void compute_embedding(const EmbeddingLayer *layer, float *output, int input)
+{
+ int i;
+ celt_assert(input >= 0);
+ celt_assert(input < layer->nb_inputs);
+ /*if (layer->dim == 64) printf("%d\n", input);*/
+ for (i=0;i<layer->dim;i++)
+ {
+ output[i] = layer->embedding_weights[input*layer->dim + i];
+ }
+}
+
+void compute_gru_a_input(float *output, const float *input, int N, const EmbeddingLayer *layer1, int val1, const EmbeddingLayer *layer2, int val2, const EmbeddingLayer *layer3, int val3) {
+ int i;
+ for (i=0;i<3*N;i++) {
+ output[i] = input[i] + layer1->embedding_weights[val1*layer1->dim + i]
+ + layer2->embedding_weights[val2*layer2->dim + i]
+ + layer3->embedding_weights[val3*layer3->dim + i];
+ }
+}
+
+void accum_embedding(const EmbeddingLayer *layer, float *output, int input)
+{
+ int i;
+ celt_assert(input >= 0);
+ celt_assert(input < layer->nb_inputs);
+ /*if (layer->dim == 64) printf("%d\n", input);*/
+ for (i=0;i<layer->dim;i++)
+ {
+ output[i] += layer->embedding_weights[input*layer->dim + i];
+ }
+}
diff --git a/dnn/nnet.h b/dnn/nnet.h
new file mode 100644
index 00000000..136de559
--- /dev/null
+++ b/dnn/nnet.h
@@ -0,0 +1,205 @@
+/* Copyright (c) 2018 Mozilla
+ Copyright (c) 2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
/* nnet.h: layer descriptors and inference entry points for the LPCNet
   neural network code (see nnet.c). */
#ifndef _NNET_H_
#define _NNET_H_

#include "vec.h"
#include "kiss99.h"

/* Activation identifiers accepted by compute_activation(). */
#define ACTIVATION_LINEAR 0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_TANH 2
#define ACTIVATION_RELU 3
#define ACTIVATION_SOFTMAX 4
#define ACTIVATION_SWISH 5

/* Serialized weight-blob format. */
#define WEIGHT_BLOB_VERSION 0
#define WEIGHT_BLOCK_SIZE 64
/* One named weight array; the *_init() functions below look layers up
   in arrays of these. */
typedef struct {
  const char *name;
  int type;          /* one of the WEIGHT_TYPE_* values */
  int size;          /* size of data in bytes */
  const void *data;
} WeightArray;

#define WEIGHT_TYPE_float 0
#define WEIGHT_TYPE_int 1
#define WEIGHT_TYPE_qweight 2

/* Header preceding each array in a serialized weight blob. */
typedef struct {
  char head[4];
  int version;       /* expected: WEIGHT_BLOB_VERSION */
  int type;          /* one of the WEIGHT_TYPE_* values */
  int size;          /* payload size in bytes */
  int block_size;    /* NOTE(review): presumably padded to WEIGHT_BLOCK_SIZE -- confirm in parser */
  char name[44];
} WeightHead;


/* Plain fully-connected layer (see _lpcnet_compute_dense). */
typedef struct {
  const float *bias;
  const float *input_weights;
  int nb_inputs;
  int nb_neurons;
  int activation;
} DenseLayer;

/* Multi-channel dense layer (see compute_mdense / sample_mdense). */
typedef struct {
  const float *bias;
  const float *input_weights;
  const float *factor;   /* per-channel mixing weights */
  int nb_inputs;
  int nb_neurons;
  int nb_channels;
  int activation;
} MDenseLayer;

/* GRU with quantized weights; input weights may be sparse, in which case
   input_weights_idx holds the sparsity pattern (see compute_gruB). */
typedef struct {
  const float *bias;
  const float *subias;   /* bias pre-adjusted for the signed-unsigned GEMV trick (USE_SU_BIAS) */
  const qweight *input_weights;
  const int *input_weights_idx;
  const qweight *recurrent_weights;
  int nb_inputs;
  int nb_neurons;
  int activation;
  int reset_after;       /* must be non-zero for compute_gru2/gruB/gru3 */
} GRULayer;

/* GRU with sparse recurrent weights plus separate per-gate diagonal terms
   (see compute_sparse_gru). */
typedef struct {
  const float *bias;
  const float *subias;
  const float *diag_weights;
  const qweight *recurrent_weights;
  const int *idx;        /* sparsity pattern of recurrent_weights */
  int nb_neurons;
  int activation;
  int reset_after;
} SparseGRULayer;

/* 1-D convolution layer; history is kept in a caller-provided mem buffer
   (see compute_conv1d). */
typedef struct {
  const float *bias;
  const float *input_weights;
  int nb_inputs;
  int kernel_size;
  int nb_neurons;
  int activation;
} Conv1DLayer;

/* Embedding table: nb_inputs rows of dim floats each. */
typedef struct {
  const float *embedding_weights;
  int nb_inputs;
  int dim;
} EmbeddingLayer;

/* Inference entry points; see nnet.c for per-function contracts. */
void compute_activation(float *output, const float *input, int N, int activation);

void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input);

void compute_mdense(const MDenseLayer *layer, float *output, const float *input);

int sample_mdense(const MDenseLayer *layer, const float *input, const float *sampling_logit_table, kiss99_ctx *rng);

void compute_gru(const GRULayer *gru, float *state, const float *input);

void compute_gru2(const GRULayer *gru, float *state, const float *input);

void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input);

void compute_gru3(const GRULayer *gru, float *state, const float *input);

void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input);

void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input);

void compute_embedding(const EmbeddingLayer *layer, float *output, int input);

void accum_embedding(const EmbeddingLayer *layer, float *output, int input);

void compute_gru_a_input(float *output, const float *input, int N, const EmbeddingLayer *layer1, int val1, const EmbeddingLayer *layer2, int val2, const EmbeddingLayer *layer3, int val3);

int sample_from_pdf(const float *pdf, int N, float exp_boost, float pdf_floor);


/* Built-in weight tables for the various models. */
extern const WeightArray lpcnet_arrays[];
extern const WeightArray lpcnet_plc_arrays[];
extern const WeightArray rdovae_enc_arrays[];
extern const WeightArray rdovae_dec_arrays[];

/* Layer initializers: look up the named arrays in `arrays` and fill in the
   layer struct. NOTE(review): presumably return 0 on success, non-zero if
   an array is missing or mis-sized -- confirm against the implementation. */
int mdense_init(MDenseLayer *layer, const WeightArray *arrays,
  const char *bias,
  const char *input_weights,
  const char *factor,
  int nb_inputs,
  int nb_neurons,
  int nb_channels,
  int activation);

int dense_init(DenseLayer *layer, const WeightArray *arrays,
  const char *bias,
  const char *input_weights,
  int nb_inputs,
  int nb_neurons,
  int activation);

int gru_init(GRULayer *layer, const WeightArray *arrays,
  const char *bias,
  const char *subias,
  const char *input_weights,
  const char *input_weights_idx,
  const char *recurrent_weights,
  int nb_inputs,
  int nb_neurons,
  int activation,
  int reset_after);

int sparse_gru_init(SparseGRULayer *layer, const WeightArray *arrays,
  const char *bias,
  const char *subias,
  const char *diag_weights,
  const char *recurrent_weights,
  const char *idx,
  int nb_neurons,
  int activation,
  int reset_after);

int conv1d_init(Conv1DLayer *layer, const WeightArray *arrays,
  const char *bias,
  const char *input_weights,
  int nb_inputs,
  int kernel_size,
  int nb_neurons,
  int activation);

int embedding_init(EmbeddingLayer *layer, const WeightArray *arrays,
  const char *embedding_weights,
  int nb_inputs,
  int dim);


#endif /* _NNET_H_ */
diff --git a/dnn/opus_types.h b/dnn/opus_types.h
new file mode 100644
index 00000000..71808266
--- /dev/null
+++ b/dnn/opus_types.h
@@ -0,0 +1,159 @@
+/* (C) COPYRIGHT 1994-2002 Xiph.Org Foundation */
+/* Modified by Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* opus_types.h based on ogg_types.h from libogg */
+
+/**
+ @file opus_types.h
+ @brief Opus reference implementation types
+*/
+#ifndef OPUS_TYPES_H
+#define OPUS_TYPES_H
+
+/* Use the real stdint.h if it's there (taken from Paul Hsieh's pstdint.h) */
+#if (defined(__STDC__) && __STDC__ && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_)) || defined (HAVE_STDINT_H))
+#include <stdint.h>
+
+ typedef int16_t opus_int16;
+ typedef uint16_t opus_uint16;
+ typedef int32_t opus_int32;
+ typedef uint32_t opus_uint32;
+#elif defined(_WIN32)
+
+# if defined(__CYGWIN__)
+# include <_G_config.h>
+ typedef _G_int32_t opus_int32;
+ typedef _G_uint32_t opus_uint32;
+ typedef _G_int16 opus_int16;
+ typedef _G_uint16 opus_uint16;
+# elif defined(__MINGW32__)
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+ typedef int opus_int32;
+ typedef unsigned int opus_uint32;
+# elif defined(__MWERKS__)
+ typedef int opus_int32;
+ typedef unsigned int opus_uint32;
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+# else
+ /* MSVC/Borland */
+ typedef __int32 opus_int32;
+ typedef unsigned __int32 opus_uint32;
+ typedef __int16 opus_int16;
+ typedef unsigned __int16 opus_uint16;
+# endif
+
+#elif defined(__MACOS__)
+
+# include <sys/types.h>
+ typedef SInt16 opus_int16;
+ typedef UInt16 opus_uint16;
+ typedef SInt32 opus_int32;
+ typedef UInt32 opus_uint32;
+
+#elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */
+
+# include <sys/types.h>
+ typedef int16_t opus_int16;
+ typedef u_int16_t opus_uint16;
+ typedef int32_t opus_int32;
+ typedef u_int32_t opus_uint32;
+
+#elif defined(__BEOS__)
+
+ /* Be */
+# include <inttypes.h>
+ typedef int16 opus_int16;
+ typedef u_int16 opus_uint16;
+ typedef int32_t opus_int32;
+ typedef u_int32_t opus_uint32;
+
+#elif defined (__EMX__)
+
+ /* OS/2 GCC */
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+ typedef int opus_int32;
+ typedef unsigned int opus_uint32;
+
+#elif defined (DJGPP)
+
+ /* DJGPP */
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+ typedef int opus_int32;
+ typedef unsigned int opus_uint32;
+
+#elif defined(R5900)
+
+ /* PS2 EE */
+ typedef int opus_int32;
+ typedef unsigned opus_uint32;
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+
+#elif defined(__SYMBIAN32__)
+
+ /* Symbian GCC */
+ typedef signed short opus_int16;
+ typedef unsigned short opus_uint16;
+ typedef signed int opus_int32;
+ typedef unsigned int opus_uint32;
+
+#elif defined(CONFIG_TI_C54X) || defined (CONFIG_TI_C55X)
+
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+ typedef long opus_int32;
+ typedef unsigned long opus_uint32;
+
+#elif defined(CONFIG_TI_C6X)
+
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+ typedef int opus_int32;
+ typedef unsigned int opus_uint32;
+
+#else
+
+ /* Give up, take a reasonable guess */
+ typedef short opus_int16;
+ typedef unsigned short opus_uint16;
+ typedef int opus_int32;
+ typedef unsigned int opus_uint32;
+
+#endif
+
+#define opus_int int /* used for counters etc; at least 16 bits */
+#define opus_int64 long long
+#define opus_int8 signed char
+
+#define opus_uint unsigned int /* used for counters etc; at least 16 bits */
+#define opus_uint64 unsigned long long
+#define opus_uint8 unsigned char
+
+#endif /* OPUS_TYPES_H */
diff --git a/dnn/parse_lpcnet_weights.c b/dnn/parse_lpcnet_weights.c
new file mode 100644
index 00000000..c830a13c
--- /dev/null
+++ b/dnn/parse_lpcnet_weights.c
@@ -0,0 +1,257 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string.h>
+#include "nnet.h"
+
+#define SPARSE_BLOCK_SIZE 32
+
+int parse_record(const unsigned char **data, int *len, WeightArray *array) {
+ WeightHead *h = (WeightHead *)*data;
+ if (*len < WEIGHT_BLOCK_SIZE) return -1;
+ if (h->block_size < h->size) return -1;
+ if (h->block_size > *len-WEIGHT_BLOCK_SIZE) return -1;
+ if (h->name[sizeof(h->name)-1] != 0) return -1;
+ if (h->size < 0) return -1;
+ array->name = h->name;
+ array->type = h->type;
+ array->size = h->size;
+ array->data = (*data)+WEIGHT_BLOCK_SIZE;
+
+ *data += h->block_size+WEIGHT_BLOCK_SIZE;
+ *len -= h->block_size+WEIGHT_BLOCK_SIZE;
+ return array->size;
+}
+
+int parse_weights(WeightArray **list, const unsigned char *data, int len)
+{
+ int nb_arrays=0;
+ int capacity=20;
+ *list = malloc(capacity*sizeof(WeightArray));
+ while (len > 0) {
+ int ret;
+ WeightArray array = {NULL, 0, 0, 0};
+ ret = parse_record(&data, &len, &array);
+ if (ret > 0) {
+ if (nb_arrays+1 >= capacity) {
+ /* Make sure there's room for the ending NULL element too. */
+ capacity = capacity*3/2;
+ *list = realloc(*list, capacity*sizeof(WeightArray));
+ }
+ (*list)[nb_arrays++] = array;
+ } else {
+ free(*list);
+ *list = NULL;
+ return -1;
+ }
+ }
+ (*list)[nb_arrays].name=NULL;
+ return nb_arrays;
+}
+
+static const void *find_array_entry(const WeightArray *arrays, const char *name) {
+ while (arrays->name && strcmp(arrays->name, name) != 0) arrays++;
+ return arrays;
+}
+
+static const void *find_array_check(const WeightArray *arrays, const char *name, int size) {
+ const WeightArray *a = find_array_entry(arrays, name);
+ if (a && a->size == size) return a->data;
+ else return NULL;
+}
+
+static const void *find_idx_check(const WeightArray *arrays, const char *name, int nb_in, int nb_out, int *total_blocks) {
+ int remain;
+ const int *idx;
+ const WeightArray *a = find_array_entry(arrays, name);
+ *total_blocks = 0;
+ if (a == NULL) return NULL;
+ idx = a->data;
+ remain = a->size/sizeof(int);
+ while (remain > 0) {
+ int nb_blocks;
+ int i;
+ nb_blocks = *idx++;
+ if (remain < nb_blocks+1) return NULL;
+ for (i=0;i<nb_blocks;i++) {
+ int pos = *idx++;
+ if (pos+3 >= nb_in || (pos&0x3)) return NULL;
+ }
+ nb_out -= 8;
+ remain -= nb_blocks+1;
+ *total_blocks += nb_blocks;
+ }
+ if (nb_out != 0) return NULL;
+ return a->data;
+}
+
+int mdense_init(MDenseLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ const char *factor,
+ int nb_inputs,
+ int nb_neurons,
+ int nb_channels,
+ int activation)
+{
+ if ((layer->bias = find_array_check(arrays, bias, nb_neurons*nb_channels*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, nb_inputs*nb_channels*nb_neurons*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ if ((layer->factor = find_array_check(arrays, factor, nb_channels*nb_neurons*sizeof(layer->factor[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->nb_neurons = nb_neurons;
+ layer->nb_channels = nb_channels;
+ layer->activation = activation;
+ return 0;
+}
+
+int dense_init(DenseLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ int nb_inputs,
+ int nb_neurons,
+ int activation)
+{
+ if ((layer->bias = find_array_check(arrays, bias, nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, nb_inputs*nb_neurons*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ return 0;
+}
+
+int gru_init(GRULayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *subias,
+ const char *input_weights,
+ const char *input_weights_idx,
+ const char *recurrent_weights,
+ int nb_inputs,
+ int nb_neurons,
+ int activation,
+ int reset_after)
+{
+ int total_blocks;
+ if ((layer->bias = find_array_check(arrays, bias, 6*nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->subias = find_array_check(arrays, subias, 6*nb_neurons*sizeof(layer->subias[0]))) == NULL) return 1;
+ if ((layer->input_weights_idx = find_idx_check(arrays, input_weights_idx, nb_inputs, 3*nb_neurons, &total_blocks)) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, SPARSE_BLOCK_SIZE*total_blocks*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ if ((layer->recurrent_weights = find_array_check(arrays, recurrent_weights, 3*nb_neurons*nb_neurons*sizeof(layer->recurrent_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ layer->reset_after = reset_after;
+ return 0;
+}
+
+int sparse_gru_init(SparseGRULayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *subias,
+ const char *diag_weights,
+ const char *recurrent_weights,
+ const char *idx,
+ int nb_neurons,
+ int activation,
+ int reset_after)
+{
+ int total_blocks;
+ if ((layer->bias = find_array_check(arrays, bias, 6*nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->subias = find_array_check(arrays, subias, 6*nb_neurons*sizeof(layer->subias[0]))) == NULL) return 1;
+ if ((layer->diag_weights = find_array_check(arrays, diag_weights, 3*nb_neurons*sizeof(layer->diag_weights[0]))) == NULL) return 1;
+ if ((layer->idx = find_idx_check(arrays, idx, nb_neurons, 3*nb_neurons, &total_blocks)) == NULL) return 1;
+ if ((layer->recurrent_weights = find_array_check(arrays, recurrent_weights, SPARSE_BLOCK_SIZE*total_blocks*sizeof(layer->recurrent_weights[0]))) == NULL) return 1;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ layer->reset_after = reset_after;
+ return 0;
+}
+
+int conv1d_init(Conv1DLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ int nb_inputs,
+ int kernel_size,
+ int nb_neurons,
+ int activation)
+{
+ if ((layer->bias = find_array_check(arrays, bias, nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, kernel_size*nb_inputs*nb_neurons*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->kernel_size = kernel_size;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ return 0;
+}
+
+int embedding_init(EmbeddingLayer *layer, const WeightArray *arrays,
+ const char *embedding_weights,
+ int nb_inputs,
+ int dim)
+{
+ if ((layer->embedding_weights = find_array_check(arrays, embedding_weights, nb_inputs*dim*sizeof(layer->embedding_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->dim = dim;
+ return 0;
+}
+
+
+
#if 0
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/stat.h>
#include <stdio.h>

/* Stand-alone test driver (disabled): mmaps a weight blob and lists the
   arrays it contains. Enable and build manually when debugging the
   parser. Every syscall and the parse result are now checked so a
   missing/corrupt file reports an error instead of crashing. */
int main(void)
{
  int fd;
  unsigned char *data;
  int len;
  int nb_arrays;
  int i;
  WeightArray *list;
  struct stat st;
  const char *filename = "weights_blob.bin";
  if (stat(filename, &st) != 0) {
    perror(filename);
    return 1;
  }
  len = st.st_size;
  fd = open(filename, O_RDONLY);
  if (fd < 0) {
    perror(filename);
    return 1;
  }
  data = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
  if (data == MAP_FAILED) {
    perror("mmap");
    close(fd);
    return 1;
  }
  printf("size is %d\n", len);
  nb_arrays = parse_weights(&list, data, len);
  if (nb_arrays < 0) {
    /* On failure parse_weights() sets list to NULL; indexing it would crash. */
    fprintf(stderr, "parse_weights() failed\n");
    munmap(data, len);
    close(fd);
    return 1;
  }
  for (i=0;i<nb_arrays;i++) {
    printf("found %s: size %d\n", list[i].name, list[i].size);
  }
  /* i == nb_arrays here: prints the terminator's name (expected NULL). */
  printf("%p\n", (void *)list[i].name);
  free(list);
  munmap(data, len);
  close(fd);
  return 0;
}
#endif
diff --git a/dnn/pitch.c b/dnn/pitch.c
new file mode 100644
index 00000000..0fd50233
--- /dev/null
+++ b/dnn/pitch.c
@@ -0,0 +1,84 @@
+/* Copyright (c) 2007-2008 CSIRO
+ Copyright (c) 2007-2009 Xiph.Org Foundation
+ Written by Jean-Marc Valin */
+/**
+ @file pitch.c
+ @brief Pitch analysis
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pitch.h"
+#include "common.h"
+#include "math.h"
+
+
+
/* Computes the pitch cross-correlation
     xcorr[i] = sum_{j<len} _x[j]*_y[i+j]   for i in [0, max_pitch),
   i.e. the correlation of _x against _y at every candidate lag. */
void celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
      opus_val32 *xcorr, int len, int max_pitch)
{

#if 0 /* This is a simple version of the pitch correlation that should work
         well on DSPs like Blackfin and TI C5x/C6x */
   int i, j;
   for (i=0;i<max_pitch;i++)
   {
      opus_val32 sum = 0;
      for (j=0;j<len;j++)
         sum = MAC16_16(sum, _x[j], _y[i+j]);
      xcorr[i] = sum;
   }

#else /* Unrolled version of the pitch correlation -- runs faster on x86 and ARM */
   int i;
   /*The EDSP version requires that max_pitch is at least 1, and that _x is
      32-bit aligned.
      Since it's hard to put asserts in assembly, put them here.*/
   celt_assert(max_pitch>0);
   /* NOTE(review): (unsigned char*)_x - (unsigned char*)NULL is formally
      undefined pointer arithmetic; it is only used as an alignment check
      here -- consider a uintptr_t cast if the toolchain allows. */
   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
   /* Compute four consecutive lags per call to the unrolled kernel. */
   for (i=0;i<max_pitch-3;i+=4)
   {
      opus_val32 sum[4]={0,0,0,0};
      xcorr_kernel(_x, _y+i, sum, len);
      xcorr[i]=sum[0];
      xcorr[i+1]=sum[1];
      xcorr[i+2]=sum[2];
      xcorr[i+3]=sum[3];
   }
   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
   for (;i<max_pitch;i++)
   {
      opus_val32 sum;
      sum = celt_inner_prod(_x, _y+i, len);
      xcorr[i] = sum;
   }
#endif
}
+
diff --git a/dnn/pitch.h b/dnn/pitch.h
new file mode 100644
index 00000000..2784d537
--- /dev/null
+++ b/dnn/pitch.h
@@ -0,0 +1,122 @@
+/* Copyright (c) 2007-2008 CSIRO
+ Copyright (c) 2007-2009 Xiph.Org Foundation
+ Written by Jean-Marc Valin */
+/**
+ @file pitch.h
+ @brief Pitch analysis
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_H
+#define PITCH_H
+
+#include "arch.h"
+
/* OPT: This is the kernel you really want to optimize. It gets used a lot
   by the prefilter and by the PLC. */
/* Accumulates four correlation lags at once:
     sum[k] += sum_{j<len} x[j]*y[j+k]   for k = 0..3.
   y_0..y_3 form a rolling 4-sample window over y so each y sample is
   loaded only once; the main loop consumes 4 x samples per iteration,
   rotating which of y_0..y_3 plays each role, and the three trailing
   `if` blocks continue the rotation for the len%4 leftover samples. */
static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
   int j;
   opus_val16 y_0, y_1, y_2, y_3;
   celt_assert(len>=3);
   y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
   /* Prime the rolling window with the first three y samples. */
   y_0=*y++;
   y_1=*y++;
   y_2=*y++;
   for (j=0;j<len-3;j+=4)
   {
      opus_val16 tmp;
      tmp = *x++;
      y_3=*y++;
      sum[0] = MAC16_16(sum[0],tmp,y_0);
      sum[1] = MAC16_16(sum[1],tmp,y_1);
      sum[2] = MAC16_16(sum[2],tmp,y_2);
      sum[3] = MAC16_16(sum[3],tmp,y_3);
      tmp=*x++;
      y_0=*y++;
      sum[0] = MAC16_16(sum[0],tmp,y_1);
      sum[1] = MAC16_16(sum[1],tmp,y_2);
      sum[2] = MAC16_16(sum[2],tmp,y_3);
      sum[3] = MAC16_16(sum[3],tmp,y_0);
      tmp=*x++;
      y_1=*y++;
      sum[0] = MAC16_16(sum[0],tmp,y_2);
      sum[1] = MAC16_16(sum[1],tmp,y_3);
      sum[2] = MAC16_16(sum[2],tmp,y_0);
      sum[3] = MAC16_16(sum[3],tmp,y_1);
      tmp=*x++;
      y_2=*y++;
      sum[0] = MAC16_16(sum[0],tmp,y_3);
      sum[1] = MAC16_16(sum[1],tmp,y_0);
      sum[2] = MAC16_16(sum[2],tmp,y_1);
      sum[3] = MAC16_16(sum[3],tmp,y_2);
   }
   /* Tail: up to three remaining samples, same rotation as the loop. */
   if (j++<len)
   {
      opus_val16 tmp = *x++;
      y_3=*y++;
      sum[0] = MAC16_16(sum[0],tmp,y_0);
      sum[1] = MAC16_16(sum[1],tmp,y_1);
      sum[2] = MAC16_16(sum[2],tmp,y_2);
      sum[3] = MAC16_16(sum[3],tmp,y_3);
   }
   if (j++<len)
   {
      opus_val16 tmp=*x++;
      y_0=*y++;
      sum[0] = MAC16_16(sum[0],tmp,y_1);
      sum[1] = MAC16_16(sum[1],tmp,y_2);
      sum[2] = MAC16_16(sum[2],tmp,y_3);
      sum[3] = MAC16_16(sum[3],tmp,y_0);
   }
   if (j<len)
   {
      opus_val16 tmp=*x++;
      y_1=*y++;
      sum[0] = MAC16_16(sum[0],tmp,y_2);
      sum[1] = MAC16_16(sum[1],tmp,y_3);
      sum[2] = MAC16_16(sum[2],tmp,y_0);
      sum[3] = MAC16_16(sum[3],tmp,y_1);
   }
}
+
+/*We make sure a C version is always available for cases where the overhead of
+ vectorization and passing around an arch flag aren't worth it.*/
+static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x,
+ const opus_val16 *y, int N)
+{
+ int i;
+ opus_val32 xy=0;
+ for (i=0;i<N;i++)
+ xy = MAC16_16(xy, x[i], y[i]);
+ return xy;
+}
+
+void celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch);
+
+#endif
diff --git a/dnn/tansig_table.h b/dnn/tansig_table.h
new file mode 100644
index 00000000..ebec7e3a
--- /dev/null
+++ b/dnn/tansig_table.h
@@ -0,0 +1,50 @@
+/* This file is auto-generated by gen_tables */
+
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
/* Samples of tanh on a uniform grid: tansig_table[i] == tanh(0.04*i)
   for i = 0..200, i.e. x in [0, 8] in steps of 0.04 (e.g. entry 25 is
   tanh(1.0) = 0.761594).  NOTE(review): presumably consumed by the
   tansig/sigmoid approximations in the vec headers (test_vec.c includes
   this table alongside them) -- confirm against the users. */
static const float tansig_table[201] = {
0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f,
};
+
+#endif /*TANSIG_TABLE_H*/
diff --git a/dnn/test_lpcnet.c b/dnn/test_lpcnet.c
new file mode 100644
index 00000000..01917993
--- /dev/null
+++ b/dnn/test_lpcnet.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2018 Mozilla */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <math.h>
+#include <stdio.h>
+#include "arch.h"
+#include "lpcnet.h"
+#include "freq.h"
+
+
+int main(int argc, char **argv) {
+ FILE *fin, *fout;
+ LPCNetState *net;
+ net = lpcnet_create();
+ if (argc != 3)
+ {
+ fprintf(stderr, "usage: test_lpcnet <features.f32> <output.pcm>\n");
+ return 0;
+ }
+ fin = fopen(argv[1], "rb");
+ if (fin == NULL) {
+ fprintf(stderr, "Can't open %s\n", argv[1]);
+ exit(1);
+ }
+
+ fout = fopen(argv[2], "wb");
+ if (fout == NULL) {
+ fprintf(stderr, "Can't open %s\n", argv[2]);
+ exit(1);
+ }
+
+ while (1) {
+ float in_features[NB_TOTAL_FEATURES];
+ float features[NB_FEATURES];
+ short pcm[FRAME_SIZE];
+ fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
+ if (feof(fin)) break;
+ RNN_COPY(features, in_features, NB_FEATURES);
+ lpcnet_synthesize(net, features, pcm, FRAME_SIZE);
+ fwrite(pcm, sizeof(pcm[0]), FRAME_SIZE, fout);
+ }
+ fclose(fin);
+ fclose(fout);
+ lpcnet_destroy(net);
+ return 0;
+}
diff --git a/dnn/test_vec.c b/dnn/test_vec.c
new file mode 100644
index 00000000..1fdc7cb4
--- /dev/null
+++ b/dnn/test_vec.c
@@ -0,0 +1,130 @@
+#include <stdio.h>
+#include <math.h>
+#include "opus_types.h"
+#include "arch.h"
+#include "common.h"
+#include "tansig_table.h"
+
+#define LPCNET_TEST
+
+// We need to call two versions of each function that has the same
+// name, so use #defines to temporarily rename them.
+
+#define lpcnet_exp2 lpcnet_exp2_fast
+#define tansig_approx tansig_approx_fast
+#define sigmoid_approx sigmoid_approx_fast
+#define softmax softmax_fast
+#define vec_tanh vec_tanh_fast
+#define vec_sigmoid vec_sigmoid_fast
+#define sgemv_accum16 sgemv_accum16_fast
+#define sparse_sgemv_accum16 sparse_sgemv_accum16_fast
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#ifdef __AVX2__
+const char simd[]="AVX2";
+#else
+const char simd[]="AVX";
+#endif
+#elif __ARM_NEON__
+#include "vec_neon.h"
+const char simd[]="NEON";
+#else
+const char simd[]="none";
+
+#endif
+
+#undef lpcnet_exp2
+#undef tansig_approx
+#undef sigmoid_approx
+#undef softmax
+#undef vec_tanh
+#undef vec_sigmoid
+#undef sgemv_accum16
+#undef sparse_sgemv_accum16
+#include "vec.h"
+
+#define ROW_STEP 16
+#define ROWS ROW_STEP*10
+#define COLS 2
+#define ENTRIES 2
+
+int test_sgemv_accum16() {
+ float weights[ROWS*COLS];
+ float x[COLS];
+ float out[ROWS], out_fast[ROWS];
+ int i;
+
+ printf("sgemv_accum16.....................: ");
+ for(i=0; i<ROWS*COLS; i++) {
+ weights[i] = i;
+ }
+ for(i=0; i<ROWS; i++) {
+ out[i] = 0;
+ out_fast[i] = 0;
+ }
+
+ for(i=0; i<COLS; i++) {
+ x[i] = i+1;
+ }
+
+ sgemv_accum16(out, weights, ROWS, COLS, 1, x);
+ sgemv_accum16_fast(out_fast, weights, ROWS, COLS, 1, x);
+
+ for(i=0; i<ROWS; i++) {
+ if (out[i] != out_fast[i]) {
+ printf("fail\n");
+ for(i=0; i<ROWS; i++) {
+ printf("%d %f %f\n", i, out[i], out_fast[i]);
+ if (out[i] != out_fast[i])
+ return 1;
+ }
+ }
+ }
+
+ printf("pass\n");
+ return 0;
+}
+
+
+int test_sparse_sgemv_accum16() {
+ int rows = ROW_STEP*ENTRIES;
+ int indx[] = {1,0,2,0,1};
+ float w[ROW_STEP*(1+2)];
+ float x[ENTRIES] = {1,2};
+ float out[ROW_STEP*(1+2)], out_fast[ROW_STEP*(1+2)];
+ int i;
+
+ printf("sparse_sgemv_accum16..............: ");
+ for(i=0; i<ROW_STEP*(1+2); i++) {
+ w[i] = i;
+ out[i] = 0;
+ out_fast[i] = 0;
+ }
+
+ sparse_sgemv_accum16(out, w, rows, indx, x);
+ sparse_sgemv_accum16_fast(out_fast, w, rows, indx, x);
+
+ for(i=0; i<ROW_STEP*ENTRIES; i++) {
+ if (out[i] != out_fast[i]) {
+ printf("fail\n");
+ for(i=0; i<ROW_STEP*ENTRIES; i++) {
+ printf("%d %f %f\n", i, out[i], out_fast[i]);
+ if (out[i] != out_fast[i])
+ return 1;
+ }
+ }
+ }
+
+ printf("pass\n");
+ return 0;
+}
+
+int main() {
+ printf("testing vector routines on SIMD: %s\n", simd);
+ int test1 = test_sgemv_accum16();
+ int test2 = test_sparse_sgemv_accum16();
+ return test1 || test2;
+}
+
+
diff --git a/dnn/torch/rdovae/README.md b/dnn/torch/rdovae/README.md
new file mode 100644
index 00000000..14359d82
--- /dev/null
+++ b/dnn/torch/rdovae/README.md
@@ -0,0 +1,24 @@
+# Rate-Distortion-Optimized Variational Auto-Encoder
+
+## Setup
+The python code requires python >= 3.6 and has been tested with python 3.6 and python 3.10. To install requirements run
+```
+python -m pip install -r requirements.txt
+```
+
+## Training
+To generate training data, use the `dump_data` tool from the main LPCNet repo
+```
+./dump_data -train 16khz_speech_input.s16 features.f32 data.s16
+```
+
+To train the model, simply run
+```
+python train_rdovae.py features.f32 output_folder
+```
+
+To train on a CUDA device, add `--cuda-visible-devices idx`.
+
+
+## ToDo
+- Upload checkpoints and add URLs
diff --git a/dnn/torch/rdovae/export_rdovae_weights.py b/dnn/torch/rdovae/export_rdovae_weights.py
new file mode 100644
index 00000000..fdef5f63
--- /dev/null
+++ b/dnn/torch/rdovae/export_rdovae_weights.py
@@ -0,0 +1,258 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import argparse
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('checkpoint', type=str, help='rdovae model checkpoint')
+parser.add_argument('output_dir', type=str, help='output folder')
+parser.add_argument('--format', choices=['C', 'numpy'], help='output format, default: C', default='C')
+
+args = parser.parse_args()
+
+import torch
+import numpy as np
+
+from rdovae import RDOVAE
+from wexchange.torch import dump_torch_weights
+from wexchange.c_export import CWriter, print_vector
+
+
+def dump_statistical_model(writer, qembedding):
+ w = qembedding.weight.detach()
+ levels, dim = w.shape
+ N = dim // 6
+
+ print("printing statistical model")
+ quant_scales = torch.nn.functional.softplus(w[:, : N]).numpy()
+ dead_zone = 0.05 * torch.nn.functional.softplus(w[:, N : 2 * N]).numpy()
+ r = torch.sigmoid(w[:, 5 * N : 6 * N]).numpy()
+ p0 = torch.sigmoid(w[:, 4 * N : 5 * N]).numpy()
+ p0 = 1 - r ** (0.5 + 0.5 * p0)
+
+ quant_scales_q8 = np.round(quant_scales * 2**8).astype(np.uint16)
+ dead_zone_q10 = np.round(dead_zone * 2**10).astype(np.uint16)
+ r_q15 = np.round(r * 2**15).astype(np.uint16)
+ p0_q15 = np.round(p0 * 2**15).astype(np.uint16)
+
+ print_vector(writer.source, quant_scales_q8, 'dred_quant_scales_q8', dtype='opus_uint16', static=False)
+ print_vector(writer.source, dead_zone_q10, 'dred_dead_zone_q10', dtype='opus_uint16', static=False)
+ print_vector(writer.source, r_q15, 'dred_r_q15', dtype='opus_uint16', static=False)
+ print_vector(writer.source, p0_q15, 'dred_p0_q15', dtype='opus_uint16', static=False)
+
+ writer.header.write(
+f"""
+extern const opus_uint16 dred_quant_scales_q8[{levels * N}];
+extern const opus_uint16 dred_dead_zone_q10[{levels * N}];
+extern const opus_uint16 dred_r_q15[{levels * N}];
+extern const opus_uint16 dred_p0_q15[{levels * N}];
+
+"""
+ )
+
+
+def c_export(args, model):
+
+ message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"
+
+ enc_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_enc_data"), message=message)
+ dec_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_dec_data"), message=message)
+ stats_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_stats_data"), message=message)
+ constants_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_constants"), message=message, header_only=True)
+
+ # some custom includes
+ for writer in [enc_writer, dec_writer, stats_writer]:
+ writer.header.write(
+f"""
+#include "opus_types.h"
+
+#include "dred_rdovae_constants.h"
+
+#include "nnet.h"
+"""
+ )
+
+ # encoder
+ encoder_dense_layers = [
+ ('core_encoder.module.dense_1' , 'enc_dense1', 'TANH'),
+ ('core_encoder.module.dense_2' , 'enc_dense3', 'TANH'),
+ ('core_encoder.module.dense_3' , 'enc_dense5', 'TANH'),
+ ('core_encoder.module.dense_4' , 'enc_dense7', 'TANH'),
+ ('core_encoder.module.dense_5' , 'enc_dense8', 'TANH'),
+ ('core_encoder.module.state_dense_1' , 'gdense1' , 'TANH'),
+ ('core_encoder.module.state_dense_2' , 'gdense2' , 'TANH')
+ ]
+
+ for name, export_name, activation in encoder_dense_layers:
+ layer = model.get_submodule(name)
+ dump_torch_weights(enc_writer, layer, name=export_name, activation=activation, verbose=True)
+
+
+ encoder_gru_layers = [
+ ('core_encoder.module.gru_1' , 'enc_dense2', 'TANH'),
+ ('core_encoder.module.gru_2' , 'enc_dense4', 'TANH'),
+ ('core_encoder.module.gru_3' , 'enc_dense6', 'TANH')
+ ]
+
+ enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
+ for name, export_name, activation in encoder_gru_layers])
+
+
+ encoder_conv_layers = [
+ ('core_encoder.module.conv1' , 'bits_dense' , 'LINEAR')
+ ]
+
+ enc_max_conv_inputs = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True) for name, export_name, activation in encoder_conv_layers])
+
+
+ del enc_writer
+
+ # decoder
+ decoder_dense_layers = [
+ ('core_decoder.module.gru_1_init' , 'state1', 'TANH'),
+ ('core_decoder.module.gru_2_init' , 'state2', 'TANH'),
+ ('core_decoder.module.gru_3_init' , 'state3', 'TANH'),
+ ('core_decoder.module.dense_1' , 'dec_dense1', 'TANH'),
+ ('core_decoder.module.dense_2' , 'dec_dense3', 'TANH'),
+ ('core_decoder.module.dense_3' , 'dec_dense5', 'TANH'),
+ ('core_decoder.module.dense_4' , 'dec_dense7', 'TANH'),
+ ('core_decoder.module.dense_5' , 'dec_dense8', 'TANH'),
+ ('core_decoder.module.output' , 'dec_final', 'LINEAR')
+ ]
+
+ for name, export_name, activation in decoder_dense_layers:
+ layer = model.get_submodule(name)
+ dump_torch_weights(dec_writer, layer, name=export_name, activation=activation, verbose=True)
+
+
+ decoder_gru_layers = [
+ ('core_decoder.module.gru_1' , 'dec_dense2', 'TANH'),
+ ('core_decoder.module.gru_2' , 'dec_dense4', 'TANH'),
+ ('core_decoder.module.gru_3' , 'dec_dense6', 'TANH')
+ ]
+
+ dec_max_rnn_units = max([dump_torch_weights(dec_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
+ for name, export_name, activation in decoder_gru_layers])
+
+ del dec_writer
+
+ # statistical model
+ qembedding = model.statistical_model.quant_embedding
+ dump_statistical_model(stats_writer, qembedding)
+
+ del stats_writer
+
+ # constants
+ constants_writer.header.write(
+f"""
+#define DRED_NUM_FEATURES {model.feature_dim}
+
+#define DRED_LATENT_DIM {model.latent_dim}
+
+#define DRED_STATE_DIM {model.state_dim}
+
+#define DRED_NUM_QUANTIZATION_LEVELS {model.quant_levels}
+
+#define DRED_MAX_RNN_NEURONS {max(enc_max_rnn_units, dec_max_rnn_units)}
+
+#define DRED_MAX_CONV_INPUTS {enc_max_conv_inputs}
+
+#define DRED_ENC_MAX_RNN_NEURONS {enc_max_rnn_units}
+
+#define DRED_ENC_MAX_CONV_INPUTS {enc_max_conv_inputs}
+
+#define DRED_DEC_MAX_RNN_NEURONS {dec_max_rnn_units}
+
+"""
+ )
+
+ del constants_writer
+
+
+def numpy_export(args, model):
+
+ exchange_name_to_name = {
+ 'encoder_stack_layer1_dense' : 'core_encoder.module.dense_1',
+ 'encoder_stack_layer3_dense' : 'core_encoder.module.dense_2',
+ 'encoder_stack_layer5_dense' : 'core_encoder.module.dense_3',
+ 'encoder_stack_layer7_dense' : 'core_encoder.module.dense_4',
+ 'encoder_stack_layer8_dense' : 'core_encoder.module.dense_5',
+ 'encoder_state_layer1_dense' : 'core_encoder.module.state_dense_1',
+ 'encoder_state_layer2_dense' : 'core_encoder.module.state_dense_2',
+ 'encoder_stack_layer2_gru' : 'core_encoder.module.gru_1',
+ 'encoder_stack_layer4_gru' : 'core_encoder.module.gru_2',
+ 'encoder_stack_layer6_gru' : 'core_encoder.module.gru_3',
+ 'encoder_stack_layer9_conv' : 'core_encoder.module.conv1',
+ 'statistical_model_embedding' : 'statistical_model.quant_embedding',
+ 'decoder_state1_dense' : 'core_decoder.module.gru_1_init',
+ 'decoder_state2_dense' : 'core_decoder.module.gru_2_init',
+ 'decoder_state3_dense' : 'core_decoder.module.gru_3_init',
+ 'decoder_stack_layer1_dense' : 'core_decoder.module.dense_1',
+ 'decoder_stack_layer3_dense' : 'core_decoder.module.dense_2',
+ 'decoder_stack_layer5_dense' : 'core_decoder.module.dense_3',
+ 'decoder_stack_layer7_dense' : 'core_decoder.module.dense_4',
+ 'decoder_stack_layer8_dense' : 'core_decoder.module.dense_5',
+ 'decoder_stack_layer9_dense' : 'core_decoder.module.output',
+ 'decoder_stack_layer2_gru' : 'core_decoder.module.gru_1',
+ 'decoder_stack_layer4_gru' : 'core_decoder.module.gru_2',
+ 'decoder_stack_layer6_gru' : 'core_decoder.module.gru_3'
+ }
+
+ name_to_exchange_name = {value : key for key, value in exchange_name_to_name.items()}
+
+ for name, exchange_name in name_to_exchange_name.items():
+ print(f"printing layer {name}...")
+ dump_torch_weights(os.path.join(args.output_dir, exchange_name), model.get_submodule(name))
+
+
+if __name__ == "__main__":
+
+
+ os.makedirs(args.output_dir, exist_ok=True)
+
+
+ # load model from checkpoint
+ checkpoint = torch.load(args.checkpoint, map_location='cpu')
+ model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+ missing_keys, unmatched_keys = model.load_state_dict(checkpoint['state_dict'], strict=False)
+
+ if len(missing_keys) > 0:
+ raise ValueError(f"error: missing keys in state dict")
+
+ if len(unmatched_keys) > 0:
+ print(f"warning: the following keys were unmatched {unmatched_keys}")
+
+ if args.format == 'C':
+ c_export(args, model)
+ elif args.format == 'numpy':
+ numpy_export(args, model)
+ else:
+ raise ValueError(f'error: unknown export format {args.format}') \ No newline at end of file
diff --git a/dnn/torch/rdovae/fec_encoder.py b/dnn/torch/rdovae/fec_encoder.py
new file mode 100644
index 00000000..291c0628
--- /dev/null
+++ b/dnn/torch/rdovae/fec_encoder.py
@@ -0,0 +1,213 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe and Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import subprocess
+import argparse
+
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+parser = argparse.ArgumentParser(description='Encode redundancy for Opus neural FEC. Designed for use with voip application and 20ms frames')
+
+parser.add_argument('input', metavar='<input signal>', help='audio input (.wav or .raw or .pcm as int16)')
+parser.add_argument('checkpoint', metavar='<weights>', help='model checkpoint')
+parser.add_argument('q0', metavar='<quant level 0>', type=int, help='quantization level for most recent frame')
+parser.add_argument('q1', metavar='<quant level 1>', type=int, help='quantization level for oldest frame')
+parser.add_argument('output', type=str, help='output file (will be extended with .fec)')
+
+parser.add_argument('--dump-data', type=str, default='./dump_data', help='path to dump data executable (default ./dump_data)')
+parser.add_argument('--num-redundancy-frames', default=52, type=int, help='number of redundancy frames per packet (default 52)')
+parser.add_argument('--extra-delay', default=0, type=int, help="last features in packet are calculated with the decoder aligned samples, use this option to add extra delay (in samples at 16kHz)")
+parser.add_argument('--lossfile', type=str, help='file containing loss trace (0 for frame received, 1 for lost)')
+parser.add_argument('--debug-output', action='store_true', help='if set, differently assembled features are written to disk')
+
+args = parser.parse_args()
+
+import numpy as np
+from scipy.io import wavfile
+import torch
+
+from rdovae import RDOVAE
+from packets import write_fec_packets
+
+torch.set_num_threads(4)
+
+checkpoint = torch.load(args.checkpoint, map_location="cpu")
+model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+model.load_state_dict(checkpoint['state_dict'], strict=False)
+model.to("cpu")
+
+lpc_order = 16
+
+## prepare input signal
+# SILK frame size is 20ms and LPCNet subframes are 10ms
+subframe_size = 160
+frame_size = 2 * subframe_size
+
+# 91 samples delay to align with SILK decoded frames
+silk_delay = 91
+
+# prepend zeros to have enough history to produce the first package
+zero_history = (args.num_redundancy_frames - 1) * frame_size
+
+# dump data has a (feature) delay of 10ms
+dump_data_delay = 160
+
+total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay
+
+# load signal
+if args.input.endswith('.raw') or args.input.endswith('.pcm'):
+ signal = np.fromfile(args.input, dtype='int16')
+
+elif args.input.endswith('.wav'):
+ fs, signal = wavfile.read(args.input)
+else:
+ raise ValueError(f'unknown input signal format: {args.input}')
+
+# fill up last frame with zeros
+padded_signal_length = len(signal) + total_delay
+tail = padded_signal_length % frame_size
+right_padding = (frame_size - tail) % frame_size
+
+signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
+
+padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
+signal.tofile(padded_signal_file)
+
+# write signal and call dump_data to create features
+
+feature_file = os.path.splitext(args.input)[0] + '_features.f32'
+command = f"{args.dump_data} -test {padded_signal_file} {feature_file}"
+r = subprocess.run(command, shell=True)
+if r.returncode != 0:
+ raise RuntimeError(f"command '{command}' failed with exit code {r.returncode}")
+
+# load features
+nb_features = model.feature_dim + lpc_order
+nb_used_features = model.feature_dim
+
+# load features
+features = np.fromfile(feature_file, dtype='float32')
+num_subframes = len(features) // nb_features
+num_subframes = 2 * (num_subframes // 2)
+num_frames = num_subframes // 2
+
+features = np.reshape(features, (1, -1, nb_features))
+features = features[:, :, :nb_used_features]
+features = features[:, :num_subframes, :]
+
+# quant_ids in reverse decoding order
+quant_ids = torch.round((args.q1 + (args.q0 - args.q1) * torch.arange(args.num_redundancy_frames // 2) / (args.num_redundancy_frames // 2 - 1))).long()
+
+print(f"using quantization levels {quant_ids}...")
+
+# convert input to torch tensors
+features = torch.from_numpy(features)
+
+
+# run encoder
+print("running fec encoder...")
+with torch.no_grad():
+
+ # encoding
+ z, states, state_size = model.encode(features)
+
+
+ # decoder on packet chunks
+ input_length = args.num_redundancy_frames // 2
+ offset = args.num_redundancy_frames - 1
+
+ packets = []
+ packet_sizes = []
+
+ for i in range(offset, num_frames):
+ print(f"processing frame {i - offset}...")
+ # quantize / unquantize latent vectors
+ zi = torch.clone(z[:, i - 2 * input_length + 2: i + 1 : 2, :])
+ zi, rates = model.quantize(zi, quant_ids)
+ zi = model.unquantize(zi, quant_ids)
+
+ features = model.decode(zi, states[:, i : i + 1, :])
+ packets.append(features.squeeze(0).numpy())
+ packet_size = 8 * int((torch.sum(rates) + 7 + state_size) / 8)
+ packet_sizes.append(packet_size)
+
+
+# write packets
+packet_file = args.output + '.fec' if not args.output.endswith('.fec') else args.output
+write_fec_packets(packet_file, packets, packet_sizes)
+
+
+print(f"average redundancy rate: {int(round(sum(packet_sizes) / len(packet_sizes) * 50 / 1000))} kbps")
+
+# assemble features according to loss file
+if args.lossfile != None:
+ num_packets = len(packets)
+ loss = np.loadtxt(args.lossfile, dtype='int16')
+ fec_out = np.zeros((num_packets * 2, packets[0].shape[-1]), dtype='float32')
+ foffset = -2
+ ptr = 0
+ count = 2
+ for i in range(num_packets):
+ if (loss[i] == 0) or (i == num_packets - 1):
+
+ fec_out[ptr:ptr+count,:] = packets[i][foffset:, :]
+
+ ptr += count
+ foffset = -2
+ count = 2
+ else:
+ count += 2
+ foffset -= 2
+
+ fec_out_full = np.zeros((fec_out.shape[0], 36), dtype=np.float32)
+ fec_out_full[:, : fec_out.shape[-1]] = fec_out
+
+ fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')
+
+
+if args.debug_output:
+ import itertools
+
+ batches = [4]
+ offsets = [0, 2 * args.num_redundancy_frames - 4]
+
+ # sanity checks
+ # 1. concatenate features at offset 0
+ for batch, offset in itertools.product(batches, offsets):
+
+ stop = packets[0].shape[1] - offset
+ test_features = np.concatenate([packet[stop - batch: stop, :] for packet in packets[::batch//2]], axis=0)
+
+ test_features_full = np.zeros((test_features.shape[0], nb_features), dtype=np.float32)
+ test_features_full[:, :nb_used_features] = test_features[:, :]
+
+ print(f"writing debug output {packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32'}")
+ test_features_full.tofile(packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32')
+
diff --git a/dnn/torch/rdovae/import_rdovae_weights.py b/dnn/torch/rdovae/import_rdovae_weights.py
new file mode 100644
index 00000000..eba05018
--- /dev/null
+++ b/dnn/torch/rdovae/import_rdovae_weights.py
@@ -0,0 +1,143 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+import argparse
+
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('exchange_folder', type=str, help='exchange folder path')
+parser.add_argument('output', type=str, help='path to output model checkpoint')
+
+model_group = parser.add_argument_group(title="model parameters")
+model_group.add_argument('--num-features', type=int, help="number of features, default: 20", default=20)
+model_group.add_argument('--latent-dim', type=int, help="number of symbols produced by the encoder, default: 80", default=80)
+model_group.add_argument('--cond-size', type=int, help="first conditioning size, default: 256", default=256)
+model_group.add_argument('--cond-size2', type=int, help="second conditioning size, default: 256", default=256)
+model_group.add_argument('--state-dim', type=int, help="dimensionality of transferred state, default: 24", default=24)
+model_group.add_argument('--quant-levels', type=int, help="number of quantization levels, default: 40", default=40)
+
+args = parser.parse_args()
+
+import torch
+from rdovae import RDOVAE
+from wexchange.torch import load_torch_weights
+
+exchange_name_to_name = {
+ 'encoder_stack_layer1_dense' : 'core_encoder.module.dense_1',
+ 'encoder_stack_layer3_dense' : 'core_encoder.module.dense_2',
+ 'encoder_stack_layer5_dense' : 'core_encoder.module.dense_3',
+ 'encoder_stack_layer7_dense' : 'core_encoder.module.dense_4',
+ 'encoder_stack_layer8_dense' : 'core_encoder.module.dense_5',
+ 'encoder_state_layer1_dense' : 'core_encoder.module.state_dense_1',
+ 'encoder_state_layer2_dense' : 'core_encoder.module.state_dense_2',
+ 'encoder_stack_layer2_gru' : 'core_encoder.module.gru_1',
+ 'encoder_stack_layer4_gru' : 'core_encoder.module.gru_2',
+ 'encoder_stack_layer6_gru' : 'core_encoder.module.gru_3',
+ 'encoder_stack_layer9_conv' : 'core_encoder.module.conv1',
+ 'statistical_model_embedding' : 'statistical_model.quant_embedding',
+ 'decoder_state1_dense' : 'core_decoder.module.gru_1_init',
+ 'decoder_state2_dense' : 'core_decoder.module.gru_2_init',
+ 'decoder_state3_dense' : 'core_decoder.module.gru_3_init',
+ 'decoder_stack_layer1_dense' : 'core_decoder.module.dense_1',
+ 'decoder_stack_layer3_dense' : 'core_decoder.module.dense_2',
+ 'decoder_stack_layer5_dense' : 'core_decoder.module.dense_3',
+ 'decoder_stack_layer7_dense' : 'core_decoder.module.dense_4',
+ 'decoder_stack_layer8_dense' : 'core_decoder.module.dense_5',
+ 'decoder_stack_layer9_dense' : 'core_decoder.module.output',
+ 'decoder_stack_layer2_gru' : 'core_decoder.module.gru_1',
+ 'decoder_stack_layer4_gru' : 'core_decoder.module.gru_2',
+ 'decoder_stack_layer6_gru' : 'core_decoder.module.gru_3'
+}
+
+if __name__ == "__main__":
+ checkpoint = dict()
+
+ # parameters
+ num_features = args.num_features
+ latent_dim = args.latent_dim
+ quant_levels = args.quant_levels
+ cond_size = args.cond_size
+ cond_size2 = args.cond_size2
+ state_dim = args.state_dim
+
+
+ # model
+ checkpoint['model_args'] = (num_features, latent_dim, quant_levels, cond_size, cond_size2)
+ checkpoint['model_kwargs'] = {'state_dim': state_dim}
+ model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+
+ dense_layer_names = [
+ 'encoder_stack_layer1_dense',
+ 'encoder_stack_layer3_dense',
+ 'encoder_stack_layer5_dense',
+ 'encoder_stack_layer7_dense',
+ 'encoder_stack_layer8_dense',
+ 'encoder_state_layer1_dense',
+ 'encoder_state_layer2_dense',
+ 'decoder_state1_dense',
+ 'decoder_state2_dense',
+ 'decoder_state3_dense',
+ 'decoder_stack_layer1_dense',
+ 'decoder_stack_layer3_dense',
+ 'decoder_stack_layer5_dense',
+ 'decoder_stack_layer7_dense',
+ 'decoder_stack_layer8_dense',
+ 'decoder_stack_layer9_dense'
+ ]
+
+ gru_layer_names = [
+ 'encoder_stack_layer2_gru',
+ 'encoder_stack_layer4_gru',
+ 'encoder_stack_layer6_gru',
+ 'decoder_stack_layer2_gru',
+ 'decoder_stack_layer4_gru',
+ 'decoder_stack_layer6_gru'
+ ]
+
+ conv1d_layer_names = [
+ 'encoder_stack_layer9_conv'
+ ]
+
+ embedding_layer_names = [
+ 'statistical_model_embedding'
+ ]
+
+ for name in dense_layer_names + gru_layer_names + conv1d_layer_names + embedding_layer_names:
+ print(f"loading weights for layer {exchange_name_to_name[name]}")
+ layer = model.get_submodule(exchange_name_to_name[name])
+ load_torch_weights(os.path.join(args.exchange_folder, name), layer)
+
+ checkpoint['state_dict'] = model.state_dict()
+
+ torch.save(checkpoint, args.output) \ No newline at end of file
diff --git a/dnn/torch/rdovae/libs/wexchange-1.0-py3-none-any.whl b/dnn/torch/rdovae/libs/wexchange-1.0-py3-none-any.whl
new file mode 100644
index 00000000..cfeebae5
--- /dev/null
+++ b/dnn/torch/rdovae/libs/wexchange-1.0-py3-none-any.whl
Binary files differ
diff --git a/dnn/torch/rdovae/libs/wexchange-1.2-py3-none-any.whl b/dnn/torch/rdovae/libs/wexchange-1.2-py3-none-any.whl
new file mode 100644
index 00000000..ac34d7d9
--- /dev/null
+++ b/dnn/torch/rdovae/libs/wexchange-1.2-py3-none-any.whl
Binary files differ
diff --git a/dnn/torch/rdovae/packets/__init__.py b/dnn/torch/rdovae/packets/__init__.py
new file mode 100644
index 00000000..fb71ab3d
--- /dev/null
+++ b/dnn/torch/rdovae/packets/__init__.py
@@ -0,0 +1 @@
+from .fec_packets import write_fec_packets, read_fec_packets \ No newline at end of file
diff --git a/dnn/torch/rdovae/packets/fec_packets.c b/dnn/torch/rdovae/packets/fec_packets.c
new file mode 100644
index 00000000..376fb4f1
--- /dev/null
+++ b/dnn/torch/rdovae/packets/fec_packets.c
@@ -0,0 +1,142 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "fec_packets.h"
+
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index)
+{
+
+ int16_t version;
+ int16_t header_size;
+ int16_t num_packets;
+ int16_t packet_size;
+ int16_t subframe_size;
+ int16_t subframes_per_packet;
+ int16_t num_features;
+ long offset;
+
+  FILE *fid = fopen(filename, "rb");
+  if (fid == NULL) return 1;
+  /* read header */
+ if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+ if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+ if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+ if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+ if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+ if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+ if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+ /* check if indices are valid */
+ if (packet_index >= num_packets || subframe_index >= subframes_per_packet)
+ {
+ fprintf(stderr, "get_fec_frame: index out of bounds\n");
+ goto error;
+ }
+
+ /* calculate offset in file (+ 2 is for rate) */
+ offset = header_size + packet_index * packet_size + 2 + subframe_index * subframe_size;
+ fseek(fid, offset, SEEK_SET);
+
+ /* read features */
+ if (fread(features, sizeof(*features), num_features, fid) != num_features) goto error;
+
+ fclose(fid);
+ return 0;
+
+error:
+ fclose(fid);
+ return 1;
+}
+
+int get_fec_rate(const char * const filename, int packet_index)
+{
+ int16_t version;
+ int16_t header_size;
+ int16_t num_packets;
+ int16_t packet_size;
+ int16_t subframe_size;
+ int16_t subframes_per_packet;
+ int16_t num_features;
+ long offset;
+ int16_t rate;
+
+  FILE *fid = fopen(filename, "rb");
+  if (fid == NULL) return -1;
+  /* read header */
+ if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+ if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+ if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+ if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+ if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+ if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+ if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+ /* check if indices are valid */
+ if (packet_index >= num_packets)
+ {
+ fprintf(stderr, "get_fec_rate: index out of bounds\n");
+ goto error;
+ }
+
+ /* calculate offset in file (+ 2 is for rate) */
+ offset = header_size + packet_index * packet_size;
+ fseek(fid, offset, SEEK_SET);
+
+ /* read rate */
+ if (fread(&rate, sizeof(rate), 1, fid) != 1) goto error;
+
+ fclose(fid);
+ return (int) rate;
+
+error:
+ fclose(fid);
+ return -1;
+}
+
+#if 0
+int main()
+{
+ float features[20];
+ int i;
+
+ if (get_fec_frame("../test.fec", &features[0], 0, 127))
+ {
+ return 1;
+ }
+
+ for (i = 0; i < 20; i ++)
+ {
+ printf("%d %f\n", i, features[i]);
+ }
+
+ printf("rate: %d\n", get_fec_rate("../test.fec", 0));
+
+}
+#endif \ No newline at end of file
diff --git a/dnn/torch/rdovae/packets/fec_packets.h b/dnn/torch/rdovae/packets/fec_packets.h
new file mode 100644
index 00000000..35d35542
--- /dev/null
+++ b/dnn/torch/rdovae/packets/fec_packets.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FEC_PACKETS_H
+#define _FEC_PACKETS_H
+
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index);
+int get_fec_rate(const char * const filename, int packet_index);
+
+#endif \ No newline at end of file
diff --git a/dnn/torch/rdovae/packets/fec_packets.py b/dnn/torch/rdovae/packets/fec_packets.py
new file mode 100644
index 00000000..14bed1f8
--- /dev/null
+++ b/dnn/torch/rdovae/packets/fec_packets.py
@@ -0,0 +1,108 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import numpy as np
+
+
+
+def write_fec_packets(filename, packets, rates=None):
+ """ writes packets in binary format """
+
+ assert np.dtype(np.float32).itemsize == 4
+ assert np.dtype(np.int16).itemsize == 2
+
+ # derive some sizes
+ num_packets = len(packets)
+ subframes_per_packet = packets[0].shape[-2]
+ num_features = packets[0].shape[-1]
+
+ # size of float is 4
+ subframe_size = num_features * 4
+ packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate
+
+ version = 1
+ # header size (version, header_size, num_packets, packet_size, subframe_size, subrames_per_packet, num_features)
+ header_size = 14
+
+ with open(filename, 'wb') as f:
+
+ # header
+ f.write(np.int16(version).tobytes())
+ f.write(np.int16(header_size).tobytes())
+ f.write(np.int16(num_packets).tobytes())
+ f.write(np.int16(packet_size).tobytes())
+ f.write(np.int16(subframe_size).tobytes())
+ f.write(np.int16(subframes_per_packet).tobytes())
+ f.write(np.int16(num_features).tobytes())
+
+ # packets
+ for i, packet in enumerate(packets):
+ if type(rates) == type(None):
+ rate = 0
+ else:
+ rate = rates[i]
+
+ f.write(np.int16(rate).tobytes())
+
+ features = np.flip(packet, axis=-2)
+ f.write(features.astype(np.float32).tobytes())
+
+
+def read_fec_packets(filename):
+ """ reads packets from binary format """
+
+ assert np.dtype(np.float32).itemsize == 4
+ assert np.dtype(np.int16).itemsize == 2
+
+ with open(filename, 'rb') as f:
+
+ # header
+ version = np.frombuffer(f.read(2), dtype=np.int16).item()
+ header_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ num_packets = np.frombuffer(f.read(2), dtype=np.int16).item()
+ packet_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
+ num_features = np.frombuffer(f.read(2), dtype=np.int16).item()
+
+ dummy_features = np.zeros((subframes_per_packet, num_features), dtype=np.float32)
+
+ # packets
+ rates = []
+ packets = []
+ for i in range(num_packets):
+
+ rate = np.frombuffer(f.read(2), dtype=np.int16).item
+ rates.append(rate)
+
+ features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
+ packet = np.flip(features, axis=-2)
+ packets.append(packet)
+
+ return packets \ No newline at end of file
diff --git a/dnn/torch/rdovae/rdovae/__init__.py b/dnn/torch/rdovae/rdovae/__init__.py
new file mode 100644
index 00000000..b945adde
--- /dev/null
+++ b/dnn/torch/rdovae/rdovae/__init__.py
@@ -0,0 +1,2 @@
+from .rdovae import RDOVAE, distortion_loss, hard_rate_estimate, soft_rate_estimate
+from .dataset import RDOVAEDataset
diff --git a/dnn/torch/rdovae/rdovae/dataset.py b/dnn/torch/rdovae/rdovae/dataset.py
new file mode 100644
index 00000000..99630d8b
--- /dev/null
+++ b/dnn/torch/rdovae/rdovae/dataset.py
@@ -0,0 +1,68 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import torch
+import numpy as np
+
+class RDOVAEDataset(torch.utils.data.Dataset):
+    # Dataset of fixed-length LPCNet feature sequences for RDOVAE training.
+    # Each item carries a per-sequence quantizer id and the matching
+    # rate lambda, log-spaced between lambda_min and lambda_max.
+    def __init__(self,
+                 feature_file,
+                 sequence_length,
+                 num_used_features=20,
+                 num_features=36,
+                 lambda_min=0.0002,
+                 lambda_max=0.0135,
+                 quant_levels=16,
+                 enc_stride=2):
+        # feature_file: raw .f32 file holding num_features floats per frame
+        # sequence_length: frames per item; must be a multiple of enc_stride
+
+        self.sequence_length = sequence_length
+        self.lambda_min = lambda_min
+        self.lambda_max = lambda_max
+        self.enc_stride = enc_stride
+        self.quant_levels = quant_levels
+        # maps ids [0, quant_levels-1] onto lambdas so that id quant_levels-1
+        # yields exactly lambda_max (lambda = lambda_min * exp(id / denominator))
+        self.denominator = (quant_levels - 1) / np.log(lambda_max / lambda_min)
+
+        if sequence_length % enc_stride:
+            raise ValueError(f"RDOVAEDataset.__init__: enc_stride {enc_stride} does not divide sequence length {sequence_length}")
+
+        self.features = np.reshape(np.fromfile(feature_file, dtype=np.float32), (-1, num_features))
+        # only the first num_used_features columns are used; trailing frames
+        # that do not fill a whole sequence are dropped
+        self.features = self.features[:, :num_used_features]
+        self.num_sequences = self.features.shape[0] // sequence_length
+
+    def __len__(self):
+        return self.num_sequences
+
+    def __getitem__(self, index):
+        features = self.features[index * self.sequence_length: (index + 1) * self.sequence_length, :]
+        # one random quantizer id per sequence, repeated once per encoder step
+        q_ids = np.random.randint(0, self.quant_levels, (1)).astype(np.int64)
+        q_ids = np.repeat(q_ids, self.sequence_length // self.enc_stride, axis=0)
+        rate_lambda = self.lambda_min * np.exp(q_ids.astype(np.float32) / self.denominator).astype(np.float32)
+
+        return features, rate_lambda, q_ids
+
diff --git a/dnn/torch/rdovae/rdovae/rdovae.py b/dnn/torch/rdovae/rdovae/rdovae.py
new file mode 100644
index 00000000..b45d2b8c
--- /dev/null
+++ b/dnn/torch/rdovae/rdovae/rdovae.py
@@ -0,0 +1,614 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+""" Pytorch implementations of rate distortion optimized variational autoencoder """
+
+import math as m
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+# Quantization and rate related utily functions
+
+def soft_pvq(x, k):
+ """ soft pyramid vector quantizer """
+
+ # L2 normalization
+ x_norm2 = x / (1e-15 + torch.norm(x, dim=-1, keepdim=True))
+
+
+ with torch.no_grad():
+ # quantization loop, no need to track gradients here
+ x_norm1 = x / torch.sum(torch.abs(x), dim=-1, keepdim=True)
+
+ # set initial scaling factor to k
+ scale_factor = k
+ x_scaled = scale_factor * x_norm1
+ x_quant = torch.round(x_scaled)
+
+ # we aim for ||x_quant||_L1 = k
+ for _ in range(10):
+ # remove signs and calculate L1 norm
+ abs_x_quant = torch.abs(x_quant)
+ abs_x_scaled = torch.abs(x_scaled)
+ l1_x_quant = torch.sum(abs_x_quant, axis=-1)
+
+ # increase, where target is too small and decrease, where target is too large
+ plus = 1.0001 * torch.min((abs_x_quant + 0.5) / (abs_x_scaled + 1e-15), dim=-1).values
+ minus = 0.9999 * torch.max((abs_x_quant - 0.5) / (abs_x_scaled + 1e-15), dim=-1).values
+ factor = torch.where(l1_x_quant > k, minus, plus)
+ factor = torch.where(l1_x_quant == k, torch.ones_like(factor), factor)
+ scale_factor = scale_factor * factor.unsqueeze(-1)
+
+ # update x
+ x_scaled = scale_factor * x_norm1
+ x_quant = torch.round(x_quant)
+
+ # L2 normalization of quantized x
+ x_quant_norm2 = x_quant / (1e-15 + torch.norm(x_quant, dim=-1, keepdim=True))
+ quantization_error = x_quant_norm2 - x_norm2
+
+ return x_norm2 + quantization_error.detach()
+
+def cache_parameters(func):
+ cache = dict()
+ def cached_func(*args):
+ if args in cache:
+ return cache[args]
+ else:
+ cache[args] = func(*args)
+
+ return cache[args]
+ return cached_func
+
+@cache_parameters
+def pvq_codebook_size(n, k):
+
+ if k == 0:
+ return 1
+
+ if n == 0:
+ return 0
+
+ return pvq_codebook_size(n - 1, k) + pvq_codebook_size(n, k - 1) + pvq_codebook_size(n - 1, k - 1)
+
+
+def soft_rate_estimate(z, r, reduce=True):
+ """ rate approximation with dependent theta Eq. (7)"""
+
+ rate = torch.sum(
+ - torch.log2((1 - r)/(1 + r) * r ** torch.abs(z) + 1e-6),
+ dim=-1
+ )
+
+ if reduce:
+ rate = torch.mean(rate)
+
+ return rate
+
+
+def hard_rate_estimate(z, r, theta, reduce=True):
+    """ hard rate approximation """
+
+    # Rate of the rounded latent under the model's symbol distribution:
+    # p0 is the modeled probability mass of the zero symbol, and alpha
+    # selects smoothly between the zero bin and the non-zero tail
+    # (presumably the hard-rate counterpart of soft_rate_estimate — TODO
+    # confirm against the RDOVAE paper).
+    z_q = torch.round(z)
+    p0 = 1 - r ** (0.5 + 0.5 * theta)
+    alpha = torch.relu(1 - torch.abs(z_q)) ** 2
+    rate = - torch.sum(
+        (alpha * torch.log2(p0 * r ** torch.abs(z_q) + 1e-6)
+        + (1 - alpha) * torch.log2(0.5 * (1 - p0) * (1 - r) * r ** (torch.abs(z_q) - 1) + 1e-6)),
+        dim=-1
+    )
+
+    if reduce:
+        # average over all leading dimensions
+        rate = torch.mean(rate)
+
+    return rate
+
+
+
+def soft_dead_zone(x, dead_zone):
+ """ approximates application of a dead zone to x """
+ d = dead_zone * 0.05
+ return x - d * torch.tanh(x / (0.1 + d))
+
+
+def hard_quantize(x):
+ """ round with copy gradient trick """
+ return x + (torch.round(x) - x).detach()
+
+
+def noise_quantize(x):
+ """ simulates quantization with addition of random uniform noise """
+ return x + (torch.rand_like(x) - 0.5)
+
+
+# loss functions
+
+
+def distortion_loss(y_true, y_pred, rate_lambda=None):
+ """ custom distortion loss for LPCNet features """
+
+ if y_true.size(-1) != 20:
+ raise ValueError('distortion loss is designed to work with 20 features')
+
+ ceps_error = y_pred[..., :18] - y_true[..., :18]
+ pitch_error = 2 * (y_pred[..., 18:19] - y_true[..., 18:19]) / (2 + y_true[..., 18:19])
+ corr_error = y_pred[..., 19:] - y_true[..., 19:]
+ pitch_weight = torch.relu(y_true[..., 19:] + 0.5) ** 2
+
+ loss = torch.mean(ceps_error ** 2 + (10/18) * torch.abs(pitch_error) * pitch_weight + (1/18) * corr_error ** 2, dim=-1)
+
+ if type(rate_lambda) != type(None):
+ loss = loss / torch.sqrt(rate_lambda)
+
+ loss = torch.mean(loss)
+
+ return loss
+
+
+# sampling functions
+
+import random
+
+
+def random_split(start, stop, num_splits=3, min_len=3):
+ get_min_len = lambda x : min([x[i+1] - x[i] for i in range(len(x) - 1)])
+ candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]
+
+ while get_min_len(candidate) < min_len:
+ candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]
+
+ return candidate
+
+
+
+# weight initialization and clipping
+def init_weights(module):
+
+ if isinstance(module, nn.GRU):
+ for p in module.named_parameters():
+ if p[0].startswith('weight_hh_'):
+ nn.init.orthogonal_(p[1])
+
+
+def weight_clip_factory(max_value):
+    """ weight clipping function concerning sum of abs values of adjacent weights """
+    def clip_weight_(w):
+        # rescales pairs of adjacent columns in-place so that
+        # |w[:, 2i]| + |w[:, 2i+1]| <= max_value element-wise
+        stop = w.size(1)
+        # omit last column if stop is odd
+        if stop % 2:
+            stop -= 1
+        max_values = max_value * torch.ones_like(w[:, :stop])
+        factor = max_value / torch.maximum(max_values,
+                                  torch.repeat_interleave(
+                                      torch.abs(w[:, :stop:2]) + torch.abs(w[:, 1:stop:2]),
+                                      2,
+                                      1))
+        with torch.no_grad():
+            w[:, :stop] *= factor
+
+    def clip_weights(module):
+        # applied to GRU and Linear weight matrices only; biases are untouched
+        if isinstance(module, nn.GRU) or isinstance(module, nn.Linear):
+            for name, w in module.named_parameters():
+                if name.startswith('weight'):
+                    clip_weight_(w)
+
+    return clip_weights
+
+# RDOVAE module and submodules
+
+
+class CoreEncoder(nn.Module):
+    STATE_HIDDEN = 128        # hidden width of the initial-state head
+    FRAMES_PER_STEP = 2       # feature frames consumed per encoder step
+    CONV_KERNEL_SIZE = 4
+
+    def __init__(self, feature_dim, output_dim, cond_size, cond_size2, state_size=24):
+        """ core encoder for RDOVAE
+
+        Computes latents and initial decoder states from a feature sequence.
+        (NOTE(review): this docstring previously also mentioned rate
+        estimates and a lambda parameter, but forward() only consumes
+        features — confirm intended contract.)
+        """
+
+        super(CoreEncoder, self).__init__()
+
+        # hyper parameters
+        self.feature_dim = feature_dim
+        self.output_dim = output_dim
+        self.cond_size = cond_size
+        self.cond_size2 = cond_size2
+        self.state_size = state_size
+
+        # derived parameters
+        self.input_dim = self.FRAMES_PER_STEP * self.feature_dim
+        # 5 tanh/dense outputs of width cond_size2? no: 5 GRU/dense outputs of
+        # width cond_size plus 3 of width cond_size2 are concatenated below
+        self.conv_input_channels = 5 * cond_size + 3 * cond_size2
+
+        # layers
+        self.dense_1 = nn.Linear(self.input_dim, self.cond_size2)
+        self.gru_1 = nn.GRU(self.cond_size2, self.cond_size, batch_first=True)
+        self.dense_2 = nn.Linear(self.cond_size, self.cond_size2)
+        self.gru_2 = nn.GRU(self.cond_size2, self.cond_size, batch_first=True)
+        self.dense_3 = nn.Linear(self.cond_size, self.cond_size2)
+        self.gru_3 = nn.GRU(self.cond_size2, self.cond_size, batch_first=True)
+        self.dense_4 = nn.Linear(self.cond_size, self.cond_size)
+        self.dense_5 = nn.Linear(self.cond_size, self.cond_size)
+        self.conv1 = nn.Conv1d(self.conv_input_channels, self.output_dim, kernel_size=self.CONV_KERNEL_SIZE, padding='valid')
+
+        self.state_dense_1 = nn.Linear(self.conv_input_channels, self.STATE_HIDDEN)
+
+        self.state_dense_2 = nn.Linear(self.STATE_HIDDEN, self.state_size)
+
+        # initialize weights
+        self.apply(init_weights)
+
+
+    def forward(self, features):
+
+        # reshape features: fold FRAMES_PER_STEP consecutive frames into one step
+        x = torch.reshape(features, (features.size(0), features.size(1) // self.FRAMES_PER_STEP, self.FRAMES_PER_STEP * features.size(2)))
+
+        batch = x.size(0)
+        device = x.device
+
+        # run encoding layer stack (GRUs start from zero hidden state)
+        x1 = torch.tanh(self.dense_1(x))
+        x2, _ = self.gru_1(x1, torch.zeros((1, batch, self.cond_size)).to(device))
+        x3 = torch.tanh(self.dense_2(x2))
+        x4, _ = self.gru_2(x3, torch.zeros((1, batch, self.cond_size)).to(device))
+        x5 = torch.tanh(self.dense_3(x4))
+        x6, _ = self.gru_3(x5, torch.zeros((1, batch, self.cond_size)).to(device))
+        x7 = torch.tanh(self.dense_4(x6))
+        x8 = torch.tanh(self.dense_5(x7))
+
+        # concatenation of all hidden layer outputs
+        x9 = torch.cat((x1, x2, x3, x4, x5, x6, x7, x8), dim=-1)
+
+        # init state for decoder
+        states = torch.tanh(self.state_dense_1(x9))
+        states = torch.tanh(self.state_dense_2(states))
+
+        # latent representation via convolution; left-padding keeps the
+        # convolution causal and the output length equal to the input length
+        x9 = F.pad(x9.permute(0, 2, 1), [self.CONV_KERNEL_SIZE - 1, 0])
+        z = self.conv1(x9).permute(0, 2, 1)
+
+        return z, states
+
+
+
+
+class CoreDecoder(nn.Module):
+
+    # feature frames produced per decoder step (one step per latent vector)
+    FRAMES_PER_STEP = 4
+
+    def __init__(self, input_dim, output_dim, cond_size, cond_size2, state_size=24):
+        """ core decoder for RDOVAE
+
+        Computes features from latents, initial state, and quantization index
+
+        """
+
+        super(CoreDecoder, self).__init__()
+
+        # hyper parameters
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.cond_size = cond_size
+        self.cond_size2 = cond_size2
+        self.state_size = state_size
+
+        self.input_size = self.input_dim
+
+        # width of the concatenation of all eight hidden-layer outputs
+        self.concat_size = 4 * self.cond_size + 4 * self.cond_size2
+
+        # layers
+        self.dense_1 = nn.Linear(self.input_size, cond_size2)
+        self.gru_1 = nn.GRU(cond_size2, cond_size, batch_first=True)
+        self.dense_2 = nn.Linear(cond_size, cond_size2)
+        self.gru_2 = nn.GRU(cond_size2, cond_size, batch_first=True)
+        self.dense_3 = nn.Linear(cond_size, cond_size2)
+        self.gru_3 = nn.GRU(cond_size2, cond_size, batch_first=True)
+        self.dense_4 = nn.Linear(cond_size, cond_size2)
+        self.dense_5 = nn.Linear(cond_size2, cond_size2)
+
+        self.output = nn.Linear(self.concat_size, self.FRAMES_PER_STEP * self.output_dim)
+
+
+        # project the encoder-provided initial state into each GRU's hidden state
+        self.gru_1_init = nn.Linear(self.state_size, self.cond_size)
+        self.gru_2_init = nn.Linear(self.state_size, self.cond_size)
+        self.gru_3_init = nn.Linear(self.state_size, self.cond_size)
+
+        # initialize weights
+        self.apply(init_weights)
+
+    def forward(self, z, initial_state):
+
+        # derive per-GRU initial hidden states; permute to (num_layers, batch, hidden)
+        gru_1_state = torch.tanh(self.gru_1_init(initial_state).permute(1, 0, 2))
+        gru_2_state = torch.tanh(self.gru_2_init(initial_state).permute(1, 0, 2))
+        gru_3_state = torch.tanh(self.gru_3_init(initial_state).permute(1, 0, 2))
+
+        # run decoding layer stack
+        x1 = torch.tanh(self.dense_1(z))
+        x2, _ = self.gru_1(x1, gru_1_state)
+        x3 = torch.tanh(self.dense_2(x2))
+        x4, _ = self.gru_2(x3, gru_2_state)
+        x5 = torch.tanh(self.dense_3(x4))
+        x6, _ = self.gru_3(x5, gru_3_state)
+        x7 = torch.tanh(self.dense_4(x6))
+        x8 = torch.tanh(self.dense_5(x7))
+        x9 = torch.cat((x1, x2, x3, x4, x5, x6, x7, x8), dim=-1)
+
+        # output layer and reshaping: unfold FRAMES_PER_STEP frames per step
+        x10 = self.output(x9)
+        features = torch.reshape(x10, (x10.size(0), x10.size(1) * self.FRAMES_PER_STEP, x10.size(2) // self.FRAMES_PER_STEP))
+
+        return features
+
+
+class StatisticalModel(nn.Module):
+    def __init__(self, quant_levels, latent_dim):
+        """ Statistical model for latent space
+
+        Computes scaling, deadzone, r, and theta
+
+        """
+
+        super(StatisticalModel, self).__init__()
+
+        # copy parameters
+        self.latent_dim = latent_dim
+        self.quant_levels = quant_levels
+        # six parameter groups of size latent_dim each (see forward)
+        self.embedding_dim = 6 * latent_dim
+
+        # quantization embedding: one parameter vector per quantizer id
+        self.quant_embedding = nn.Embedding(quant_levels, self.embedding_dim)
+
+        # initialize embedding to 0 so softplus/sigmoid start at neutral values
+        with torch.no_grad():
+            self.quant_embedding.weight[:] = 0
+
+
+    def forward(self, quant_ids):
+        """ takes quant_ids and returns statistical model parameters"""
+
+        x = self.quant_embedding(quant_ids)
+
+        # CAVE: theta_soft is not used anymore. Kick it out?
+        # slices of the embedding: softplus keeps scale/dead-zone positive,
+        # sigmoid keeps r/theta in (0, 1)
+        quant_scale = F.softplus(x[..., 0 * self.latent_dim : 1 * self.latent_dim])
+        dead_zone = F.softplus(x[..., 1 * self.latent_dim : 2 * self.latent_dim])
+        theta_soft = torch.sigmoid(x[..., 2 * self.latent_dim : 3 * self.latent_dim])
+        r_soft = torch.sigmoid(x[..., 3 * self.latent_dim : 4 * self.latent_dim])
+        theta_hard = torch.sigmoid(x[..., 4 * self.latent_dim : 5 * self.latent_dim])
+        r_hard = torch.sigmoid(x[..., 5 * self.latent_dim : 6 * self.latent_dim])
+
+
+        return {
+            'quant_embedding' : x,
+            'quant_scale' : quant_scale,
+            'dead_zone' : dead_zone,
+            'r_hard' : r_hard,
+            'theta_hard' : theta_hard,
+            'r_soft' : r_soft,
+            'theta_soft' : theta_soft
+        }
+
+
+class RDOVAE(nn.Module):
+ def __init__(self,
+ feature_dim,
+ latent_dim,
+ quant_levels,
+ cond_size,
+ cond_size2,
+ state_dim=24,
+ split_mode='split',
+ clip_weights=True,
+ pvq_num_pulses=82,
+ state_dropout_rate=0):
+
+ super(RDOVAE, self).__init__()
+
+ self.feature_dim = feature_dim
+ self.latent_dim = latent_dim
+ self.quant_levels = quant_levels
+ self.cond_size = cond_size
+ self.cond_size2 = cond_size2
+ self.split_mode = split_mode
+ self.state_dim = state_dim
+ self.pvq_num_pulses = pvq_num_pulses
+ self.state_dropout_rate = state_dropout_rate
+
+ # submodules encoder and decoder share the statistical model
+ self.statistical_model = StatisticalModel(quant_levels, latent_dim)
+ self.core_encoder = nn.DataParallel(CoreEncoder(feature_dim, latent_dim, cond_size, cond_size2, state_size=state_dim))
+ self.core_decoder = nn.DataParallel(CoreDecoder(latent_dim, feature_dim, cond_size, cond_size2, state_size=state_dim))
+
+ self.enc_stride = CoreEncoder.FRAMES_PER_STEP
+ self.dec_stride = CoreDecoder.FRAMES_PER_STEP
+
+ if clip_weights:
+ self.weight_clip_fn = weight_clip_factory(0.496)
+ else:
+ self.weight_clip_fn = None
+
+ if self.dec_stride % self.enc_stride != 0:
+ raise ValueError(f"get_decoder_chunks_generic: encoder stride does not divide decoder stride")
+
+ def clip_weights(self):
+ if not type(self.weight_clip_fn) == type(None):
+ self.apply(self.weight_clip_fn)
+
+ def get_decoder_chunks(self, z_frames, mode='split', chunks_per_offset = 4):
+
+ enc_stride = self.enc_stride
+ dec_stride = self.dec_stride
+
+ stride = dec_stride // enc_stride
+
+ chunks = []
+
+ for offset in range(stride):
+ # start is the smalles number = offset mod stride that decodes to a valid range
+ start = offset
+ while enc_stride * (start + 1) - dec_stride < 0:
+ start += stride
+
+ # check if start is a valid index
+ if start >= z_frames:
+ raise ValueError("get_decoder_chunks_generic: range too small")
+
+ # stop is the smallest number outside [0, num_enc_frames] that's congruent to offset mod stride
+ stop = z_frames - (z_frames % stride) + offset
+ while stop < z_frames:
+ stop += stride
+
+ # calculate split points
+ length = (stop - start)
+ if mode == 'split':
+ split_points = [start + stride * int(i * length / chunks_per_offset / stride) for i in range(chunks_per_offset)] + [stop]
+ elif mode == 'random_split':
+ split_points = [stride * x + start for x in random_split(0, (stop - start)//stride - 1, chunks_per_offset - 1, 1)]
+ else:
+ raise ValueError(f"get_decoder_chunks_generic: unknown mode {mode}")
+
+
+ for i in range(chunks_per_offset):
+ # (enc_frame_start, enc_frame_stop, enc_frame_stride, stride, feature_frame_start, feature_frame_stop)
+ # encoder range(i, j, stride) maps to feature range(enc_stride * (i + 1) - dec_stride, enc_stride * j)
+ # provided that i - j = 1 mod stride
+ chunks.append({
+ 'z_start' : split_points[i],
+ 'z_stop' : split_points[i + 1] - stride + 1,
+ 'z_stride' : stride,
+ 'features_start' : enc_stride * (split_points[i] + 1) - dec_stride,
+ 'features_stop' : enc_stride * (split_points[i + 1] - stride + 1)
+ })
+
+ return chunks
+
+
+ def forward(self, features, q_id):
+
+ # calculate statistical model from quantization ID
+ statistical_model = self.statistical_model(q_id)
+
+ # run encoder
+ z, states = self.core_encoder(features)
+
+ # scaling, dead-zone and quantization
+ z = z * statistical_model['quant_scale']
+ z = soft_dead_zone(z, statistical_model['dead_zone'])
+
+ # quantization
+ z_q = hard_quantize(z) / statistical_model['quant_scale']
+ z_n = noise_quantize(z) / statistical_model['quant_scale']
+ states_q = soft_pvq(states, self.pvq_num_pulses)
+
+ if self.state_dropout_rate > 0:
+ drop = torch.rand(states_q.size(0)) < self.state_dropout_rate
+ mask = torch.ones_like(states_q)
+ mask[drop] = 0
+ states_q = states_q * mask
+
+ # decoder
+ chunks = self.get_decoder_chunks(z.size(1), mode=self.split_mode)
+
+ outputs_hq = []
+ outputs_sq = []
+ for chunk in chunks:
+ # decoder with hard quantized input
+ z_dec_reverse = torch.flip(z_q[..., chunk['z_start'] : chunk['z_stop'] : chunk['z_stride'], :], [1])
+ dec_initial_state = states_q[..., chunk['z_stop'] - 1 : chunk['z_stop'], :]
+ features_reverse = self.core_decoder(z_dec_reverse, dec_initial_state)
+ outputs_hq.append((torch.flip(features_reverse, [1]), chunk['features_start'], chunk['features_stop']))
+
+
+ # decoder with soft quantized input
+ z_dec_reverse = torch.flip(z_n[..., chunk['z_start'] : chunk['z_stop'] : chunk['z_stride'], :], [1])
+ features_reverse = self.core_decoder(z_dec_reverse, dec_initial_state)
+ outputs_sq.append((torch.flip(features_reverse, [1]), chunk['features_start'], chunk['features_stop']))
+
+ return {
+ 'outputs_hard_quant' : outputs_hq,
+ 'outputs_soft_quant' : outputs_sq,
+ 'z' : z,
+ 'statistical_model' : statistical_model
+ }
+
+ def encode(self, features):
+ """ encoder with quantization and rate estimation """
+
+ z, states = self.core_encoder(features)
+
+ # quantization of initial states
+ states = soft_pvq(states, self.pvq_num_pulses)
+ state_size = m.log2(pvq_codebook_size(self.state_dim, self.pvq_num_pulses))
+
+ return z, states, state_size
+
+ def decode(self, z, initial_state):
+ """ decoder (flips sequences by itself) """
+
+ z_reverse = torch.flip(z, [1])
+ features_reverse = self.core_decoder(z_reverse, initial_state)
+ features = torch.flip(features_reverse, [1])
+
+ return features
+
+ def quantize(self, z, q_ids):
+ """ quantization of latent vectors """
+
+ stats = self.statistical_model(q_ids)
+
+ zq = z * stats['quant_scale']
+ zq = soft_dead_zone(zq, stats['dead_zone'])
+ zq = torch.round(zq)
+
+ sizes = hard_rate_estimate(zq, stats['r_hard'], stats['theta_hard'], reduce=False)
+
+ return zq, sizes
+
+ def unquantize(self, zq, q_ids):
+ """ re-scaling of latent vector """
+
+ stats = self.statistical_model(q_ids)
+
+ z = zq / stats['quant_scale']
+
+ return z
+
+ def freeze_model(self):
+
+ # freeze all parameters
+ for p in self.parameters():
+ p.requires_grad = False
+
+ for p in self.statistical_model.parameters():
+ p.requires_grad = True
+
diff --git a/dnn/torch/rdovae/requirements.txt b/dnn/torch/rdovae/requirements.txt
new file mode 100644
index 00000000..8afdcda3
--- /dev/null
+++ b/dnn/torch/rdovae/requirements.txt
@@ -0,0 +1,5 @@
+numpy
+scipy
+torch
+tqdm
+libs/wexchange-1.2-py3-none-any.whl \ No newline at end of file
diff --git a/dnn/torch/rdovae/train_rdovae.py b/dnn/torch/rdovae/train_rdovae.py
new file mode 100644
index 00000000..68ccf2eb
--- /dev/null
+++ b/dnn/torch/rdovae/train_rdovae.py
@@ -0,0 +1,270 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import argparse
+
+import torch
+import tqdm
+
+from rdovae import RDOVAE, RDOVAEDataset, distortion_loss, hard_rate_estimate, soft_rate_estimate
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('features', type=str, help='path to feature file in .f32 format')
+parser.add_argument('output', type=str, help='path to output folder')
+
+parser.add_argument('--cuda-visible-devices', type=str, help="comma separated list of cuda visible device indices, default: ''", default="")
+
+
+model_group = parser.add_argument_group(title="model parameters")
+model_group.add_argument('--latent-dim', type=int, help="number of symbols produced by encoder, default: 80", default=80)
+model_group.add_argument('--cond-size', type=int, help="first conditioning size, default: 256", default=256)
+model_group.add_argument('--cond-size2', type=int, help="second conditioning size, default: 256", default=256)
+model_group.add_argument('--state-dim', type=int, help="dimensionality of transferred state, default: 24", default=24)
+model_group.add_argument('--quant-levels', type=int, help="number of quantization levels, default: 16", default=16)
+model_group.add_argument('--lambda-min', type=float, help="minimal value for rate lambda, default: 0.0002", default=2e-4)
+model_group.add_argument('--lambda-max', type=float, help="maximal value for rate lambda, default: 0.0104", default=0.0104)
+model_group.add_argument('--pvq-num-pulses', type=int, help="number of pulses for PVQ, default: 82", default=82)
+model_group.add_argument('--state-dropout-rate', type=float, help="state dropout rate, default: 0", default=0.0)
+
+training_group = parser.add_argument_group(title="training parameters")
+training_group.add_argument('--batch-size', type=int, help="batch size, default: 32", default=32)
+training_group.add_argument('--lr', type=float, help='learning rate, default: 3e-4', default=3e-4)
+training_group.add_argument('--epochs', type=int, help='number of training epochs, default: 100', default=100)
+training_group.add_argument('--sequence-length', type=int, help='sequence length, needs to be divisible by 4, default: 256', default=256)
+training_group.add_argument('--lr-decay-factor', type=float, help='learning rate decay factor, default: 2.5e-5', default=2.5e-5)
+training_group.add_argument('--split-mode', type=str, choices=['split', 'random_split'], help='splitting mode for decoder input, default: split', default='split')
+training_group.add_argument('--enable-first-frame-loss', action='store_true', default=False, help='enables dedicated distortion loss on first 4 decoder frames')
+training_group.add_argument('--initial-checkpoint', type=str, help='initial checkpoint to start training from, default: None', default=None)
+training_group.add_argument('--train-decoder-only', action='store_true', help='freeze encoder and statistical model and train decoder only')
+
+args = parser.parse_args()
+
+# set visible devices
+os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_visible_devices
+
+# checkpoints
+checkpoint_dir = os.path.join(args.output, 'checkpoints')
+checkpoint = dict()
+os.makedirs(checkpoint_dir, exist_ok=True)
+
+# training parameters
+batch_size = args.batch_size
+lr = args.lr
+epochs = args.epochs
+sequence_length = args.sequence_length
+lr_decay_factor = args.lr_decay_factor
+split_mode = args.split_mode
+# not exposed
+adam_betas = [0.9, 0.99]
+adam_eps = 1e-8
+
+checkpoint['batch_size'] = batch_size
+checkpoint['lr'] = lr
+checkpoint['lr_decay_factor'] = lr_decay_factor
+checkpoint['split_mode'] = split_mode
+checkpoint['epochs'] = epochs
+checkpoint['sequence_length'] = sequence_length
+checkpoint['adam_betas'] = adam_betas
+
+# logging
+log_interval = 10
+
+# device
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+# model parameters
+cond_size = args.cond_size
+cond_size2 = args.cond_size2
+latent_dim = args.latent_dim
+quant_levels = args.quant_levels
+lambda_min = args.lambda_min
+lambda_max = args.lambda_max
+state_dim = args.state_dim
+# not exposed
+num_features = 20
+
+
+# training data
+feature_file = args.features
+
+# model
+checkpoint['model_args'] = (num_features, latent_dim, quant_levels, cond_size, cond_size2)
+checkpoint['model_kwargs'] = {'state_dim': state_dim, 'split_mode' : split_mode, 'pvq_num_pulses': args.pvq_num_pulses, 'state_dropout_rate': args.state_dropout_rate}
+model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+
+if type(args.initial_checkpoint) != type(None):
+ checkpoint = torch.load(args.initial_checkpoint, map_location='cpu')
+ model.load_state_dict(checkpoint['state_dict'], strict=False)
+
+checkpoint['state_dict'] = model.state_dict()
+
+if args.train_decoder_only:
+ if args.initial_checkpoint is None:
+ print("warning: training decoder only without providing initial checkpoint")
+
+ for p in model.core_encoder.module.parameters():
+ p.requires_grad = False
+
+ for p in model.statistical_model.parameters():
+ p.requires_grad = False
+
+# dataloader
+checkpoint['dataset_args'] = (feature_file, sequence_length, num_features, 36)
+checkpoint['dataset_kwargs'] = {'lambda_min': lambda_min, 'lambda_max': lambda_max, 'enc_stride': model.enc_stride, 'quant_levels': quant_levels}
+dataset = RDOVAEDataset(*checkpoint['dataset_args'], **checkpoint['dataset_kwargs'])
+dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)
+
+
+
+# optimizer
+params = [p for p in model.parameters() if p.requires_grad]
+optimizer = torch.optim.Adam(params, lr=lr, betas=adam_betas, eps=adam_eps)
+
+
+# learning rate scheduler
+scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lambda x : 1 / (1 + lr_decay_factor * x))
+
+if __name__ == '__main__':
+
+ # push model to device
+ model.to(device)
+
+ # training loop
+
+ for epoch in range(1, epochs + 1):
+
+ print(f"training epoch {epoch}...")
+
+ # running stats
+ running_rate_loss = 0
+ running_soft_dist_loss = 0
+ running_hard_dist_loss = 0
+ running_hard_rate_loss = 0
+ running_soft_rate_loss = 0
+ running_total_loss = 0
+ running_rate_metric = 0
+ previous_total_loss = 0
+ running_first_frame_loss = 0
+
+ with tqdm.tqdm(dataloader, unit='batch') as tepoch:
+ for i, (features, rate_lambda, q_ids) in enumerate(tepoch):
+
+ # zero out gradients
+ optimizer.zero_grad()
+
+ # push inputs to device
+ features = features.to(device)
+ q_ids = q_ids.to(device)
+ rate_lambda = rate_lambda.to(device)
+
+
+ rate_lambda_upsamp = torch.repeat_interleave(rate_lambda, 2, 1)
+
+ # run model
+ model_output = model(features, q_ids)
+
+ # collect outputs
+ z = model_output['z']
+ outputs_hard_quant = model_output['outputs_hard_quant']
+ outputs_soft_quant = model_output['outputs_soft_quant']
+ statistical_model = model_output['statistical_model']
+
+ # rate loss
+ hard_rate = hard_rate_estimate(z, statistical_model['r_hard'], statistical_model['theta_hard'], reduce=False)
+ soft_rate = soft_rate_estimate(z, statistical_model['r_soft'], reduce=False)
+ soft_rate_loss = torch.mean(torch.sqrt(rate_lambda) * soft_rate)
+ hard_rate_loss = torch.mean(torch.sqrt(rate_lambda) * hard_rate)
+ rate_loss = (soft_rate_loss + 0.1 * hard_rate_loss)
+ hard_rate_metric = torch.mean(hard_rate)
+
+ ## distortion losses
+
+ # hard quantized decoder input
+ distortion_loss_hard_quant = torch.zeros_like(rate_loss)
+ for dec_features, start, stop in outputs_hard_quant:
+ distortion_loss_hard_quant += distortion_loss(features[..., start : stop, :], dec_features, rate_lambda_upsamp[..., start : stop]) / len(outputs_hard_quant)
+
+ first_frame_loss = torch.zeros_like(rate_loss)
+ for dec_features, start, stop in outputs_hard_quant:
+ first_frame_loss += distortion_loss(features[..., stop-4 : stop, :], dec_features[..., -4:, :], rate_lambda_upsamp[..., stop - 4 : stop]) / len(outputs_hard_quant)
+
+ # soft quantized decoder input
+ distortion_loss_soft_quant = torch.zeros_like(rate_loss)
+ for dec_features, start, stop in outputs_soft_quant:
+ distortion_loss_soft_quant += distortion_loss(features[..., start : stop, :], dec_features, rate_lambda_upsamp[..., start : stop]) / len(outputs_soft_quant)
+
+ # total loss
+ total_loss = rate_loss + (distortion_loss_hard_quant + distortion_loss_soft_quant) / 2
+
+ if args.enable_first_frame_loss:
+ total_loss = total_loss + 0.5 * torch.relu(first_frame_loss - distortion_loss_hard_quant)
+
+
+ total_loss.backward()
+
+ optimizer.step()
+
+ model.clip_weights()
+
+ scheduler.step()
+
+ # collect running stats
+ running_hard_dist_loss += float(distortion_loss_hard_quant.detach().cpu())
+ running_soft_dist_loss += float(distortion_loss_soft_quant.detach().cpu())
+ running_rate_loss += float(rate_loss.detach().cpu())
+ running_rate_metric += float(hard_rate_metric.detach().cpu())
+ running_total_loss += float(total_loss.detach().cpu())
+ running_first_frame_loss += float(first_frame_loss.detach().cpu())
+ running_soft_rate_loss += float(soft_rate_loss.detach().cpu())
+ running_hard_rate_loss += float(hard_rate_loss.detach().cpu())
+
+ if (i + 1) % log_interval == 0:
+ current_loss = (running_total_loss - previous_total_loss) / log_interval
+ tepoch.set_postfix(
+ current_loss=current_loss,
+ total_loss=running_total_loss / (i + 1),
+ dist_hq=running_hard_dist_loss / (i + 1),
+ dist_sq=running_soft_dist_loss / (i + 1),
+ rate_loss=running_rate_loss / (i + 1),
+ rate=running_rate_metric / (i + 1),
+ ffloss=running_first_frame_loss / (i + 1),
+ rateloss_hard=running_hard_rate_loss / (i + 1),
+ rateloss_soft=running_soft_rate_loss / (i + 1)
+ )
+ previous_total_loss = running_total_loss
+
+ # save checkpoint
+ checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
+ checkpoint['state_dict'] = model.state_dict()
+ checkpoint['loss'] = running_total_loss / len(dataloader)
+ checkpoint['epoch'] = epoch
+ torch.save(checkpoint, checkpoint_path)
diff --git a/dnn/training_tf2/dataloader.py b/dnn/training_tf2/dataloader.py
new file mode 100644
index 00000000..ed441c1e
--- /dev/null
+++ b/dnn/training_tf2/dataloader.py
@@ -0,0 +1,49 @@
+import numpy as np
+from tensorflow.keras.utils import Sequence
+from ulaw import lin2ulaw
+
+def lpc2rc(lpc):
+ #print("shape is = ", lpc.shape)
+ order = lpc.shape[-1]
+ rc = 0*lpc
+ for i in range(order, 0, -1):
+ rc[:,:,i-1] = lpc[:,:,-1]
+ ki = rc[:,:,i-1:i].repeat(i-1, axis=2)
+ lpc = (lpc[:,:,:-1] - ki*lpc[:,:,-2::-1])/(1-ki*ki)
+ return rc
+
+class LPCNetLoader(Sequence):
+ def __init__(self, data, features, periods, batch_size, e2e=False, lookahead=2):
+ self.batch_size = batch_size
+ self.nb_batches = np.minimum(np.minimum(data.shape[0], features.shape[0]), periods.shape[0])//self.batch_size
+ self.data = data[:self.nb_batches*self.batch_size, :]
+ self.features = features[:self.nb_batches*self.batch_size, :]
+ self.periods = periods[:self.nb_batches*self.batch_size, :]
+ self.e2e = e2e
+ self.lookahead = lookahead
+ self.on_epoch_end()
+
+ def on_epoch_end(self):
+ self.indices = np.arange(self.nb_batches*self.batch_size)
+ np.random.shuffle(self.indices)
+
+ def __getitem__(self, index):
+ data = self.data[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :]
+ in_data = data[: , :, :1]
+ out_data = data[: , :, 1:]
+ features = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :-16]
+ periods = self.periods[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :]
+ outputs = [out_data]
+ inputs = [in_data, features, periods]
+ if self.lookahead > 0:
+ lpc = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], 4-self.lookahead:-self.lookahead, -16:]
+ else:
+ lpc = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], 4:, -16:]
+ if self.e2e:
+ outputs.append(lpc2rc(lpc))
+ else:
+ inputs.append(lpc)
+ return (inputs, outputs)
+
+ def __len__(self):
+ return self.nb_batches
diff --git a/dnn/training_tf2/decode_rdovae.py b/dnn/training_tf2/decode_rdovae.py
new file mode 100644
index 00000000..f9bf9bf6
--- /dev/null
+++ b/dnn/training_tf2/decode_rdovae.py
@@ -0,0 +1,111 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
+import argparse
+#from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Train a PLC model')
+
+parser.add_argument('bits', metavar='<bits file>', help='binary features file (int16)')
+parser.add_argument('output', metavar='<output>', help='output features')
+parser.add_argument('--model', metavar='<model>', default='rdovae', help='PLC model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--weights', metavar='<input weights>', help='model weights')
+parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=1, type=int, help='batch size to use (default 1)')
+parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')
+
+
+args = parser.parse_args()
+
+import importlib
+rdovae = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+from rdovae import pvq_quantize
+from rdovae import apply_dead_zone
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+model, encoder, decoder, qembedding = rdovae.new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=batch_size, cond_size=args.cond_size)
+model.load_weights(args.weights)
+
+lpc_order = 16
+nbits=80
+
+
+bits_file = args.bits
+sequence_size = args.seq_length
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+
+bits = np.memmap(bits_file + "-syms.f32", dtype='float32', mode='r')
+nb_sequences = len(bits)//(40*sequence_size)//batch_size*batch_size
+bits = bits[:nb_sequences*sequence_size*40]
+
+bits = np.reshape(bits, (nb_sequences, sequence_size//2, 20*4))
+print(bits.shape)
+
+lambda_val = 0.001 * np.ones((nb_sequences, sequence_size//2, 1))
+quant_id = np.round(3.8*np.log(lambda_val/.0002)).astype('int16')
+quant_id = quant_id[:,:,0]
+quant_embed = qembedding(quant_id)
+quant_scale = tf.math.softplus(quant_embed[:,:,:nbits])
+dead_zone = tf.math.softplus(quant_embed[:, :, nbits : 2 * nbits])
+
+bits = bits*quant_scale
+bits = np.round(apply_dead_zone([bits, dead_zone]).numpy())
+bits = bits/quant_scale
+
+
+state = np.memmap(bits_file + "-state.f32", dtype='float32', mode='r')
+
+state = np.reshape(state, (nb_sequences, sequence_size//2, 24))
+state = state[:,-1,:]
+state = pvq_quantize(state, 82)
+#state = state/(1e-15+tf.norm(state, axis=-1,keepdims=True))
+
+print("shapes are:")
+print(bits.shape)
+print(state.shape)
+
+bits = bits[:,1::2,:]
+features = decoder.predict([bits, state], batch_size=batch_size)
+
+features.astype('float32').tofile(args.output)
diff --git a/dnn/training_tf2/diffembed.py b/dnn/training_tf2/diffembed.py
new file mode 100644
index 00000000..64f098e2
--- /dev/null
+++ b/dnn/training_tf2/diffembed.py
@@ -0,0 +1,49 @@
+"""
+Modification of Tensorflow's Embedding Layer:
+ 1. Not restricted to be the first layer of a model
+ 2. Differentiable (allows non-integer lookups)
+    - For non integer lookup, this layer linearly interpolates between the adjacent embeddings in the following way to preserve gradient flow
+ - E = (1 - frac(x))*embed(floor(x)) + frac(x)*embed(ceil(x))
+"""
+
+import tensorflow as tf
+from tensorflow.keras.layers import Layer
+
+class diff_Embed(Layer):
+ """
+ Parameters:
+ - units: int
+ Dimension of the Embedding
+ - dict_size: int
+ Number of Embeddings to lookup
+ - pcm_init: boolean
+ Initialized for the embedding matrix
+ """
+ def __init__(self, units=128, dict_size = 256, pcm_init = True, initializer = None, **kwargs):
+ super(diff_Embed, self).__init__(**kwargs)
+ self.units = units
+ self.dict_size = dict_size
+ self.pcm_init = pcm_init
+ self.initializer = initializer
+
+ def build(self, input_shape):
+ w_init = tf.random_normal_initializer()
+ if self.pcm_init:
+ w_init = self.initializer
+ self.w = tf.Variable(initial_value=w_init(shape=(self.dict_size, self.units),dtype='float32'),trainable=True)
+
+ def call(self, inputs):
+ alpha = inputs - tf.math.floor(inputs)
+ alpha = tf.expand_dims(alpha,axis = -1)
+ alpha = tf.tile(alpha,[1,1,1,self.units])
+ inputs = tf.cast(inputs,'int32')
+ M = (1 - alpha)*tf.gather(self.w,inputs) + alpha*tf.gather(self.w,tf.clip_by_value(inputs + 1, 0, 255))
+ return M
+
+ def get_config(self):
+ config = super(diff_Embed, self).get_config()
+ config.update({"units": self.units})
+ config.update({"dict_size" : self.dict_size})
+ config.update({"pcm_init" : self.pcm_init})
+ config.update({"initializer" : self.initializer})
+ return config \ No newline at end of file
diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py
new file mode 100755
index 00000000..ea63ee8b
--- /dev/null
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -0,0 +1,388 @@
+#!/usr/bin/python3
+'''Copyright (c) 2017-2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import os
+import io
+import lpcnet
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import Layer, GRU, Dense, Conv1D, Embedding
+from ulaw import ulaw2lin, lin2ulaw
+from mdense import MDense
+from diffembed import diff_Embed
+from parameters import get_parameter
+import h5py
+import re
+import argparse
+
+
+# no cuda devices needed
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+# Flag for dumping e2e (differentiable lpc) network weights
+flag_e2e = False
+
+
+max_rnn_neurons = 1
+max_conv_inputs = 1
+max_mdense_tmp = 1
+
+def printVector(f, vector, name, dtype='float', dotp=False):
+ global array_list
+ if dotp:
+ vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+ vector = vector.transpose((2, 0, 3, 1))
+ v = np.reshape(vector, (-1));
+ #print('static const float ', name, '[', len(v), '] = \n', file=f)
+ if name not in array_list:
+ array_list.append(name)
+ f.write('#ifndef USE_WEIGHTS_FILE\n')
+ f.write('#define WEIGHTS_{}_DEFINED\n'.format(name))
+ f.write('#define WEIGHTS_{}_TYPE WEIGHT_TYPE_{}\n'.format(name, dtype))
+ f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
+ for i in range(0, len(v)):
+ f.write('{}'.format(v[i]))
+ if (i!=len(v)-1):
+ f.write(',')
+ else:
+ break;
+ if (i%8==7):
+ f.write("\n ")
+ else:
+ f.write(" ")
+ #print(v, file=f)
+ f.write('\n};\n')
+ f.write('#endif\n\n')
+ return;
+
+def printSparseVector(f, A, name, have_diag=True):
+ N = A.shape[0]
+ M = A.shape[1]
+ W = np.zeros((0,), dtype='int')
+ W0 = np.zeros((0,))
+ if have_diag:
+ diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
+ A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
+ A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
+ A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
+ printVector(f, diag, name + '_diag')
+ AQ = np.minimum(127, np.maximum(-128, np.round(A*128))).astype('int')
+ idx = np.zeros((0,), dtype='int')
+ for i in range(M//8):
+ pos = idx.shape[0]
+ idx = np.append(idx, -1)
+ nb_nonzero = 0
+ for j in range(N//4):
+ block = A[j*4:(j+1)*4, i*8:(i+1)*8]
+ qblock = AQ[j*4:(j+1)*4, i*8:(i+1)*8]
+ if np.sum(np.abs(block)) > 1e-10:
+ nb_nonzero = nb_nonzero + 1
+ idx = np.append(idx, j*4)
+ vblock = qblock.transpose((1,0)).reshape((-1,))
+ W0 = np.concatenate([W0, block.reshape((-1,))])
+ W = np.concatenate([W, vblock])
+ idx[pos] = nb_nonzero
+ f.write('#ifdef DOT_PROD\n')
+ printVector(f, W, name, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, W0, name, dtype='qweight')
+ f.write('#endif /*DOT_PROD*/\n')
+ #idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
+ printVector(f, idx, name + '_idx', dtype='int')
+ return AQ
+
+def dump_layer_ignore(self, f, hf):
+ print("ignoring layer " + self.name + " of type " + self.__class__.__name__)
+ return False
+Layer.dump_layer = dump_layer_ignore
+
+def dump_sparse_gru(self, f, hf):
+ global max_rnn_neurons
+ name = 'sparse_' + self.name
+ print("printing layer " + name + " of type sparse " + self.__class__.__name__)
+ weights = self.get_weights()
+ qweights = printSparseVector(f, weights[1], name + '_recurrent_weights')
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[1,:] = subias[1,:] - np.sum(qweights*(1./128),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = max(max_rnn_neurons, neurons)
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ model_struct.write(' SparseGRULayer {};\n'.format(name));
+ model_init.write(' if (sparse_gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_recurrent_weights_diag", "{}_recurrent_weights", "{}_recurrent_weights_idx", {}, ACTIVATION_{}, {})) return 1;\n'
+ .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ return True
+
+def dump_grub(self, f, hf, gru_a_size):
+ global max_rnn_neurons
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ qweight = printSparseVector(f, weights[0][:gru_a_size, :], name + '_weights', have_diag=False)
+
+ f.write('#ifdef DOT_PROD\n')
+ qweight2 = np.clip(np.round(128.*weights[1]).astype('int'), -128, 127)
+ printVector(f, qweight2, name + '_recurrent_weights', dotp=True, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, weights[1], name + '_recurrent_weights')
+ f.write('#endif /*DOT_PROD*/\n')
+
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[0,:] = subias[0,:] - np.sum(qweight*(1./128.),axis=0)
+ subias[1,:] = subias[1,:] - np.sum(qweight2*(1./128.),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = max(max_rnn_neurons, neurons)
+ model_struct.write(' GRULayer {};\n'.format(name));
+ model_init.write(' if (gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_weights", "{}_weights_idx", "{}_recurrent_weights", {}, {}, ACTIVATION_{}, {})) return 1;\n'
+ .format(name, name, name, name, name, name, gru_a_size, weights[0].shape[1]//3, activation, reset_after))
+ return True
+
+def dump_gru_layer_dummy(self, f, hf):
+ name = self.name
+ weights = self.get_weights()
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ return True;
+
+GRU.dump_layer = dump_gru_layer_dummy
+
+def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
+ printVector(f, weights, name + '_weights')
+ printVector(f, bias, name + '_bias')
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
+ model_struct.write(' DenseLayer {};\n'.format(name));
+ model_init.write(' if (dense_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, ACTIVATION_{})) return 1;\n'
+ .format(name, name, name, weights.shape[0], weights.shape[1], activation))
+
+def dump_dense_layer(self, f, hf):
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ activation = self.activation.__name__.upper()
+ dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
+ return False
+
+Dense.dump_layer = dump_dense_layer
+
+def dump_mdense_layer(self, f, hf):
+ global max_mdense_tmp
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ printVector(f, np.transpose(weights[0], (0, 2, 1)), name + '_weights')
+ printVector(f, np.transpose(weights[1], (1, 0)), name + '_bias')
+ printVector(f, np.transpose(weights[2], (1, 0)), name + '_factor')
+ activation = self.activation.__name__.upper()
+ max_mdense_tmp = max(max_mdense_tmp, weights[0].shape[0]*weights[0].shape[2])
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[0]))
+ model_struct.write(' MDenseLayer {};\n'.format(name));
+ model_init.write(' if (mdense_init(&model->{}, arrays, "{}_bias", "{}_weights", "{}_factor", {}, {}, {}, ACTIVATION_{})) return 1;\n'
+ .format(name, name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
+ return False
+MDense.dump_layer = dump_mdense_layer
+
+def dump_conv1d_layer(self, f, hf):
+ global max_conv_inputs
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ printVector(f, weights[0], name + '_weights')
+ printVector(f, weights[-1], name + '_bias')
+ activation = self.activation.__name__.upper()
+ max_conv_inputs = max(max_conv_inputs, weights[0].shape[1]*weights[0].shape[0])
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
+ hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
+ hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
+ model_struct.write(' Conv1DLayer {};\n'.format(name));
+ model_init.write(' if (conv1d_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, {}, ACTIVATION_{})) return 1;\n'
+ .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
+ return True
+Conv1D.dump_layer = dump_conv1d_layer
+
+
+def dump_embedding_layer_impl(name, weights, f, hf):
+ printVector(f, weights, name + '_weights')
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
+ model_struct.write(' EmbeddingLayer {};\n'.format(name));
+ model_init.write(' if (embedding_init(&model->{}, arrays, "{}_weights", {}, {})) return 1;\n'
+ .format(name, name, weights.shape[0], weights.shape[1]))
+
+def dump_embedding_layer(self, f, hf):
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()[0]
+ dump_embedding_layer_impl(name, weights, f, hf)
+ return False
+Embedding.dump_layer = dump_embedding_layer
+diff_Embed.dump_layer = dump_embedding_layer
+
+if __name__ == "__main__":
+ # Command-line driver: convert a Keras LPCNet .h5 checkpoint into generated
+ # C source/header files for the C inference code.
+ parser = argparse.ArgumentParser()
+ parser.add_argument('model_file', type=str, help='model weight h5 file')
+ parser.add_argument('--nnet-header', type=str, help='name of c header file for dumped model', default='nnet_data.h')
+ parser.add_argument('--nnet-source', type=str, help='name of c source file for dumped model', default='nnet_data.c')
+ parser.add_argument('--lpc-gamma', type=float, help='LPC weighting factor. If not specified I will attempt to read it from the model file with 1 as default', default=None)
+ parser.add_argument('--lookahead', type=float, help='Features lookahead. If not specified I will attempt to read it from the model file with 2 as default', default=None)
+
+ args = parser.parse_args()
+
+ filename = args.model_file
+ # Infer the layer sizes from the stored weight shapes so the rebuilt Keras
+ # model matches the checkpoint exactly.
+ with h5py.File(filename, "r") as f:
+ units = min(f['model_weights']['gru_a']['gru_a']['recurrent_kernel:0'].shape)
+ units2 = min(f['model_weights']['gru_b']['gru_b']['recurrent_kernel:0'].shape)
+ cond_size = min(f['model_weights']['feature_dense1']['feature_dense1']['kernel:0'].shape)
+ e2e = 'rc2lpc' in f['model_weights']
+
+ model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=units, rnn_units2=units2, flag_e2e = e2e, cond_size=cond_size)
+ model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+ #model.summary()
+
+ model.load_weights(filename, by_name=True)
+
+ cfile = args.nnet_source
+ hfile = args.nnet_header
+
+ f = open(cfile, 'w')
+ hf = open(hfile, 'w')
+ # Struct/init text is buffered so every layer can append its member before
+ # the final text is emitted at the end.
+ model_struct = io.StringIO()
+ model_init = io.StringIO()
+ model_struct.write('typedef struct {\n')
+ model_init.write('#ifndef DUMP_BINARY_WEIGHTS\n')
+ model_init.write('int init_lpcnet_model(LPCNetModel *model, const WeightArray *arrays) {\n')
+ array_list = []
+
+ f.write('/*This file is automatically generated from a Keras model*/\n')
+ f.write('/*based on model {}*/\n\n'.format(sys.argv[1]))
+ f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
+
+ hf.write('/*This file is automatically generated from a Keras model*/\n\n')
+ hf.write('#ifndef RNN_DATA_H\n#define RNN_DATA_H\n\n#include "nnet.h"\n\n')
+
+ if e2e:
+ hf.write('/* This is an end-to-end model */\n')
+ hf.write('#define END2END\n\n')
+ else:
+ hf.write('/* This is *not* an end-to-end model */\n')
+ hf.write('/* #define END2END */\n\n')
+
+ # LPC weighting factor
+ if type(args.lpc_gamma) == type(None):
+ lpc_gamma = get_parameter(model, 'lpc_gamma', 1)
+ else:
+ lpc_gamma = args.lpc_gamma
+
+ hf.write('/* LPC weighting factor */\n')
+ hf.write('#define LPC_GAMMA ' + str(lpc_gamma) +'f\n\n')
+
+ # look-ahead
+ if type(args.lookahead) == type(None):
+ lookahead = get_parameter(model, 'lookahead', 2)
+ else:
+ lookahead = args.lookahead
+
+ hf.write('/* Features look-ahead */\n')
+ hf.write('#define FEATURES_DELAY ' + str(lookahead) +'\n\n')
+
+ embed_size = lpcnet.embed_size
+
+ # Pre-multiply the signal embedding into each of gru_a's per-gate input
+ # weight blocks (np.dot(E, W)) so C inference can use table lookups
+ # instead of a matrix product.
+ E = model.get_layer('embed_sig').get_weights()[0]
+ W = model.get_layer('gru_a').get_weights()[0][:embed_size,:]
+ dump_embedding_layer_impl('gru_a_embed_sig', np.dot(E, W), f, hf)
+ W = model.get_layer('gru_a').get_weights()[0][embed_size:2*embed_size,:]
+ dump_embedding_layer_impl('gru_a_embed_pred', np.dot(E, W), f, hf)
+ W = model.get_layer('gru_a').get_weights()[0][2*embed_size:3*embed_size,:]
+ dump_embedding_layer_impl('gru_a_embed_exc', np.dot(E, W), f, hf)
+ W = model.get_layer('gru_a').get_weights()[0][3*embed_size:,:]
+ #FIXME: dump only half the biases
+ b = model.get_layer('gru_a').get_weights()[2]
+ dump_dense_layer_impl('gru_a_dense_feature', W, b[:len(b)//2], 'LINEAR', f, hf)
+
+ W = model.get_layer('gru_b').get_weights()[0][model.rnn_units1:,:]
+ b = model.get_layer('gru_b').get_weights()[2]
+ # Set biases to zero because they'll be included in the GRU input part
+ # (we need regular and SU biases)
+ dump_dense_layer_impl('gru_b_dense_feature', W, 0*b[:len(b)//2], 'LINEAR', f, hf)
+ dump_grub(model.get_layer('gru_b'), f, hf, model.rnn_units1)
+
+ # Layers whose dump_layer returns True carry state; collect their names
+ # for the NNetState struct emitted below.
+ layer_list = []
+ for i, layer in enumerate(model.layers):
+ if layer.dump_layer(f, hf):
+ layer_list.append(layer.name)
+
+ dump_sparse_gru(model.get_layer('gru_a'), f, hf)
+
+ # Table of all dumped arrays, used when loading weights from a blob.
+ f.write('#ifndef USE_WEIGHTS_FILE\n')
+ f.write('const WeightArray lpcnet_arrays[] = {\n')
+ for name in array_list:
+ f.write('#ifdef WEIGHTS_{}_DEFINED\n'.format(name))
+ f.write(' {{"{}", WEIGHTS_{}_TYPE, sizeof({}), {}}},\n'.format(name, name, name, name))
+ f.write('#endif\n')
+ f.write(' {NULL, 0, 0, NULL}\n};\n')
+ f.write('#endif\n')
+
+ model_init.write(' return 0;\n}\n')
+ model_init.write('#endif\n')
+ f.write(model_init.getvalue())
+
+ # Maxima gathered while dumping size the C scratch buffers.
+ hf.write('#define MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
+ hf.write('#define MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
+ hf.write('#define MAX_MDENSE_TMP {}\n\n'.format(max_mdense_tmp))
+
+
+ hf.write('typedef struct {\n')
+ for i, name in enumerate(layer_list):
+ hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
+ hf.write('} NNetState;\n\n')
+
+ model_struct.write('} LPCNetModel;\n\n')
+ hf.write(model_struct.getvalue())
+ hf.write('int init_lpcnet_model(LPCNetModel *model, const WeightArray *arrays);\n\n')
+ hf.write('\n\n#endif\n')
+
+ f.close()
+ hf.close()
diff --git a/dnn/training_tf2/dump_plc.py b/dnn/training_tf2/dump_plc.py
new file mode 100755
index 00000000..8bd8cfb1
--- /dev/null
+++ b/dnn/training_tf2/dump_plc.py
@@ -0,0 +1,296 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2017-2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import lpcnet_plc
+import io
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import Layer, GRU, Dense, Conv1D, Embedding
+import h5py
+import re
+
+# Flag for dumping e2e (differentiable lpc) network weights
+flag_e2e = False
+
+# Running maxima collected while dumping layers; emitted as C #defines that
+# size the inference scratch buffers.
+max_rnn_neurons = 1
+max_conv_inputs = 1
+
+def printVector(f, vector, name, dtype='float', dotp=False):
+ # Emit `vector` into source stream f as a C array named `name`, wrapped in
+ # USE_WEIGHTS_FILE guards so the data can be dropped when weights come from
+ # an external blob. dotp=True reorders the matrix into the 4x8 block layout
+ # used by the DOT_PROD int8 kernels.
+ global array_list
+ if dotp:
+ vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+ vector = vector.transpose((2, 0, 3, 1))
+ v = np.reshape(vector, (-1));
+ #print('static const float ', name, '[', len(v), '] = \n', file=f)
+ if name not in array_list:
+ array_list.append(name)
+ f.write('#ifndef USE_WEIGHTS_FILE\n')
+ f.write('#define WEIGHTS_{}_DEFINED\n'.format(name))
+ f.write('#define WEIGHTS_{}_TYPE WEIGHT_TYPE_{}\n'.format(name, dtype))
+ f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
+ # Comma-separated values, eight per line; the break after the last element
+ # deliberately skips both the comma and the trailing spacing.
+ for i in range(0, len(v)):
+ f.write('{}'.format(v[i]))
+ if (i!=len(v)-1):
+ f.write(',')
+ else:
+ break;
+ if (i%8==7):
+ f.write("\n ")
+ else:
+ f.write(" ")
+ #print(v, file=f)
+ f.write('\n};\n')
+ f.write('#endif\n\n')
+ return;
+
+def printSparseVector(f, A, name, have_diag=True):
+ # Emit matrix A in block-sparse form: optional per-gate diagonals, the
+ # non-zero 4x8 blocks (int8 under DOT_PROD, float otherwise) and an index
+ # list describing which blocks were kept. Returns the int8-quantized
+ # matrix so callers can correct biases for quantization offsets.
+ N = A.shape[0]
+ M = A.shape[1]
+ W = np.zeros((0,), dtype='int')
+ W0 = np.zeros((0,))
+ if have_diag:
+ diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
+ A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
+ A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
+ A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
+ # Note: the three per-gate diagonals are stripped from A in place, so
+ # the caller's array is modified.
+ printVector(f, diag, name + '_diag')
+ # Quantize to int8 (Q7), saturating at [-128, 127].
+ AQ = np.minimum(127, np.maximum(-128, np.round(A*128))).astype('int')
+ idx = np.zeros((0,), dtype='int')
+ for i in range(M//8):
+ pos = idx.shape[0]
+ idx = np.append(idx, -1)
+ nb_nonzero = 0
+ for j in range(N//4):
+ block = A[j*4:(j+1)*4, i*8:(i+1)*8]
+ qblock = AQ[j*4:(j+1)*4, i*8:(i+1)*8]
+ if np.sum(np.abs(block)) > 1e-10:
+ nb_nonzero = nb_nonzero + 1
+ idx = np.append(idx, j*4)
+ vblock = qblock.transpose((1,0)).reshape((-1,))
+ W0 = np.concatenate([W0, block.reshape((-1,))])
+ W = np.concatenate([W, vblock])
+ idx[pos] = nb_nonzero
+ f.write('#ifdef DOT_PROD\n')
+ printVector(f, W, name, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ # NOTE(review): the float fallback is also tagged dtype='qweight' --
+ # presumably intentional (macro naming only), but worth confirming.
+ printVector(f, W0, name, dtype='qweight')
+ f.write('#endif /*DOT_PROD*/\n')
+ #idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
+ printVector(f, idx, name + '_idx', dtype='int')
+ return AQ
+
def dump_layer_ignore(self, f, hf):
    """Fallback dump hook: log and skip layers with no C counterpart.

    Returns False so the caller leaves this layer out of the generated
    state struct.
    """
    print("ignoring layer {} of type {}".format(self.name, self.__class__.__name__))
    return False
+Layer.dump_layer = dump_layer_ignore
+
+def dump_sparse_gru(self, f, hf):
+ # Dump a GRU's recurrent matrix (weights[1]) in block-sparse form, plus
+ # its bias and an SU-bias corrected for int8 quantization offsets.
+ global max_rnn_neurons
+ name = 'sparse_' + self.name
+ print("printing layer " + name + " of type sparse " + self.__class__.__name__)
+ weights = self.get_weights()
+ qweights = printSparseVector(f, weights[1], name + '_recurrent_weights')
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ # Row 1 of the bias is adjusted by the column sums of the quantized
+ # recurrent weights (undoes the DC offset of int8 rounding).
+ subias[1,:] = subias[1,:] - np.sum(qweights*(1./128),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ # Kernel columns hold the 3 gates, hence //3 for the unit count.
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = max(max_rnn_neurons, neurons)
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ model_struct.write(' SparseGRULayer {};\n'.format(name));
+ model_init.write(' if (sparse_gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_recurrent_weights_diag", "{}_recurrent_weights", "{}_recurrent_weights_idx", {}, ACTIVATION_{}, {})) return 1;\n'
+ .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ return True
+
+def dump_gru_layer(self, f, hf):
+ # Dump a GRU layer: block-sparse int8 input weights, dense recurrent
+ # weights (int8 under DOT_PROD), bias, and an SU-bias corrected for the
+ # int8 rounding offsets of both weight matrices.
+ global max_rnn_neurons
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ qweight = printSparseVector(f, weights[0], name + '_weights', have_diag=False)
+
+ f.write('#ifdef DOT_PROD\n')
+ qweight2 = np.clip(np.round(128.*weights[1]).astype('int'), -128, 127)
+ printVector(f, qweight2, name + '_recurrent_weights', dotp=True, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, weights[1], name + '_recurrent_weights')
+ f.write('#endif /*DOT_PROD*/\n')
+
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ # Row 0 is adjusted for the quantized input weights, row 1 for the
+ # quantized recurrent weights.
+ subias[0,:] = subias[0,:] - np.sum(qweight*(1./128.),axis=0)
+ subias[1,:] = subias[1,:] - np.sum(qweight2*(1./128.),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ # Kernel columns hold the 3 gates, hence //3 for the unit count.
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = max(max_rnn_neurons, neurons)
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ model_struct.write(' GRULayer {};\n'.format(name));
+ model_init.write(' if (gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_weights", "{}_weights_idx", "{}_recurrent_weights", {}, {}, ACTIVATION_{}, {})) return 1;\n'
+ .format(name, name, name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
+ return True
+GRU.dump_layer = dump_gru_layer
+
def dump_gru_layer_dummy(self, f, hf):
    """Emit only the OUT/STATE size #defines for a GRU layer.

    Alternative dump hook that skips the weight arrays entirely; still
    returns True so the layer contributes state to the generated struct.
    """
    layer_name = self.name
    n_units = self.get_weights()[0].shape[1] // 3
    for macro in ('OUT_SIZE', 'STATE_SIZE'):
        hf.write('#define {}_{} {}\n'.format(layer_name.upper(), macro, n_units))
    return True

#GRU.dump_layer = dump_gru_layer_dummy
+
+def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
+ # Emit a dense layer's weight/bias arrays and register it in the model
+ # struct / init code (module-level StringIO buffers).
+ printVector(f, weights, name + '_weights')
+ printVector(f, bias, name + '_bias')
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
+ model_struct.write(' DenseLayer {};\n'.format(name));
+ model_init.write(' if (dense_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, ACTIVATION_{})) return 1;\n'
+ .format(name, name, name, weights.shape[0], weights.shape[1], activation))
+
+def dump_dense_layer(self, f, hf):
+ # Keras Dense -> C dump; returns False (dense layers carry no state).
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ activation = self.activation.__name__.upper()
+ dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
+ return False
+
+Dense.dump_layer = dump_dense_layer
+
+def dump_conv1d_layer(self, f, hf):
+ # Keras Conv1D -> C dump; also tracks the largest kernel*channels product
+ # so the C code can size its convolution input buffer.
+ global max_conv_inputs
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ printVector(f, weights[0], name + '_weights')
+ printVector(f, weights[-1], name + '_bias')
+ activation = self.activation.__name__.upper()
+ max_conv_inputs = max(max_conv_inputs, weights[0].shape[1]*weights[0].shape[0])
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
+ # State keeps (kernel_size-1) input frames of history; DELAY is
+ # (kernel_size-1)//2 -- presumably the look-ahead of a centered filter,
+ # confirm against the C conv1d implementation.
+ hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
+ hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
+ model_struct.write(' Conv1DLayer {};\n'.format(name));
+ model_init.write(' if (conv1d_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, {}, ACTIVATION_{})) return 1;\n'
+ .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
+ return True
+Conv1D.dump_layer = dump_conv1d_layer
+
+
+
+filename = sys.argv[1]
+# Infer layer sizes from the stored weight shapes so the rebuilt Keras model
+# matches the checkpoint exactly.
+with h5py.File(filename, "r") as f:
+ units = min(f['model_weights']['plc_gru1']['plc_gru1']['recurrent_kernel:0'].shape)
+ units2 = min(f['model_weights']['plc_gru2']['plc_gru2']['recurrent_kernel:0'].shape)
+ cond_size = f['model_weights']['plc_dense1']['plc_dense1']['kernel:0'].shape[1]
+
+# NOTE(review): units2 is read above but never passed to the model builder --
+# confirm whether the second GRU's size should be forwarded.
+model = lpcnet_plc.new_lpcnet_plc_model(rnn_units=units, cond_size=cond_size)
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+#model.summary()
+
+model.load_weights(filename, by_name=True)
+
# Optional CLI overrides for the generated file names; the C source and the
# header name must both be supplied together.  The original test used
# `len(sys.argv) > 2` while still reading sys.argv[3], which raised
# IndexError when only one name was given; require both instead.
if len(sys.argv) > 3:
    cfile = sys.argv[2]
    hfile = sys.argv[3]
else:
    cfile = 'plc_data.c'
    hfile = 'plc_data.h'
+
+
+f = open(cfile, 'w')
+hf = open(hfile, 'w')
+# Struct/init text is buffered so every layer can append its member before
+# the final text is emitted at the end.
+model_struct = io.StringIO()
+model_init = io.StringIO()
+model_struct.write('typedef struct {\n')
+model_init.write('#ifndef DUMP_BINARY_WEIGHTS\n')
+model_init.write('int init_plc_model(PLCModel *model, const WeightArray *arrays) {\n')
+array_list = []
+
+
+f.write('/*This file is automatically generated from a Keras model*/\n')
+f.write('/*based on model {}*/\n\n'.format(sys.argv[1]))
+f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
+
+hf.write('/*This file is automatically generated from a Keras model*/\n\n')
+hf.write('#ifndef PLC_DATA_H\n#define PLC_DATA_H\n\n#include "nnet.h"\n\n')
+
+# Layers whose dump_layer returns True carry state; collect their names for
+# the PLCNetState struct emitted below.
+layer_list = []
+for i, layer in enumerate(model.layers):
+ if layer.dump_layer(f, hf):
+ layer_list.append(layer.name)
+
+#dump_sparse_gru(model.get_layer('gru_a'), f, hf)
+# Table of all dumped arrays, used when loading weights from a blob.
+f.write('#ifndef USE_WEIGHTS_FILE\n')
+f.write('const WeightArray lpcnet_plc_arrays[] = {\n')
+for name in array_list:
+ f.write('#ifdef WEIGHTS_{}_DEFINED\n'.format(name))
+ f.write(' {{"{}", WEIGHTS_{}_TYPE, sizeof({}), {}}},\n'.format(name, name, name, name))
+ f.write('#endif\n')
+f.write(' {NULL, 0, 0, NULL}\n};\n')
+f.write('#endif\n')
+
+model_init.write(' return 0;\n}\n')
+model_init.write('#endif\n')
+f.write(model_init.getvalue())
+
+
+hf.write('#define PLC_MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
+#hf.write('#define PLC_MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
+
+hf.write('typedef struct {\n')
+for i, name in enumerate(layer_list):
+ hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
+hf.write('} PLCNetState;\n\n')
+
+model_struct.write('} PLCModel;\n\n')
+hf.write(model_struct.getvalue())
+hf.write('int init_plc_model(PLCModel *model, const WeightArray *arrays);\n\n')
+
+hf.write('\n\n#endif\n')
+
+f.close()
+hf.close()
diff --git a/dnn/training_tf2/dump_rdovae.py b/dnn/training_tf2/dump_rdovae.py
new file mode 100644
index 00000000..778a6899
--- /dev/null
+++ b/dnn/training_tf2/dump_rdovae.py
@@ -0,0 +1,306 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
+import argparse
+# NOTE(review): this ftplib import looks like an accidental editor
+# auto-import; nothing in this file appears to use parse150 -- confirm and drop.
+from ftplib import parse150
+import os
+
+# Hide GPUs from TensorFlow; dumping weights needs no accelerator.
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('weights', metavar="<weight file>", type=str, help='model weight file in hdf5 format')
+parser.add_argument('--cond-size', type=int, help="conditioning size (default: 256)", default=256)
+parser.add_argument('--latent-dim', type=int, help="dimension of latent space (default: 80)", default=80)
+parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 16)", default=16)
+
+args = parser.parse_args()
+
+# now import the heavy stuff
+import tensorflow as tf
+import numpy as np
+from keraslayerdump import dump_conv1d_layer, dump_dense_layer, dump_gru_layer, printVector
+from rdovae import new_rdovae_model
+
def start_header(header_fid, header_name):
    """Open a generated C header: write the include guard for header_name."""
    # Guard macro = basename minus the trailing ".h", upper-cased, plus "_H"
    # (e.g. foo_data.h -> FOO_DATA_H).
    guard = os.path.basename(header_name)[:-2].upper() + "_H"
    header_fid.write("\n#ifndef {0}\n#define {0}\n\n".format(guard))
+
def finish_header(header_fid):
    """Close the include guard opened by start_header."""
    header_fid.write("\n#endif\n\n")
+
def start_source(source_fid, header_name, weight_file):
    """Open a generated C source: banner comment plus config/header includes."""
    source_fid.write(
        '\n/* this source file was automatically generated from weight file {} */\n\n'
        '#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n'
        '#include "{}"\n\n'.format(weight_file, header_name)
    )
+
def finish_source(source_fid):
    """Generated C sources need no trailer; kept for symmetry with finish_header."""
    return None
+
+
+def dump_statistical_model(qembedding, f, fh):
+ # Convert the quantization-embedding weights into fixed-point tables for
+ # the C entropy coder: per-level quant scales (Q8), dead zones (Q10) and
+ # the rate parameters r / p0 (Q15).
+ w = qembedding.weights[0].numpy()
+ levels, dim = w.shape
+ N = dim // 6
+
+ print("dumping statistical model")
+ quant_scales = tf.math.softplus(w[:, : N]).numpy()
+ dead_zone = 0.05 * tf.math.softplus(w[:, N : 2 * N]).numpy()
+ r = tf.math.sigmoid(w[:, 5 * N : 6 * N]).numpy()
+ p0 = tf.math.sigmoid(w[:, 4 * N : 5 * N]).numpy()
+ p0 = 1 - r ** (0.5 + 0.5 * p0)
+ # NOTE(review): embedding chunks [2N:4N] are unused here -- presumably
+ # consumed elsewhere in training; confirm.
+
+ quant_scales_q8 = np.round(quant_scales * 2**8).astype(np.uint16)
+ dead_zone_q10 = np.round(dead_zone * 2**10).astype(np.uint16)
+ r_q15 = np.round(r * 2**15).astype(np.uint16)
+ p0_q15 = np.round(p0 * 2**15).astype(np.uint16)
+
+ printVector(f, quant_scales_q8, 'dred_quant_scales_q8', dtype='opus_uint16', static=False)
+ printVector(f, dead_zone_q10, 'dred_dead_zone_q10', dtype='opus_uint16', static=False)
+ printVector(f, r_q15, 'dred_r_q15', dtype='opus_uint16', static=False)
+ printVector(f, p0_q15, 'dred_p0_q15', dtype='opus_uint16', static=False)
+
+ fh.write(
+f"""
+extern const opus_uint16 dred_quant_scales_q8[{levels * N}];
+extern const opus_uint16 dred_dead_zone_q10[{levels * N}];
+extern const opus_uint16 dred_r_q15[{levels * N}];
+extern const opus_uint16 dred_p0_q15[{levels * N}];
+
+"""
+ )
+
+if __name__ == "__main__":
+
+ # Rebuild the RDO-VAE with CLI-provided sizes and load the checkpoint.
+ model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)
+ model.load_weights(args.weights)
+
+
+
+
+ # encoder
+ # NOTE: the dump_* helpers imported from keraslayerdump return the layer's
+ # size metric, hence the max(...) reductions below.
+ encoder_dense_names = [
+ 'enc_dense1',
+ 'enc_dense3',
+ 'enc_dense5',
+ 'enc_dense7',
+ 'enc_dense8',
+ 'gdense1',
+ 'gdense2'
+ ]
+
+ encoder_gru_names = [
+ 'enc_dense2',
+ 'enc_dense4',
+ 'enc_dense6'
+ ]
+
+ encoder_conv1d_names = [
+ 'bits_dense'
+ ]
+
+ source_fid = open("dred_rdovae_enc_data.c", 'w')
+ header_fid = open("dred_rdovae_enc_data.h", 'w')
+
+ start_header(header_fid, "dred_rdovae_enc_data.h")
+ start_source(source_fid, "dred_rdovae_enc_data.h", os.path.basename(args.weights))
+
+ header_fid.write(
+f"""
+#include "dred_rdovae_constants.h"
+
+#include "nnet.h"
+"""
+ )
+
+ # dump GRUs
+ max_rnn_neurons_enc = max(
+ [
+ dump_gru_layer(encoder.get_layer(name), source_fid, header_fid, dotp=True, sparse=True)
+ for name in encoder_gru_names
+ ]
+ )
+
+ # dump conv layers
+ max_conv_inputs = max(
+ [
+ dump_conv1d_layer(encoder.get_layer(name), source_fid, header_fid)
+ for name in encoder_conv1d_names
+ ]
+ )
+
+ # dump Dense layers
+ for name in encoder_dense_names:
+ layer = encoder.get_layer(name)
+ dump_dense_layer(layer, source_fid, header_fid)
+
+ # some global constants
+ header_fid.write(
+f"""
+
+#define DRED_ENC_MAX_RNN_NEURONS {max_rnn_neurons_enc}
+
+#define DRED_ENC_MAX_CONV_INPUTS {max_conv_inputs}
+
+"""
+ )
+
+ finish_header(header_fid)
+ finish_source(source_fid)
+
+ header_fid.close()
+ source_fid.close()
+
+ # statistical model
+ # Fixed-point tables for the entropy coder (see dump_statistical_model).
+ source_fid = open("dred_rdovae_stats_data.c", 'w')
+ header_fid = open("dred_rdovae_stats_data.h", 'w')
+
+ start_header(header_fid, "dred_rdovae_stats_data.h")
+ start_source(source_fid, "dred_rdovae_stats_data.h", os.path.basename(args.weights))
+
+ header_fid.write(
+"""
+
+#include "opus_types.h"
+
+"""
+ )
+
+ dump_statistical_model(qembedding, source_fid, header_fid)
+
+ finish_header(header_fid)
+ finish_source(source_fid)
+
+ header_fid.close()
+ source_fid.close()
+
+ # decoder
+ # Same pattern as the encoder dump, minus the conv layer.
+ decoder_dense_names = [
+ 'state1',
+ 'state2',
+ 'state3',
+ 'dec_dense1',
+ 'dec_dense3',
+ 'dec_dense5',
+ 'dec_dense7',
+ 'dec_dense8',
+ 'dec_final'
+ ]
+
+ decoder_gru_names = [
+ 'dec_dense2',
+ 'dec_dense4',
+ 'dec_dense6'
+ ]
+
+ source_fid = open("dred_rdovae_dec_data.c", 'w')
+ header_fid = open("dred_rdovae_dec_data.h", 'w')
+
+ start_header(header_fid, "dred_rdovae_dec_data.h")
+ start_source(source_fid, "dred_rdovae_dec_data.h", os.path.basename(args.weights))
+
+ header_fid.write(
+f"""
+#include "dred_rdovae_constants.h"
+
+#include "nnet.h"
+"""
+ )
+
+
+ # dump GRUs
+ max_rnn_neurons_dec = max(
+ [
+ dump_gru_layer(decoder.get_layer(name), source_fid, header_fid, dotp=True, sparse=True)
+ for name in decoder_gru_names
+ ]
+ )
+
+ # dump Dense layers
+ for name in decoder_dense_names:
+ layer = decoder.get_layer(name)
+ dump_dense_layer(layer, source_fid, header_fid)
+
+ # some global constants
+ header_fid.write(
+f"""
+
+#define DRED_DEC_MAX_RNN_NEURONS {max_rnn_neurons_dec}
+
+"""
+ )
+
+ finish_header(header_fid)
+ finish_source(source_fid)
+
+ header_fid.close()
+ source_fid.close()
+
+ # common constants
+ # NOTE(review): DRED_STATE_DIM is hard-coded as {24}; confirm it matches
+ # the decoder's actual state dimension.
+ header_fid = open("dred_rdovae_constants.h", 'w')
+ start_header(header_fid, "dred_rdovae_constants.h")
+
+ header_fid.write(
+f"""
+#define DRED_NUM_FEATURES 20
+
+#define DRED_LATENT_DIM {args.latent_dim}
+
+#define DRED_STATE_DIM {24}
+
+#define DRED_NUM_QUANTIZATION_LEVELS {qembedding.weights[0].shape[0]}
+
+#define DRED_MAX_RNN_NEURONS {max(max_rnn_neurons_enc, max_rnn_neurons_dec)}
+
+#define DRED_MAX_CONV_INPUTS {max_conv_inputs}
+"""
+ )
+
+ finish_header(header_fid) \ No newline at end of file
diff --git a/dnn/training_tf2/encode_rdovae.py b/dnn/training_tf2/encode_rdovae.py
new file mode 100644
index 00000000..f144fc48
--- /dev/null
+++ b/dnn/training_tf2/encode_rdovae.py
@@ -0,0 +1,125 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
# Encode features with a trained RDO-VAE model, dumping the symbols,
# decoder state and reconstructed output as .f32 files.
# (The original header comment and parser description said "Train ..."
# -- copy-paste from the training script; this file only encodes.)

import argparse
#from plc_loader import PLCLoader

parser = argparse.ArgumentParser(description='Encode features with an RDO-VAE model')

parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
parser.add_argument('output', metavar='<output>', help='output prefix for the dumped .f32 files')
parser.add_argument('--model', metavar='<model>', default='rdovae', help='PLC model python definition (without .py)')
group1 = parser.add_mutually_exclusive_group()
group1.add_argument('--weights', metavar='<input weights>', help='model weights')
parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
parser.add_argument('--batch-size', metavar='<batch size>', default=1, type=int, help='batch size to use (default 1)')
parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')


args = parser.parse_args()
+
+import importlib
+rdovae = importlib.import_module(args.model)
+
+from rdovae import apply_dead_zone
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+from rdovae import pvq_quantize
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+model, encoder, decoder, qembedding = rdovae.new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=batch_size, cond_size=args.cond_size)
+model.load_weights(args.weights)
+
+lpc_order = 16
+
+feature_file = args.features
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+sequence_size = args.seq_length
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+
+# Memory-map the feature file and trim to a whole number of
+# (batch, sequence) blocks before reshaping.
+features = np.memmap(feature_file, dtype='float32', mode='r')
+nb_sequences = len(features)//(nb_features*sequence_size)//batch_size*batch_size
+features = features[:nb_sequences*sequence_size*nb_features]
+
+features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
+print(features.shape)
+features = features[:, :, :nb_used_features]
+#features = np.random.randn(73600, 1000, 17)
+
+
+bits, gru_state_dec = encoder.predict([features], batch_size=batch_size)
+(gru_state_dec).astype('float32').tofile(args.output + "-state.f32")
+
+
+#dist = rdovae.feat_dist_loss(features, quant_out)
+#rate = rdovae.sq1_rate_loss(features, model_bits)
+#rate2 = rdovae.sq_rate_metric(features, model_bits)
+#print(dist, rate, rate2)
+
+print("shapes are:")
+print(bits.shape)
+print(gru_state_dec.shape)
+
+features.astype('float32').tofile(args.output + "-input.f32")
+#quant_out.astype('float32').tofile(args.output + "-enc_dec.f32")
+nbits=80
+bits.astype('float32').tofile(args.output + "-syms.f32")
+
+# Fixed rate-distortion trade-off; quant_id selects the statistical-model
+# embedding row for that lambda.
+lambda_val = 0.0002 * np.ones((nb_sequences, sequence_size//2, 1))
+quant_id = np.round(3.8*np.log(lambda_val/.0002)).astype('int16')
+quant_id = quant_id[:,:,0]
+quant_embed = qembedding(quant_id)
+quant_scale = tf.math.softplus(quant_embed[:,:,:nbits])
+dead_zone = tf.math.softplus(quant_embed[:, :, nbits : 2 * nbits])
+
+# Scale, apply dead zone, round, then unscale: simulates quantization.
+bits = bits*quant_scale
+bits = np.round(apply_dead_zone([bits, dead_zone]).numpy())
+bits = bits/quant_scale
+
+gru_state_dec = pvq_quantize(gru_state_dec, 82)
+#gru_state_dec = gru_state_dec/(1e-15+tf.norm(gru_state_dec, axis=-1,keepdims=True))
+gru_state_dec = gru_state_dec[:,-1,:]
+# Decode from every second frame's symbols -- presumably matches the DRED
+# frame spacing; confirm against the C decoder.
+dec_out = decoder([bits[:,1::2,:], gru_state_dec])
+
+print(dec_out.shape)
+
+dec_out.numpy().astype('float32').tofile(args.output + "-quant_out.f32")
diff --git a/dnn/training_tf2/fec_encoder.py b/dnn/training_tf2/fec_encoder.py
new file mode 100644
index 00000000..95b7cc7a
--- /dev/null
+++ b/dnn/training_tf2/fec_encoder.py
@@ -0,0 +1,257 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe and Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+import os
+import subprocess
+import argparse
+
+
+import numpy as np
+from scipy.io import wavfile
+import tensorflow as tf
+
+from rdovae import new_rdovae_model, pvq_quantize, apply_dead_zone, sq_rate_metric
+from fec_packets import write_fec_packets, read_fec_packets
+
+
debug = False

# When debug is set, bypass argparse and hard-code the arguments (handy when
# running inside a notebook or debugger); otherwise parse the command line.
# NOTE(review): the debug namespace defines neither 'quant_levels', 'lossfile'
# nor 'debug_output', all of which are read later in this script -- running
# with debug=True will raise AttributeError there; confirm and backfill.
if debug:
    args = type('dummy', (object,),
    {
        'input' : 'item1.wav',
        'weights' : 'testout/rdovae_alignment_fix_1024_120.h5',
        'enc_lambda' : 0.0007,
        'output' : "test_0007.fec",
        'cond_size' : 1024,
        'num_redundancy_frames' : 64,
        'extra_delay' : 0,
        'dump_data' : './dump_data'
    })()
    # force CPU execution in debug runs
    os.environ['CUDA_VISIBLE_DEVICES']=""
else:
    parser = argparse.ArgumentParser(description='Encode redundancy for Opus neural FEC. Designed for use with voip application and 20ms frames')

    parser.add_argument('input', metavar='<input signal>', help='audio input (.wav or .raw or .pcm as int16)')
    parser.add_argument('weights', metavar='<weights>', help='trained model file (.h5)')
#    parser.add_argument('enc_lambda', metavar='<lambda>', type=float, help='lambda for controlling encoder rate')
    parser.add_argument('output', type=str, help='output file (will be extended with .fec)')

    parser.add_argument('--dump-data', type=str, default='./dump_data', help='path to dump data executable (default ./dump_data)')
    parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
    parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 40)", default=40)
    parser.add_argument('--num-redundancy-frames', default=64, type=int, help='number of redundancy frames (20ms) per packet (default 64)')
    parser.add_argument('--extra-delay', default=0, type=int, help="last features in packet are calculated with the decoder aligned samples, use this option to add extra delay (in samples at 16kHz)")
    parser.add_argument('--lossfile', type=str, help='file containing loss trace (0 for frame received, 1 for lost)')

    parser.add_argument('--debug-output', action='store_true', help='if set, differently assembled features are written to disk')

    args = parser.parse_args()
+
# Build the RDO-VAE model and restore the trained weights.
model, encoder, decoder, qembedding = new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=1, nb_quant=args.quant_levels, cond_size=args.cond_size)
model.load_weights(args.weights)

lpc_order = 16

## prepare input signal
# SILK frame size is 20ms and LPCNet subframes are 10ms
subframe_size = 160
frame_size = 2 * subframe_size

# 91 samples delay to align with SILK decoded frames
silk_delay = 91

# prepend zeros to have enough history to produce the first package
zero_history = (args.num_redundancy_frames - 1) * frame_size

# dump data has a (feature) delay of 10ms
dump_data_delay = 160

total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay

# load signal (16-bit PCM; raw input is assumed to be 16 kHz)
if args.input.endswith('.raw') or args.input.endswith('.pcm') or args.input.endswith('.sw'):
    signal = np.fromfile(args.input, dtype='int16')

elif args.input.endswith('.wav'):
    fs, signal = wavfile.read(args.input)
else:
    raise ValueError(f'unknown input signal format: {args.input}')

# fill up last frame with zeros
padded_signal_length = len(signal) + total_delay
tail = padded_signal_length % frame_size
right_padding = (frame_size - tail) % frame_size

signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))

padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
signal.tofile(padded_signal_file)

# write signal and call dump_data to create features
# NOTE(review): shell=True with interpolated paths -- safe only for trusted
# local file names; confirm inputs are not attacker-controlled.
feature_file = os.path.splitext(args.input)[0] + '_features.f32'
command = f"{args.dump_data} -test {padded_signal_file} {feature_file}"
r = subprocess.run(command, shell=True)
if r.returncode != 0:
    raise RuntimeError(f"command '{command}' failed with exit code {r.returncode}")

# feature vector = nb_used_features conditioning features + lpc_order LPCs
nb_features = model.nb_used_features + lpc_order
nb_used_features = model.nb_used_features

# load features; keep an even number of 10 ms subframes (2 per 20 ms frame)
features = np.fromfile(feature_file, dtype='float32')
num_subframes = len(features) // nb_features
num_subframes = 2 * (num_subframes // 2)
num_frames = num_subframes // 2

features = np.reshape(features, (1, -1, nb_features))
features = features[:, :, :nb_used_features]
features = features[:, :num_subframes, :]
+
#variable quantizer depending on the delay
# quant_id ramps from q1 (coarse, for the newest redundancy) down towards q0
# (finer, for the oldest); qembedding maps each id to quantizer parameters
q0 = 3
q1 = 15
quant_id = np.round(q1 + (q0-q1)*np.arange(args.num_redundancy_frames//2)/args.num_redundancy_frames).astype('int16')
#print(quant_id)

quant_embed = qembedding(quant_id)

# run encoder
print("running fec encoder...")
symbols, gru_state_dec = encoder.predict(features)

# apply quantization
# the embedding packs [scale | dead zone | ... | distribution] per symbol
nsymbols = 80
quant_scale = tf.math.softplus(quant_embed[:, :nsymbols]).numpy()
dead_zone = tf.math.softplus(quant_embed[:, nsymbols : 2 * nsymbols]).numpy()
#symbols = apply_dead_zone([symbols, dead_zone]).numpy()
#qsymbols = np.round(symbols)
# quantize the decoder GRU states on the unit hypersphere (PVQ, K=82)
quant_gru_state_dec = pvq_quantize(gru_state_dec, 82)

# rate estimate
hard_distr_embed = tf.math.sigmoid(quant_embed[:, 4 * nsymbols : ]).numpy()
#rate_input = np.concatenate((qsymbols, hard_distr_embed, enc_lambda), axis=-1)
#rates = sq_rate_metric(None, rate_input, reduce=False).numpy()

# run decoder
input_length = args.num_redundancy_frames // 2
offset = args.num_redundancy_frames - 1

packets = []
packet_sizes = []

# each row of sym_batch holds one packet's redundancy symbols: every second
# subframe over the last num_redundancy_frames frames ending at frame i
sym_batch = np.zeros((num_frames-offset, args.num_redundancy_frames//2, nsymbols), dtype='float32')
quant_state = quant_gru_state_dec[0, offset:num_frames, :]
#pack symbols for batch processing
for i in range(offset, num_frames):
    sym_batch[i-offset, :, :] = symbols[0, i - 2 * input_length + 2 : i + 1 : 2, :]

#quantize symbols: scale, apply dead zone, round to integer levels
sym_batch = sym_batch * quant_scale
sym_batch = apply_dead_zone([sym_batch, dead_zone]).numpy()
sym_batch = np.round(sym_batch)

# estimate the bitrate of the quantized symbols
hard_distr_embed = np.broadcast_to(hard_distr_embed, (sym_batch.shape[0], sym_batch.shape[1], 2*sym_batch.shape[2]))
fake_lambda = np.ones((sym_batch.shape[0], sym_batch.shape[1], 1), dtype='float32')
rate_input = np.concatenate((sym_batch, hard_distr_embed, fake_lambda), axis=-1)
rates = sq_rate_metric(None, rate_input, reduce=False).numpy()
#print(rates.shape)
print("average rate = ", np.mean(rates[args.num_redundancy_frames:,:]))

#sym_batch.tofile('qsyms.f32')

# undo the scaling and decode redundancy features from the quantized symbols
# and quantized decoder states
sym_batch = sym_batch / quant_scale
#print(sym_batch.shape, quant_state.shape)
#features = decoder.predict([sym_batch, quant_state])
features = decoder([sym_batch, quant_state])
+
# derive the output packet file name
packet_file = args.output + '.fec' if not args.output.endswith('.fec') else args.output
#write_fec_packets(packet_file, packets, packet_sizes)

# If a loss trace is given, emulate concealment: while frames are lost keep
# widening the FEC span; on the first received frame (or at the end of the
# signal) fill the gap from that frame's redundancy.
# fix: use getattr() so the hard-coded debug namespace (which defines neither
# 'lossfile' nor 'debug_output') does not raise AttributeError, and use the
# idiomatic 'is not None' instead of '!= None'.
if getattr(args, 'lossfile', None) is not None:
    loss = np.loadtxt(args.lossfile, dtype='int16')
    fec_out = np.zeros((features.shape[0]*2, features.shape[-1]), dtype='float32')
    foffset = -2   # how far back into the packet the fill must reach
    ptr = 0        # write position in fec_out (2 subframes per 20 ms frame)
    count = 2      # number of subframes to fill on the next received frame
    for i in range(features.shape[0]):
        if (loss[i] == 0) or (i == features.shape[0]-1):
            fec_out[ptr:ptr+count,:] = features[i, foffset:, :]
            foffset = -2
            ptr = ptr+count
            count = 2
        else:
            # frame lost: widen the gap by one frame (2 subframes)
            count = count + 2
            foffset = foffset - 2

    # pad back up to the full feature width expected by the synthesis tools
    fec_out_full = np.zeros((fec_out.shape[0], nb_features), dtype=np.float32)
    fec_out_full[:, :nb_used_features] = fec_out

    fec_out_full.tofile(packet_file[:-4] + '_fec.f32')


#create packets array like in the original version for debugging purposes
for i in range(offset, num_frames):
    packets.append(features[i-offset:i-offset+1, :, :])

if getattr(args, 'debug_output', False):
    import itertools

    batches = [4]
    offsets = [0, (args.num_redundancy_frames - 2)*2]
    # sanity check: concatenating slices of consecutive packets must
    # reconstruct a contiguous feature stream
    for batch, offset in itertools.product(batches, offsets):

        stop = packets[0].shape[1] - offset
        print(batch, offset, stop)
        test_features = np.concatenate([packet[:,stop - batch: stop, :] for packet in packets[::batch//2]], axis=1)

        test_features_full = np.zeros((test_features.shape[1], nb_features), dtype=np.float32)
        test_features_full[:, :nb_used_features] = test_features[0, :, :]

        print(f"writing debug output {packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32'}")
        test_features_full.tofile(packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32')
+
diff --git a/dnn/training_tf2/fec_packets.c b/dnn/training_tf2/fec_packets.c
new file mode 100644
index 00000000..376fb4f1
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.c
@@ -0,0 +1,142 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "fec_packets.h"
+
/* Read the features of one subframe (num_features floats, as stored in the
 * file header) from a .fec packet file into `features`.
 * Returns 0 on success, 1 on any failure.
 * NOTE(review): header fields are read in native byte order -- the writer
 * (fec_packets.py) uses the host order too; confirm for cross-platform use. */
int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index)
{

    int16_t version;
    int16_t header_size;
    int16_t num_packets;
    int16_t packet_size;
    int16_t subframe_size;
    int16_t subframes_per_packet;
    int16_t num_features;
    long offset;

    FILE *fid = fopen(filename, "rb");

    /* fix: the original dereferenced fid without checking that fopen()
     * succeeded, and the error path then called fclose(NULL) (UB). */
    if (fid == NULL)
    {
        fprintf(stderr, "get_fec_frame: could not open %s\n", filename);
        return 1;
    }

    /* read header (seven int16 fields, header_size bytes in total) */
    if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
    if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
    if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
    if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
    if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
    if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
    if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;

    /* check if indices are valid */
    if (packet_index >= num_packets || subframe_index >= subframes_per_packet)
    {
        fprintf(stderr, "get_fec_frame: index out of bounds\n");
        goto error;
    }

    /* calculate offset in file (+ 2 is for rate) */
    offset = header_size + packet_index * packet_size + 2 + subframe_index * subframe_size;
    if (fseek(fid, offset, SEEK_SET) != 0) goto error;  /* fix: check fseek */

    /* read features */
    if (fread(features, sizeof(*features), num_features, fid) != (size_t)num_features) goto error;

    fclose(fid);
    return 0;

error:
    fclose(fid);
    return 1;
}
+
/* Return the int16 rate value stored in front of one packet's features.
 * Returns the rate (>= 0) on success, -1 on any failure. */
int get_fec_rate(const char * const filename, int packet_index)
{
    int16_t version;
    int16_t header_size;
    int16_t num_packets;
    int16_t packet_size;
    int16_t subframe_size;
    int16_t subframes_per_packet;
    int16_t num_features;
    long offset;
    int16_t rate;

    FILE *fid = fopen(filename, "rb");

    /* fix: the original dereferenced fid without checking that fopen()
     * succeeded, and the error path then called fclose(NULL) (UB). */
    if (fid == NULL)
    {
        fprintf(stderr, "get_fec_rate: could not open %s\n", filename);
        return -1;
    }

    /* read header (seven int16 fields, header_size bytes in total) */
    if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
    if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
    if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
    if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
    if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
    if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
    if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;

    /* check if indices are valid */
    if (packet_index >= num_packets)
    {
        fprintf(stderr, "get_fec_rate: index out of bounds\n");
        goto error;
    }

    /* the rate is the first int16 of the packet record */
    offset = header_size + packet_index * packet_size;
    if (fseek(fid, offset, SEEK_SET) != 0) goto error;  /* fix: check fseek */

    /* read rate */
    if (fread(&rate, sizeof(rate), 1, fid) != 1) goto error;

    fclose(fid);
    return (int) rate;

error:
    fclose(fid);
    return -1;
}
+
#if 0
/* Disabled ad-hoc test driver: reads subframe 127 of packet 0 from a local
 * test file and prints the features plus the packet's stored rate.
 * NOTE(review): falls off the end without a return statement -- implicit
 * return 0 is fine for main() since C99, but make it explicit if revived. */
int main()
{
    float features[20];
    int i;

    if (get_fec_frame("../test.fec", &features[0], 0, 127))
    {
        return 1;
    }

    for (i = 0; i < 20; i ++)
    {
        printf("%d %f\n", i, features[i]);
    }

    printf("rate: %d\n", get_fec_rate("../test.fec", 0));

}
#endif
diff --git a/dnn/training_tf2/fec_packets.h b/dnn/training_tf2/fec_packets.h
new file mode 100644
index 00000000..35d35542
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
/* File I/O helpers for reading LPCNet FEC packet files (see fec_packets.c).
 * fix: the guard macro was _FEC_PACKETS_H -- identifiers starting with an
 * underscore followed by an uppercase letter are reserved for the
 * implementation (C11 7.1.3); also add the missing newline at end of file. */
#ifndef FEC_PACKETS_H
#define FEC_PACKETS_H

/* Read one subframe's features (num_features floats per the file header)
 * into `features`; returns 0 on success, nonzero on failure. */
int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index);

/* Return the rate stored for one packet, or -1 on failure. */
int get_fec_rate(const char * const filename, int packet_index);

#endif
diff --git a/dnn/training_tf2/fec_packets.py b/dnn/training_tf2/fec_packets.py
new file mode 100644
index 00000000..5cd9201a
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.py
@@ -0,0 +1,108 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import numpy as np
+
+
+
def write_fec_packets(filename, packets, rates=None):
    """ writes packets in binary format

    filename -- output path
    packets  -- list of arrays of shape (..., subframes_per_packet, num_features);
                subframes are written in reversed time order
    rates    -- optional per-packet rate values (int16); 0 is written when omitted
    """
    assert np.dtype(np.float32).itemsize == 4
    assert np.dtype(np.int16).itemsize == 2

    # derive some sizes
    num_packets = len(packets)
    subframes_per_packet = packets[0].shape[-2]
    num_features = packets[0].shape[-1]

    # size of float is 4
    subframe_size = num_features * 4
    packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate

    version = 1
    # header size (version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features)
    header_size = 14

    with open(filename, 'wb') as f:

        # header
        f.write(np.int16(version).tobytes())
        f.write(np.int16(header_size).tobytes())
        f.write(np.int16(num_packets).tobytes())
        f.write(np.int16(packet_size).tobytes())
        f.write(np.int16(subframe_size).tobytes())
        f.write(np.int16(subframes_per_packet).tobytes())
        f.write(np.int16(num_features).tobytes())

        # packets
        for i, packet in enumerate(packets):
            # fix: identity comparison instead of type(rates) == type(None)
            rate = 0 if rates is None else rates[i]
            f.write(np.int16(rate).tobytes())

            # store subframes in reverse time order
            features = np.flip(packet, axis=-2)
            f.write(features.astype(np.float32).tobytes())
+
+
def read_fec_packets(filename):
    """ reads packets from binary format

    Returns the list of packets with subframes restored to forward time
    order (the writer stores them reversed).
    """
    assert np.dtype(np.float32).itemsize == 4
    assert np.dtype(np.int16).itemsize == 2

    with open(filename, 'rb') as f:

        # header (version and packet_size are read to advance the stream but
        # are not otherwise used here)
        version = np.frombuffer(f.read(2), dtype=np.int16).item()
        header_size = np.frombuffer(f.read(2), dtype=np.int16).item()
        num_packets = np.frombuffer(f.read(2), dtype=np.int16).item()
        packet_size = np.frombuffer(f.read(2), dtype=np.int16).item()
        subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
        subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
        num_features = np.frombuffer(f.read(2), dtype=np.int16).item()

        dummy_features = np.zeros((1, subframes_per_packet, num_features), dtype=np.float32)

        # packets
        rates = []
        packets = []
        for i in range(num_packets):

            # fix: the original stored the bound method (.item) instead of
            # calling it (.item()), filling `rates` with method objects
            rate = np.frombuffer(f.read(2), dtype=np.int16).item()
            rates.append(rate)

            features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
            packet = np.flip(features, axis=-2)
            packets.append(packet)

    return packets
diff --git a/dnn/training_tf2/keraslayerdump.py b/dnn/training_tf2/keraslayerdump.py
new file mode 100644
index 00000000..5abc1488
--- /dev/null
+++ b/dnn/training_tf2/keraslayerdump.py
@@ -0,0 +1,189 @@
+'''Copyright (c) 2017-2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+""" helper functions for dumping some Keras layers to C files """
+
+import numpy as np
+
+
def printVector(f, vector, name, dtype='float', dotp=False, static=True):
    """ prints vector as one-dimensional C array

    f      -- open text file receiving the generated C source
    dtype  -- C element type used in the declaration
    dotp   -- reorder weights into the 4x8 block layout used by DOT_PROD
              kernels before flattening
    static -- emit a 'static' storage qualifier

    Returns the (possibly reordered) vector.
    NOTE: output formatting must stay byte-stable; generated C files diff
    against it.
    """
    if dotp:
        # pack into (rows/4, 4, cols/8, 8) blocks and interleave for DOT_PROD
        vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
        vector = vector.transpose((2, 0, 3, 1))
    v = np.reshape(vector, (-1))
    if static:
        f.write('static const {} {}[{}] = {{\n   '.format(dtype, name, len(v)))
    else:
        f.write('const {} {}[{}] = {{\n   '.format(dtype, name, len(v)))
    # eight values per line; no separator after the last element
    for i in range(0, len(v)):
        f.write('{}'.format(v[i]))
        if (i!=len(v)-1):
            f.write(',')
        else:
            break;
        if (i%8==7):
            f.write("\n   ")
        else:
            f.write(" ")
    f.write('\n};\n\n')
    return vector
+
def printSparseVector(f, A, name, have_diag=True):
    """ prints a weight matrix in block-sparse form

    For each 8-wide column band, only 4x8 blocks with nonzero content are
    emitted; <name>_idx holds, per band, the block count followed by the
    starting row of each kept block.  Quantized int8 weights go under
    #ifdef DOT_PROD, float weights otherwise.
    NOTE: when have_diag is True, A is modified in place (diagonals removed).
    Returns the quantized full matrix AQ.
    """
    N = A.shape[0]
    M = A.shape[1]
    W = np.zeros((0,), dtype='int')   # quantized nonzero blocks (DOT_PROD)
    W0 = np.zeros((0,))               # float nonzero blocks (non-DOT_PROD)
    if have_diag:
        # the three gate diagonals of a GRU recurrent matrix are stored
        # separately and zeroed out of the sparse body
        diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
        A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
        A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
        A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
        printVector(f, diag, name + '_diag')
    # quantize to int8 range with scale 128
    AQ = np.minimum(127, np.maximum(-128, np.round(A*128))).astype('int')
    idx = np.zeros((0,), dtype='int')
    for i in range(M//8):
        pos = idx.shape[0]
        idx = np.append(idx, -1)   # placeholder, patched to the block count below
        nb_nonzero = 0
        for j in range(N//4):
            block = A[j*4:(j+1)*4, i*8:(i+1)*8]
            qblock = AQ[j*4:(j+1)*4, i*8:(i+1)*8]
            if np.sum(np.abs(block)) > 1e-10:
                nb_nonzero = nb_nonzero + 1
                idx = np.append(idx, j*4)
                vblock = qblock.transpose((1,0)).reshape((-1,))
                W0 = np.concatenate([W0, block.reshape((-1,))])
                W = np.concatenate([W, vblock])
        idx[pos] = nb_nonzero
    f.write('#ifdef DOT_PROD\n')
    printVector(f, W, name, dtype='qweight')
    f.write('#else /*DOT_PROD*/\n')
    printVector(f, W0, name, dtype='qweight')
    f.write('#endif /*DOT_PROD*/\n')
    printVector(f, idx, name + '_idx', dtype='int')
    return AQ
+
def dump_sparse_gru(self, f, hf):
    """ dumps a sparse GRU layer (separate diagonals + block-sparse recurrent
    weights) as a C SparseGRULayer; returns the layer's neuron count """
    name = 'sparse_' + self.name
    print("printing layer " + name + " of type sparse " + self.__class__.__name__)
    weights = self.get_weights()
    qweights = printSparseVector(f, weights[1], name + '_recurrent_weights')
    printVector(f, weights[-1], name + '_bias')
    subias = weights[-1].copy()
    # compensate the recurrent bias (row 1 in reset_after GRUs) for the 1/128
    # quantization scale of the recurrent weights
    subias[1,:] = subias[1,:] - np.sum(qweights*(1./128),axis=0)
    printVector(f, subias, name + '_subias')
    if hasattr(self, 'activation'):
        activation = self.activation.__name__.upper()
    else:
        # CuDNNGRU has no .activation attribute; it is always tanh
        activation = 'TANH'
    if hasattr(self, 'reset_after') and not self.reset_after:
        reset_after = 0
    else:
        reset_after = 1
    neurons = weights[0].shape[1]//3
    max_rnn_neurons = neurons
    f.write('const SparseGRULayer {} = {{\n   {}_bias,\n   {}_subias,\n   {}_recurrent_weights_diag,\n   {}_recurrent_weights,\n   {}_recurrent_weights_idx,\n   {}, ACTIVATION_{}, {}\n}};\n\n'
        .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
    hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
    hf.write('extern const SparseGRULayer {};\n\n'.format(name));
    return max_rnn_neurons
+
def dump_gru_layer(self, f, hf, dotp=False, sparse=False):
    """ dumps a GRU layer as a C GRULayer plus size #defines

    dotp   -- additionally emit int8-quantized recurrent weights for DOT_PROD
    sparse -- emit the input weights in block-sparse form
    Returns the layer's neuron count (used to size C scratch buffers).
    """
    name = self.name
    print("printing layer " + name + " of type " + self.__class__.__name__)
    weights = self.get_weights()
    if sparse:
        qweight = printSparseVector(f, weights[0], name + '_weights', have_diag=False)
    else:
        qweight = printVector(f, weights[0], name + '_weights')

    if dotp:
        f.write('#ifdef DOT_PROD\n')
        qweight2 = np.clip(np.round(128.*weights[1]).astype('int'), -128, 127)
        printVector(f, qweight2, name + '_recurrent_weights', dotp=True, dtype='qweight')
        f.write('#else /*DOT_PROD*/\n')
    else:
        qweight2 = weights[1]

    printVector(f, weights[1], name + '_recurrent_weights')
    if dotp:
        f.write('#endif /*DOT_PROD*/\n')

    printVector(f, weights[-1], name + '_bias')
    subias = weights[-1].copy()
    # compensate biases for the 1/128 quantization scale of the weights
    # NOTE(review): in the non-sparse/non-dotp paths qweight/qweight2 are the
    # *float* matrices, so this correction divides unquantized weights by 128
    # -- confirm this matches what the C inference code expects.
    subias[0,:] = subias[0,:] - np.sum(qweight*(1./128.),axis=0)
    subias[1,:] = subias[1,:] - np.sum(qweight2*(1./128.),axis=0)
    printVector(f, subias, name + '_subias')
    if hasattr(self, 'activation'):
        activation = self.activation.__name__.upper()
    else:
        # CuDNNGRU has no .activation attribute; it is always tanh
        activation = 'TANH'
    if hasattr(self, 'reset_after') and not self.reset_after:
        reset_after = 0
    else:
        reset_after = 1
    neurons = weights[0].shape[1]//3
    max_rnn_neurons = neurons
    f.write('const GRULayer {} = {{\n   {}_bias,\n   {}_subias,\n   {}_weights,\n   {},\n   {}_recurrent_weights,\n   {}, {}, ACTIVATION_{}, {}\n}};\n\n'
        .format(name, name, name, name, name + "_weights_idx" if sparse else "NULL", name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
    hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
    hf.write('extern const GRULayer {};\n\n'.format(name));
    return max_rnn_neurons
+
def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
    """ dumps a dense layer from raw weights/bias (shared by the Dense and
    MDense dump paths) """
    printVector(f, weights, name + '_weights')
    printVector(f, bias, name + '_bias')
    f.write('const DenseLayer {} = {{\n   {}_bias,\n   {}_weights,\n   {}, {}, ACTIVATION_{}\n}};\n\n'
        .format(name, name, name, weights.shape[0], weights.shape[1], activation))
    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
    hf.write('extern const DenseLayer {};\n\n'.format(name));
+
def dump_dense_layer(self, f, hf):
    """ dumps a Keras Dense layer; returns False because a dense layer
    contributes no RNN neurons to the max_rnn_neurons tally """
    name = self.name
    print("printing layer " + name + " of type " + self.__class__.__name__)
    weights = self.get_weights()
    activation = self.activation.__name__.upper()
    dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
    return False
+
def dump_conv1d_layer(self, f, hf):
    """ dumps a Keras Conv1D layer as a C Conv1DLayer; returns
    max_conv_inputs (input channels * kernel width) for buffer sizing """
    name = self.name
    print("printing layer " + name + " of type " + self.__class__.__name__)
    weights = self.get_weights()
    printVector(f, weights[0], name + '_weights')
    printVector(f, weights[-1], name + '_bias')
    activation = self.activation.__name__.upper()
    # weights[0] is (kernel_size, in_channels, out_channels)
    max_conv_inputs = weights[0].shape[1]*weights[0].shape[0]
    f.write('const Conv1DLayer {} = {{\n   {}_bias,\n   {}_weights,\n   {}, {}, {}, ACTIVATION_{}\n}};\n\n'
        .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
    hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
    hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
    hf.write('extern const Conv1DLayer {};\n\n'.format(name));
    return max_conv_inputs
diff --git a/dnn/training_tf2/lossfuncs.py b/dnn/training_tf2/lossfuncs.py
new file mode 100644
index 00000000..eb831764
--- /dev/null
+++ b/dnn/training_tf2/lossfuncs.py
@@ -0,0 +1,100 @@
+"""
+Custom Loss functions and metrics for training/analysis
+"""
+
+from tf_funcs import *
+import tensorflow as tf
+
+# The following loss functions all expect the lpcnet model to output the lpc prediction
+
+# Computing the excitation by subtracting the lpc prediction from the target, followed by minimizing the cross entropy
def res_from_sigloss():
    """ Returns a loss: sparse cross entropy on the mu-law index of the
    excitation (target minus LPC prediction). """
    def loss(y_true,y_pred):
        # y_pred packs [lpc prediction | direct prediction | 256-way probs]
        p = y_pred[:,:,0:1]
        model_out = y_pred[:,:,2:]
        # tf_l2u (from tf_funcs) maps the linear excitation to mu-law [0,255]
        e_gt = tf_l2u(y_true - p)
        e_gt = tf.round(e_gt)
        e_gt = tf.cast(e_gt,'int32')
        sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,model_out)
        return sparse_cel
    return loss
+
+# Interpolated and Compensated Loss (In case of end to end lpcnet)
+# Interpolates between adjacent embeddings based on the fractional value of the excitation computed (similar to the embedding interpolation)
+# Also adds a probability compensation (to account for matching cross entropy in the linear domain), weighted by gamma
def interp_mulaw(gamma = 1):
    """ Interpolated + compensated mu-law cross-entropy loss for end-to-end
    LPCNet; gamma weights the regularization term on the true excitation. """
    def loss(y_true,y_pred):
        y_true = tf.cast(y_true, 'float32')
        # y_pred packs [lpc prediction | direct prediction | 256-way probs]
        p = y_pred[:,:,0:1]
        real_p = y_pred[:,:,1:2]
        model_out = y_pred[:,:,2:]
        e_gt = tf_l2u(y_true - p)
        exc_gt = tf_l2u(y_true - real_p)
        # penalize excitation mass far from the mu-law midpoint (128) to
        # approximate matching the cross entropy in the linear domain
        prob_compensation = tf.squeeze((K.abs(e_gt - 128)/128.0)*K.log(256.0))
        regularization = tf.squeeze((K.abs(exc_gt - 128)/128.0)*K.log(256.0))
        # interpolate between adjacent bins by the fractional mu-law value
        alpha = e_gt - tf.math.floor(e_gt)
        alpha = tf.tile(alpha,[1,1,256])
        e_gt = tf.cast(e_gt,'int32')
        e_gt = tf.clip_by_value(e_gt,0,254)
        interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1)
        sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab)
        loss_mod = sparse_cel + prob_compensation + gamma*regularization
        return loss_mod
    return loss
+
+# Same as above, except a metric
def metric_oginterploss(y_true,y_pred):
    """ Interpolated cross entropy plus probability compensation, exposed as
    a metric (same as interp_mulaw without the gamma regularization). """
    p = y_pred[:,:,0:1]
    model_out = y_pred[:,:,2:]
    e_gt = tf_l2u(y_true - p)
    prob_compensation = tf.squeeze((K.abs(e_gt - 128)/128.0)*K.log(256.0))
    # interpolate between adjacent bins by the fractional mu-law value
    alpha = e_gt - tf.math.floor(e_gt)
    alpha = tf.tile(alpha,[1,1,256])
    e_gt = tf.cast(e_gt,'int32')
    e_gt = tf.clip_by_value(e_gt,0,254)
    interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1)
    sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab)
    loss_mod = sparse_cel + prob_compensation
    return loss_mod
+
+# Interpolated cross entropy loss metric
def metric_icel(y_true, y_pred):
    """ Interpolated cross entropy loss metric (no compensation terms). """
    p = y_pred[:,:,0:1]
    model_out = y_pred[:,:,2:]
    e_gt = tf_l2u(y_true - p)
    alpha = e_gt - tf.math.floor(e_gt)
    alpha = tf.tile(alpha,[1,1,256])
    e_gt = tf.cast(e_gt,'int32')
    e_gt = tf.clip_by_value(e_gt,0,254) #Check direction
    interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1)
    sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab)
    return sparse_cel
+
+# Non-interpolated (rounded) cross entropy loss metric
def metric_cel(y_true, y_pred):
    """ Non-interpolated (rounded) cross entropy metric: sparse cross entropy
    between the rounded mu-law excitation index and the predicted 256-way
    distribution. """
    target = tf.cast(y_true, 'float32')
    prediction = y_pred[:, :, 0:1]
    probabilities = y_pred[:, :, 2:]
    excitation = tf.clip_by_value(
        tf.cast(tf.round(tf_l2u(target - prediction)), 'int32'), 0, 255)
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE)
    return cross_entropy(excitation, probabilities)
+
+# Variance metric of the output excitation
def metric_exc_sd(y_true,y_pred):
    """ Spread of the mu-law excitation: per-sample squared deviation of the
    excitation index from the mu-law midpoint (128). """
    prediction = y_pred[:, :, 0:1]
    mulaw_excitation = tf_l2u(y_true - prediction)
    mse = tf.keras.losses.MeanSquaredError(
        reduction=tf.keras.losses.Reduction.NONE)
    return mse(mulaw_excitation, 128)
+
def loss_matchlar():
    """ Returns a loss matching predicted reflection coefficients against the
    targets in the log-area-ratio (LAR) domain. """
    def loss(y_true,y_pred):
        model_rc = y_pred[:,:,:16]
        #y_true = lpc2rc(y_true)
        # LAR difference; the 1.01 margin keeps the log arguments positive
        # for coefficients at the +/-1 boundary
        loss_lar_diff = K.log((1.01 + model_rc)/(1.01 - model_rc)) - K.log((1.01 + y_true)/(1.01 - y_true))
        loss_lar_diff = tf.square(loss_lar_diff)
        return tf.reduce_mean(loss_lar_diff, axis=-1)
    return loss
+
diff --git a/dnn/training_tf2/lpcnet.py b/dnn/training_tf2/lpcnet.py
new file mode 100644
index 00000000..b7cee77f
--- /dev/null
+++ b/dnn/training_tf2/lpcnet.py
@@ -0,0 +1,339 @@
+#!/usr/bin/python3
+'''Copyright (c) 2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import math
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation, GaussianNoise
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
+from tensorflow.keras import backend as K
+from tensorflow.keras.constraints import Constraint
+from tensorflow.keras.initializers import Initializer
+from tensorflow.keras.callbacks import Callback
+from mdense import MDense
+import numpy as np
+import h5py
+import sys
+from tf_funcs import *
+from diffembed import diff_Embed
+from parameters import set_parameter
+
frame_size = 160           # samples per feature frame (see tree_to_pdf_train: 15 frames * 160 samples)
pcm_bits = 8               # mu-law resolution of each PCM sample
embed_size = 128           # width of the sample embedding
pcm_levels = 2**pcm_bits   # 256 discrete mu-law levels
+
def interleave(p, samples):
    # Expands one tree level of binary branch probabilities into a
    # pcm_levels-wide tensor: each probability p becomes the pair (1-p, p),
    # repeated so every leaf under that branch carries its branch probability.
    p2=tf.expand_dims(p, 3)
    nb_repeats = pcm_levels//(2*p.shape[2])
    p3 = tf.reshape(tf.repeat(tf.concat([1-p2, p2], 3), nb_repeats), (-1, samples, pcm_levels))
    return p3
+
def tree_to_pdf(p, samples):
    # Converts the hierarchical (binary-tree) probabilities into a flat pdf
    # over the 256 mu-law levels: a leaf's probability is the product of the
    # branch probabilities along its 8-level path (one level per PCM bit).
    return interleave(p[:,:,1:2], samples) * interleave(p[:,:,2:4], samples) * interleave(p[:,:,4:8], samples) * interleave(p[:,:,8:16], samples) \
        * interleave(p[:,:,16:32], samples) * interleave(p[:,:,32:64], samples) * interleave(p[:,:,64:128], samples) * interleave(p[:,:,128:256], samples)
+
def tree_to_pdf_train(p):
    # Training-time variant: the sequence length is fixed to 2400 samples.
    #FIXME: try not to hardcode the 2400 samples (15 frames * 160 samples/frame)
    return tree_to_pdf(p, 2400)
+
def tree_to_pdf_infer(p):
    # Inference-time variant: one sample at a time (stateful decoding).
    return tree_to_pdf(p, 1)
+
def quant_regularizer(x):
    """Regularizer that pulls weights toward multiples of 1/128 so that
    post-training 8-bit quantization of the GRU weights loses little accuracy.
    The double sqrt sharpens the penalty near the quantization grid points."""
    Q = 128
    # (removed unused local Q_1 = 1./Q)
    #return .01 * tf.reduce_mean(1 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))
    return .01 * tf.reduce_mean(K.sqrt(K.sqrt(1.0001 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))))
+
class Sparsify(Callback):
    """Callback that progressively prunes (and optionally quantizes) the
    recurrent weights of the 'gru_a' layer.

    From batch t_start to t_end, every `interval` batches the recurrent matrix
    is pruned in 4x8 blocks, interpolating from full density down to the
    per-gate target `density`.  With quantize=True the surviving weights are
    additionally snapped, with a growing threshold, onto a 1/128 grid.
    """
    def __init__(self, t_start, t_end, interval, density, quantize=False):
        super(Sparsify, self).__init__()
        self.batch = 0                # batches seen so far
        self.t_start = t_start        # batch index where pruning begins
        self.t_end = t_end            # batch index where final density is reached
        self.interval = interval      # prune every `interval` batches
        self.final_density = density  # per-gate target densities
        self.quantize = quantize      # also quantize weights to 1/128 steps

    def on_batch_end(self, batch, logs=None):
        #print("batch number", self.batch)
        self.batch += 1
        if self.quantize or (self.batch > self.t_start and (self.batch-self.t_start) % self.interval == 0) or self.batch >= self.t_end:
            #print("constrain");
            layer = self.model.get_layer('gru_a')
            w = layer.get_weights()
            p = w[1]
            nb = p.shape[1]//p.shape[0]   # number of gates stacked in the recurrent matrix
            N = p.shape[0]
            #print("nb = ", nb, ", N = ", N);
            #print(p.shape)
            #print ("density = ", density)
            for k in range(nb):
                density = self.final_density[k]
                if self.batch < self.t_end and not self.quantize:
                    # Cubic ramp of the density between t_start and t_end.
                    r = 1 - (self.batch-self.t_start)/(self.t_end - self.t_start)
                    density = 1 - (1-self.final_density[k])*(1 - r*r*r)
                A = p[:, k*N:(k+1)*N]
                A = A - np.diag(np.diag(A))   # diagonal handled separately below
                #This is needed because of the CuDNNGRU strange weight ordering
                A = np.transpose(A, (1, 0))
                # Energy of each 4x8 block; keep only blocks above the
                # density-derived threshold.
                L=np.reshape(A, (N//4, 4, N//8, 8))
                S=np.sum(L*L, axis=-1)
                S=np.sum(S, axis=1)
                SS=np.sort(np.reshape(S, (-1,)))
                thresh = SS[round(N*N//32*(1-density))]
                mask = (S>=thresh).astype('float32')
                mask = np.repeat(mask, 4, axis=0)
                mask = np.repeat(mask, 8, axis=1)
                # Always keep the diagonal.
                mask = np.minimum(1, mask + np.diag(np.ones((N,))))
                #This is needed because of the CuDNNGRU strange weight ordering
                mask = np.transpose(mask, (1, 0))
                p[:, k*N:(k+1)*N] = p[:, k*N:(k+1)*N]*mask
                #print(thresh, np.mean(mask))
            if self.quantize and ((self.batch > self.t_start and (self.batch-self.t_start) % self.interval == 0) or self.batch >= self.t_end):
                if self.batch < self.t_end:
                    threshold = .5*(self.batch - self.t_start)/(self.t_end - self.t_start)
                else:
                    threshold = .5
                # Snap weights whose quantization residual is within the
                # (growing) threshold onto the 1/128 grid.
                quant = np.round(p*128.)
                res = p*128.-quant
                mask = (np.abs(res) <= threshold).astype('float32')
                p = mask/128.*quant + (1-mask)*p

            w[1] = p
            layer.set_weights(w)
+
class SparsifyGRUB(Callback):
    """Callback that progressively prunes (and optionally quantizes) the
    *input* (kernel) weights of the 'gru_b' layer.

    Same 4x8 block-pruning schedule as Sparsify, but only the first
    `grua_units` input rows (the part fed from gru_a's output) are pruned;
    the remaining rows are left untouched.
    """
    def __init__(self, t_start, t_end, interval, grua_units, density, quantize=False):
        super(SparsifyGRUB, self).__init__()
        self.batch = 0                # batches seen so far
        self.t_start = t_start        # batch index where pruning begins
        self.t_end = t_end            # batch index where final density is reached
        self.interval = interval      # prune every `interval` batches
        self.final_density = density  # per-gate target densities
        self.grua_units = grua_units  # number of input rows (from gru_a) to prune
        self.quantize = quantize      # also quantize weights to 1/128 steps

    def on_batch_end(self, batch, logs=None):
        #print("batch number", self.batch)
        self.batch += 1
        if self.quantize or (self.batch > self.t_start and (self.batch-self.t_start) % self.interval == 0) or self.batch >= self.t_end:
            #print("constrain");
            layer = self.model.get_layer('gru_b')
            w = layer.get_weights()
            p = w[0]
            N = p.shape[0]
            M = p.shape[1]//3   # per-gate width (GRU stacks 3 gates)
            for k in range(3):
                density = self.final_density[k]
                if self.batch < self.t_end and not self.quantize:
                    # Cubic ramp of the density between t_start and t_end.
                    r = 1 - (self.batch-self.t_start)/(self.t_end - self.t_start)
                    density = 1 - (1-self.final_density[k])*(1 - r*r*r)
                A = p[:, k*M:(k+1)*M]
                #This is needed because of the CuDNNGRU strange weight ordering
                A = np.reshape(A, (M, N))
                A = np.transpose(A, (1, 0))
                N2 = self.grua_units
                A2 = A[:N2, :]   # only prune the rows fed from gru_a
                L=np.reshape(A2, (N2//4, 4, M//8, 8))
                S=np.sum(L*L, axis=-1)
                S=np.sum(S, axis=1)
                SS=np.sort(np.reshape(S, (-1,)))
                thresh = SS[round(M*N2//32*(1-density))]
                mask = (S>=thresh).astype('float32')
                mask = np.repeat(mask, 4, axis=0)
                mask = np.repeat(mask, 8, axis=1)
                A = np.concatenate([A2*mask, A[N2:,:]], axis=0)
                #This is needed because of the CuDNNGRU strange weight ordering
                A = np.transpose(A, (1, 0))
                A = np.reshape(A, (N, M))
                p[:, k*M:(k+1)*M] = A
                #print(thresh, np.mean(mask))
            if self.quantize and ((self.batch > self.t_start and (self.batch-self.t_start) % self.interval == 0) or self.batch >= self.t_end):
                if self.batch < self.t_end:
                    threshold = .5*(self.batch - self.t_start)/(self.t_end - self.t_start)
                else:
                    threshold = .5
                # Snap weights whose quantization residual is within the
                # (growing) threshold onto the 1/128 grid.
                quant = np.round(p*128.)
                res = p*128.-quant
                mask = (np.abs(res) <= threshold).astype('float32')
                p = mask/128.*quant + (1-mask)*p

            w[0] = p
            layer.set_weights(w)
+
+
class PCMInit(Initializer):
    """Initializer for the sample-embedding matrix: uniform noise plus a
    per-row linear ramp, so embeddings start roughly ordered by sample value.
    """
    def __init__(self, gain=.1, seed=None):
        self.gain = gain
        self.seed = seed

    def __call__(self, shape, dtype=None):
        # Collapse all leading dimensions into rows; the last axis is columns.
        num_cols = shape[-1]
        num_rows = 1
        for dim in shape[:-1]:
            num_rows *= dim
        if self.seed is not None:
            np.random.seed(self.seed)
        weights = np.random.uniform(-1.7321, 1.7321, (num_rows, num_cols))
        #weights[:,0] = math.sqrt(12)*np.arange(-.5*num_rows+.5,.5*num_rows-.4)/num_rows
        #weights[:,1] = .5*weights[:,0]*weights[:,0]*weights[:,0]
        # Add an identical offset to every column of a row, increasing
        # linearly with the row index (variance-matched via sqrt(12)).
        ramp = math.sqrt(12)*np.arange(-.5*num_rows+.5, .5*num_rows-.4)/num_rows
        weights = weights + np.reshape(ramp, (num_rows, 1))
        return self.gain * weights.astype("float32")

    def get_config(self):
        return {'gain': self.gain, 'seed': self.seed}
+
class WeightClip(Constraint):
    '''Constrains recurrent weights so pairs of adjacent weights stay small.

    Ensures that the sum of absolute values of adjacent weights never exceeds
    c; otherwise there's a risk of saturation when implementing dot products
    with SSSE3 or AVX2 fixed-point code.
    '''
    def __init__(self, c=2):
        self.c = c

    def __call__(self, p):
        # Rescale every even/odd weight pair whose combined magnitude
        # exceeds c; pairs already within the limit are left unchanged.
        pair_mag = tf.abs(p[:, 1::2]) + tf.abs(p[:, 0::2])
        limit = tf.maximum(self.c, tf.repeat(pair_mag, 2, axis=1))
        return self.c * p / limit
        #return K.clip(p, -self.c, self.c)

    def get_config(self):
        return {'name': self.__class__.__name__,
                'c': self.c}
+
+constraint = WeightClip(0.992)
+
def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, flag_e2e = False, cond_size=128, lpc_order=16, lpc_gamma=1., lookahead=2):
    """Builds the LPCNet vocoder and returns (model, encoder, decoder).

    - model:   training graph mapping (pcm, features, pitch[, lpcoeffs]) to the
               per-sample pdf over mu-law excitation levels
    - encoder: frame-rate network mapping (features, pitch) to the conditioning
               vector (plus LPCs in end-to-end mode)
    - decoder: stateful sample-rate network for one-sample-at-a-time inference

    flag_e2e selects the end-to-end variant where LPCs are derived from the
    features instead of being supplied as an input.
    Fixes vs. previous version: `embed.Trainable` -> `embed.trainable` (the
    capitalized attribute was a dead assignment, so the embedding stayed
    trainable during adaptation), and the bandwidth-expansion weighting now
    follows `lpc_order` instead of a hard-coded 16.
    """
    pcm = Input(shape=(None, 1), batch_size=batch_size)
    dpcm = Input(shape=(None, 3), batch_size=batch_size)
    feat = Input(shape=(None, nb_used_features), batch_size=batch_size)
    pitch = Input(shape=(None, 1), batch_size=batch_size)
    dec_feat = Input(shape=(None, cond_size))
    dec_state1 = Input(shape=(rnn_units1,))
    dec_state2 = Input(shape=(rnn_units2,))

    padding = 'valid' if training else 'same'
    fconv1 = Conv1D(cond_size, 3, padding=padding, activation='tanh', name='feature_conv1')
    fconv2 = Conv1D(cond_size, 3, padding=padding, activation='tanh', name='feature_conv2')
    pembed = Embedding(256, 64, name='embed_pitch')
    cat_feat = Concatenate()([feat, Reshape((-1, 64))(pembed(pitch))])

    cfeat = fconv2(fconv1(cat_feat))

    fdense1 = Dense(cond_size, activation='tanh', name='feature_dense1')
    fdense2 = Dense(cond_size, activation='tanh', name='feature_dense2')

    if flag_e2e and quantize:
        # When quantizing the e2e model, freeze the frame-rate network.
        fconv1.trainable = False
        fconv2.trainable = False
        fdense1.trainable = False
        fdense2.trainable = False

    cfeat = fdense2(fdense1(cfeat))

    error_calc = Lambda(lambda x: tf_l2u(x[0] - tf.roll(x[1],1,axis = 1)))
    if flag_e2e:
        lpcoeffs = diff_rc2lpc(name = "rc2lpc")(cfeat)
    else:
        lpcoeffs = Input(shape=(None, lpc_order), batch_size=batch_size)

    real_preds = diff_pred(name = "real_lpc2preds")([pcm,lpcoeffs])
    # Bandwidth-expansion weighting gamma^k; use lpc_order rather than the
    # previously hard-coded 16 so non-default orders stay consistent.
    weighting = lpc_gamma ** np.arange(1, lpc_order+1).astype('float32')
    weighted_lpcoeffs = Lambda(lambda x: x[0]*x[1])([lpcoeffs, weighting])
    tensor_preds = diff_pred(name = "lpc2preds")([pcm,weighted_lpcoeffs])
    past_errors = error_calc([pcm,tensor_preds])

    embed = diff_Embed(name='embed_sig',initializer = PCMInit())
    cpcm = Concatenate()([tf_l2u(pcm),tf_l2u(tensor_preds),past_errors])
    cpcm = GaussianNoise(.3)(cpcm)
    cpcm = Reshape((-1, embed_size*3))(embed(cpcm))
    cpcm_decoder = Reshape((-1, embed_size*3))(embed(dpcm))


    rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))

    quant = quant_regularizer if quantize else None

    # CuDNNGRU for GPU training; equivalent plain GRU for inference.
    if training:
        rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True, name='gru_a', stateful=True,
              recurrent_constraint = constraint, recurrent_regularizer=quant)
        rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True, name='gru_b', stateful=True,
               kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
    else:
        rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a', stateful=True,
              recurrent_constraint = constraint, recurrent_regularizer=quant)
        rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b', stateful=True,
               kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)

    rnn_in = Concatenate()([cpcm, rep(cfeat)])
    md = MDense(pcm_levels, activation='sigmoid', name='dual_fc')
    gru_out1, _ = rnn(rnn_in)
    gru_out1 = GaussianNoise(.005)(gru_out1)
    gru_out2, _ = rnn2(Concatenate()([gru_out1, rep(cfeat)]))
    ulaw_prob = Lambda(tree_to_pdf_train)(md(gru_out2))

    if adaptation:
        # Freeze the sample-rate network when adapting to a new speaker.
        rnn.trainable=False
        rnn2.trainable=False
        md.trainable=False
        embed.trainable=False   # was 'Trainable' (dead attribute); embedding now actually frozen

    m_out = Concatenate(name='pdf')([tensor_preds,real_preds,ulaw_prob])
    if not flag_e2e:
        model = Model([pcm, feat, pitch, lpcoeffs], m_out)
    else:
        model = Model([pcm, feat, pitch], [m_out, cfeat])
    model.rnn_units1 = rnn_units1
    model.rnn_units2 = rnn_units2
    model.nb_used_features = nb_used_features
    model.frame_size = frame_size

    if not flag_e2e:
        encoder = Model([feat, pitch], cfeat)
    else:
        encoder = Model([feat, pitch], [cfeat,lpcoeffs])
    dec_rnn_in = Concatenate()([cpcm_decoder, dec_feat])
    dec_gru_out1, state1 = rnn(dec_rnn_in, initial_state=dec_state1)
    dec_gru_out2, state2 = rnn2(Concatenate()([dec_gru_out1, dec_feat]), initial_state=dec_state2)
    dec_ulaw_prob = Lambda(tree_to_pdf_infer)(md(dec_gru_out2))

    # The previous if/else built identical decoders in both branches; collapsed.
    decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])

    # add parameters to model
    set_parameter(model, 'lpc_gamma', lpc_gamma, dtype='float64')
    set_parameter(model, 'flag_e2e', flag_e2e, dtype='bool')
    set_parameter(model, 'lookahead', lookahead, dtype='int32')

    return model, encoder, decoder
diff --git a/dnn/training_tf2/lpcnet_plc.py b/dnn/training_tf2/lpcnet_plc.py
new file mode 100644
index 00000000..9acea419
--- /dev/null
+++ b/dnn/training_tf2/lpcnet_plc.py
@@ -0,0 +1,101 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import math
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation, GaussianNoise
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
+from tensorflow.keras import backend as K
+from tensorflow.keras.constraints import Constraint
+from tensorflow.keras.initializers import Initializer
+from tensorflow.keras.callbacks import Callback
+import numpy as np
+
def quant_regularizer(x):
    """Regularizer that pulls weights toward multiples of 1/128 so that
    post-training 8-bit quantization of the GRU weights loses little accuracy.
    The double sqrt sharpens the penalty near the quantization grid points."""
    Q = 128
    # (removed unused local Q_1 = 1./Q)
    #return .01 * tf.reduce_mean(1 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))
    return .01 * tf.reduce_mean(K.sqrt(K.sqrt(1.0001 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))))
+
+
class WeightClip(Constraint):
    '''Clips the weights incident to each hidden unit to be inside a range
    '''
    # NOTE(review): duplicate of the WeightClip in lpcnet.py — consider
    # importing from a shared module instead of copying.
    def __init__(self, c=2):
        self.c = c

    def __call__(self, p):
        # Ensure that abs of adjacent weights don't sum to more than 127. Otherwise there's a risk of
        # saturation when implementing dot products with SSSE3 or AVX2.
        return self.c*p/tf.maximum(self.c, tf.repeat(tf.abs(p[:, 1::2])+tf.abs(p[:, 0::2]), 2, axis=1))
        #return K.clip(p, -self.c, self.c)

    def get_config(self):
        return {'name': self.__class__.__name__,
                'c': self.c}
+
+constraint = WeightClip(0.992)
+
def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, nb_burg_features=36, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128):
    """Builds the LPCNet packet-loss-concealment model.

    Inputs: concatenated (acoustic + Burg) features and a per-frame loss flag.
    Output: predicted feature vector for the next frame.
    NOTE(review): `adaptation` is accepted but never used in this function.
    """
    feat = Input(shape=(None, nb_used_features+nb_burg_features), batch_size=batch_size)
    lost = Input(shape=(None, 1), batch_size=batch_size)

    fdense1 = Dense(cond_size, activation='tanh', name='plc_dense1')

    cfeat = Concatenate()([feat, lost])
    cfeat = fdense1(cfeat)
    #cfeat = Conv1D(cond_size, 3, padding='causal', activation='tanh', name='plc_conv1')(cfeat)

    quant = quant_regularizer if quantize else None

    # CuDNNGRU for GPU training; equivalent plain GRU for inference.
    if training:
        rnn = CuDNNGRU(rnn_units, return_sequences=True, return_state=True, name='plc_gru1', stateful=True,
              kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
        rnn2 = CuDNNGRU(rnn_units, return_sequences=True, return_state=True, name='plc_gru2', stateful=True,
              kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
    else:
        rnn = GRU(rnn_units, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='plc_gru1', stateful=True,
              kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
        rnn2 = GRU(rnn_units, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='plc_gru2', stateful=True,
              kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)

    gru_out1, _ = rnn(cfeat)
    gru_out1 = GaussianNoise(.005)(gru_out1)   # small noise for robustness during training
    gru_out2, _ = rnn2(gru_out1)

    out_dense = Dense(nb_used_features, activation='linear', name='plc_out')
    plc_out = out_dense(gru_out2)

    model = Model([feat, lost], plc_out)
    model.rnn_units = rnn_units
    model.cond_size = cond_size
    model.nb_used_features = nb_used_features
    model.nb_burg_features = nb_burg_features

    return model
diff --git a/dnn/training_tf2/mdense.py b/dnn/training_tf2/mdense.py
new file mode 100644
index 00000000..5679dd29
--- /dev/null
+++ b/dnn/training_tf2/mdense.py
@@ -0,0 +1,95 @@
+from tensorflow.keras import backend as K
+from tensorflow.keras.layers import Layer, InputSpec
+from tensorflow.keras import activations
+from tensorflow.keras import initializers, regularizers, constraints
+import numpy as np
+import math
+
class MDense(Layer):
    """Dual fully-connected ("multi") layer: `channels` parallel dense
    projections whose tanh outputs are scaled by learned per-channel factors
    and summed, followed by the optional activation.

    Fix: get_config previously emitted the key 'units' (not accepted by
    __init__, which takes `outputs`) and omitted 'channels', so
    MDense.from_config(layer.get_config()) raised / lost information.
    """
    def __init__(self, outputs,
                 channels=2,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(MDense, self).__init__(**kwargs)
        self.units = outputs
        self.channels = channels
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]

        self.kernel = self.add_weight(shape=(self.units, input_dim, self.channels),
                                      initializer=self.kernel_initializer,
                                      name='kernel',
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units, self.channels),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        # NOTE(review): factor reuses the bias regularizer/constraint —
        # presumably intentional; confirm.
        self.factor = self.add_weight(shape=(self.units, self.channels),
                                      initializer='ones',
                                      name='factor',
                                      regularizer=self.bias_regularizer,
                                      constraint=self.bias_constraint)
        self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def call(self, inputs):
        # Per-channel dense projection, tanh, learned scaling, then sum over channels.
        output = K.dot(inputs, self.kernel)
        if self.use_bias:
            output = output + self.bias
        output = K.tanh(output) * self.factor
        output = K.sum(output, axis=-1)
        if self.activation is not None:
            output = self.activation(output)
        return output

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def get_config(self):
        config = {
            # key must match the __init__ parameter name for from_config()
            'outputs': self.units,
            'channels': self.channels,
            'activation': activations.serialize(self.activation),
            'use_bias': self.use_bias,
            'kernel_initializer': initializers.serialize(self.kernel_initializer),
            'bias_initializer': initializers.serialize(self.bias_initializer),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
            'activity_regularizer': regularizers.serialize(self.activity_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint),
            'bias_constraint': constraints.serialize(self.bias_constraint)
        }
        base_config = super(MDense, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
diff --git a/dnn/training_tf2/pade.py b/dnn/training_tf2/pade.py
new file mode 100644
index 00000000..f88f425c
--- /dev/null
+++ b/dnn/training_tf2/pade.py
@@ -0,0 +1,70 @@
+# Optimizing a rational function to optimize a tanh() approximation
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
+import tensorflow.keras.backend as K
+from tensorflow.keras.optimizers import Adam, SGD
+
def my_loss1(y_true, y_pred):
    # MSE plus per-sequence max squared error (equal weight on worst case).
    return 1*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)

def my_loss2(y_true, y_pred):
    # Same as my_loss1 but with the mean term down-weighted 10x.
    return .1*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)

def my_loss3(y_true, y_pred):
    # Same as my_loss1 but with the mean term down-weighted 100x.
    return .01*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)
+
# Using these initializers to seed the approximation
# with a reasonable starting point
def num_init(shape, dtype=None):
    # Numerator coefficients [945, 105, 1] (constant, x^2, x^4 terms) of the
    # rational tanh() approximation being optimized.
    rr = tf.constant([[945], [105], [1]], dtype=dtype)
    #rr = tf.constant([[946.56757], [98.01368], [0.66841]], dtype=dtype)
    print(rr)
    return rr
+
def den_init(shape, dtype=None):
    # Denominator coefficients [945, 420, 15] (constant, x^2, x^4 terms) of
    # the rational tanh() approximation being optimized.
    rr = tf.constant([[945], [420], [15]], dtype=dtype)
    #rr = tf.constant([[946.604], [413.342], [12.465]], dtype=dtype)
    print(rr)
    return rr
+
+
# Training data: x in [-10, 10), its even powers (1, x^2, x^4), and tanh(x).
x = np.arange(-10, 10, .01)
N = len(x)
x = np.reshape(x, (1, -1, 1))
x2 = x*x

x2in = np.concatenate([x2*0 + 1, x2, x2*x2], axis=2)
yout = np.tanh(x)


model_x = Input(shape=(None, 1,))
model_x2 = Input(shape=(None, 3,))

# Numerator and denominator of the rational approximation, seeded with the
# Pade-style coefficients from num_init/den_init.
num = Dense(1, name='num', use_bias=False, kernel_initializer=num_init)
den = Dense(1, name='den', use_bias=False, kernel_initializer=den_init)

def ratio(x):
    # x * num(x^2)/den(x^2), clamped to [-1, 1] like tanh.
    return tf.minimum(1., tf.maximum(-1., x[0]*x[1]/x[2]))

out_layer = Lambda(ratio)
output = out_layer([model_x, num(model_x2), den(model_x2)])

model = Model([model_x, model_x2], output)
model.summary()

# Coarse fit on plain MSE, then progressively emphasize the max error
# (my_loss1..3) with decreasing learning rates.  Deliberately very long runs.
model.compile(Adam(0.05, beta_1=0.9, beta_2=0.9, decay=2e-5), loss='mean_squared_error')
model.fit([x, x2in], yout, batch_size=1, epochs=500000, validation_split=0.0)

model.compile(Adam(0.001, beta_2=0.9, decay=1e-4), loss=my_loss1)
model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)

model.compile(Adam(0.0001, beta_2=0.9, decay=1e-4), loss=my_loss2)
model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)

model.compile(Adam(0.00001, beta_2=0.9, decay=1e-4), loss=my_loss3)
model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)

model.save_weights('tanh.h5')
diff --git a/dnn/training_tf2/parameters.py b/dnn/training_tf2/parameters.py
new file mode 100644
index 00000000..34b65480
--- /dev/null
+++ b/dnn/training_tf2/parameters.py
@@ -0,0 +1,29 @@
+""" module for handling extra model parameters for tf.keras models """
+
+import tensorflow as tf
+
+
def set_parameter(model, parameter_name, parameter_value, dtype='float32'):
    """ stores parameter_value as non-trainable weight with name parameter_name:0 """
    target_name = parameter_name + ":0"
    matching = [w for w in model.weights if w.name == target_name]

    if not matching:
        # Parameter not present yet: create it as a constant, non-trainable weight.
        model.add_weight(parameter_name, trainable=False,
                         initializer=tf.keras.initializers.Constant(parameter_value), dtype=dtype)
    elif len(matching) == 1:
        matching[0].assign(parameter_value)
    else:
        raise ValueError(f"more than one weight starting with {parameter_name}:0 in model")
+
+
def get_parameter(model, parameter_name, default=None):
    """ returns parameter value if parameter is present in model and otherwise default """
    target_name = parameter_name + ":0"
    matching = [w for w in model.weights if w.name == target_name]

    if not matching:
        return default
    if len(matching) > 1:
        raise ValueError(f"more than one weight starting with {parameter_name}:0 in model")
    return matching[0].numpy().item()
diff --git a/dnn/training_tf2/plc_loader.py b/dnn/training_tf2/plc_loader.py
new file mode 100644
index 00000000..f9430d10
--- /dev/null
+++ b/dnn/training_tf2/plc_loader.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import numpy as np
+from tensorflow.keras.utils import Sequence
+
class PLCLoader(Sequence):
    """Batch generator for PLC training: pairs feature sequences with
    simulated packet-loss masks.

    Each batch masks features with a loss pattern drawn (at a random offset)
    from the recorded `lost` trace, and independently drops the Burg features
    with 10% probability.  Targets are the unmasked acoustic features plus an
    inverted loss flag.
    """
    def __init__(self, features, lost, nb_burg_features, batch_size):
        self.batch_size = batch_size
        self.nb_batches = features.shape[0]//self.batch_size
        self.features = features[:self.nb_batches*self.batch_size, :, :]
        self.lost = lost.astype('float')
        # Trim the loss trace so it divides evenly into sequence-length rows.
        self.lost = self.lost[:(len(self.lost)//features.shape[1]-1)*features.shape[1]]
        self.nb_burg_features = nb_burg_features
        self.on_epoch_end()

    def on_epoch_end(self):
        # Reshuffle feature order and re-draw loss patterns at a new offset.
        self.indices = np.arange(self.nb_batches*self.batch_size)
        np.random.shuffle(self.indices)
        offset = np.random.randint(0, high=self.features.shape[1])
        self.lost_offset = np.reshape(self.lost[offset:-self.features.shape[1]+offset], (-1, self.features.shape[1]))
        self.lost_indices = np.random.randint(0, high=self.lost_offset.shape[0], size=self.nb_batches*self.batch_size)

    def __getitem__(self, index):
        features = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :]
        # Independently drop Burg features 10% of the time.
        burg_lost = (np.random.rand(features.shape[0], features.shape[1]) > .1).astype('float')
        burg_lost = np.reshape(burg_lost, (features.shape[0], features.shape[1], 1))
        burg_mask = np.tile(burg_lost, (1,1,self.nb_burg_features))

        lost = self.lost_offset[self.lost_indices[index*self.batch_size:(index+1)*self.batch_size], :]
        lost = np.reshape(lost, (features.shape[0], features.shape[1], 1))
        lost_mask = np.tile(lost, (1,1,features.shape[2]))
        in_features = features*lost_mask
        in_features[:,:,:self.nb_burg_features] = in_features[:,:,:self.nb_burg_features]*burg_mask

        #For the first frame after a loss, we don't have valid features, but the Burg estimate is valid.
        #in_features[:,1:,self.nb_burg_features:] = in_features[:,1:,self.nb_burg_features:]*lost_mask[:,:-1,self.nb_burg_features:]
        out_lost = np.copy(lost)
        #out_lost[:,1:,:] = out_lost[:,1:,:]*out_lost[:,:-1,:]

        out_features = np.concatenate([features[:,:,self.nb_burg_features:], 1.-out_lost], axis=-1)
        burg_sign = 2*burg_lost - 1
        # last dim is 1 for received packet, 0 for lost packet, and -1 when just the Burg info is missing
        inputs = [in_features*lost_mask, lost*burg_sign]
        outputs = [out_features]
        return (inputs, outputs)

    def __len__(self):
        return self.nb_batches
diff --git a/dnn/training_tf2/rdovae.py b/dnn/training_tf2/rdovae.py
new file mode 100644
index 00000000..45b3efb0
--- /dev/null
+++ b/dnn/training_tf2/rdovae.py
@@ -0,0 +1,373 @@
+#!/usr/bin/python3
+'''Copyright (c) 2022 Amazon
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import math
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation, GaussianNoise, AveragePooling1D, RepeatVector
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
+from tensorflow.keras import backend as K
+from tensorflow.keras.constraints import Constraint
+from tensorflow.keras.initializers import Initializer
+from tensorflow.keras.callbacks import Callback
+from tensorflow.keras.regularizers import l1
+import numpy as np
+import h5py
+from uniform_noise import UniformNoise
+
class WeightClip(Constraint):
    '''Clips the weights incident to each hidden unit to be inside a range

    Keras weight constraint used so that the trained weights stay compatible
    with the fixed-point (int8) dot-product kernels used at inference time.
    '''
    def __init__(self, c=2):
        # c: maximum allowed sum of |w| over each adjacent weight pair
        # (scaled units -- see __call__).
        self.c = c

    def __call__(self, p):
        # Ensure that abs of adjacent weights don't sum to more than 127. Otherwise there's a risk of
        # saturation when implementing dot products with SSSE3 or AVX2.
        # Pairs (p[:,0::2], p[:,1::2]) are rescaled together when their
        # combined magnitude exceeds c; otherwise left unchanged.
        return self.c*p/tf.maximum(self.c, tf.repeat(tf.abs(p[:, 1::2])+tf.abs(p[:, 0::2]), 2, axis=1))
        #return K.clip(p, -self.c, self.c)

    def get_config(self):
        # Required for model (de)serialization of the constraint.
        return {'name': self.__class__.__name__,
                'c': self.c}

# Shared constraint instance applied to all dense/GRU kernels below.
# 0.496*2 < 1.0, keeping paired weights safely inside the int8 range.
constraint = WeightClip(0.496)
+
def soft_quantize(x):
    # Differentiable "soft" quantizer. Currently the identity: the original
    # sine-based soft rounding below is disabled (kept for reference).
    #x = 4*x
    #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
    #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
    #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
    return x
+
def noise_quantize(x):
    # Simulate quantization during training by adding uniform noise in
    # [-0.5, 0.5) before the (currently identity) soft quantizer.
    # NOTE(review): the noise shape (128, 16, 80) is hard-coded -- presumably
    # (batch, frames, latent dim); confirm it matches the training setup.
    noise = K.random_uniform((128, 16, 80)) - .5
    return soft_quantize(x + noise)
+
def hard_quantize(x):
    # Hard rounding with a straight-through gradient estimator: the forward
    # pass returns round(x), the backward pass behaves as identity
    # (the rounding residual is wrapped in stop_gradient).
    x = soft_quantize(x)
    quantized = tf.round(x)
    return x + tf.stop_gradient(quantized - x)
+
def apply_dead_zone(x):
    # Apply a learned dead zone around zero before quantization.
    # x[0]: values to quantize, x[1]: per-dimension dead-zone width control.
    d = x[1]*.05
    x = x[0]
    # tanh shrinks small values toward zero (width ~d) while leaving large
    # values essentially unchanged; differentiable everywhere.
    y = x - d*tf.math.tanh(x/(.1+d))
    return y
+
def rate_loss(y_true,y_pred):
    # Differentiable estimate (in bits) of the rate needed to entropy-code
    # y_pred under a Laplace-like model; used as a training loss.
    log2_e = 1.4427
    n = y_pred.shape[-1]
    # log(gamma(n)) via math.lgamma(): numerically stable for large n (gamma
    # itself overflows past n~170) and avoids np.math, which was an accidental
    # alias of the math module removed in NumPy 1.25+.
    C = n - log2_e*math.lgamma(n)
    k = K.sum(K.abs(y_pred), axis=-1)
    p = 1.5
    #rate = C + (n-1)*log2_e*tf.math.log((k**p + (n/5)**p)**(1/p))
    rate = C + (n-1)*log2_e*tf.math.log(k + .112*n**2/(n/1.8+k) )
    return K.mean(rate)
+
# Small constant to keep log arguments strictly positive.
eps=1e-6
def safelog2(x):
    # Base-2 logarithm with an epsilon guard against log(0).
    log2_e = 1.4427
    return log2_e*tf.math.log(eps+x)
+
def feat_dist_loss(y_true,y_pred):
    # Feature-domain distortion loss, weighted by the per-sequence
    # rate-distortion trade-off lambda carried in the last channel of y_pred.
    lambda_1 = 1./K.sqrt(y_pred[:,:,:,-1])
    y_pred = y_pred[:,:,:,:-1]
    # Channel layout (assumed from the slicing): 0-17 cepstrum, 18 pitch
    # period, 19+ pitch correlation -- TODO confirm against the feature dump.
    ceps = y_pred[:,:,:,:18] - y_true[:,:,:18]
    # Relative pitch error; +2 in the denominator avoids division blow-up.
    pitch = 2*(y_pred[:,:,:,18:19] - y_true[:,:,18:19])/(y_true[:,:,18:19] + 2)
    corr = y_pred[:,:,:,19:] - y_true[:,:,19:]
    # Pitch errors matter more when the frame is strongly voiced.
    pitch_weight = K.square(K.maximum(0., y_true[:,:,19:]+.5))
    return K.mean(lambda_1*K.mean(K.square(ceps) + 10*(1/18.)*K.abs(pitch)*pitch_weight + (1/18.)*K.square(corr), axis=-1))
+
def sq1_rate_loss(y_true,y_pred):
    # "Soft" rate loss on the unquantized (noise-quantized) latents.
    # y_pred packs [latents(n) | p0 params(n) | r params(n) | lambda(1)].
    lambda_val = K.sqrt(y_pred[:,:,-1])
    y_pred = y_pred[:,:,:-1]
    log2_e = 1.4427
    n = y_pred.shape[-1]//3
    r = (y_pred[:,:,2*n:])
    p0 = (y_pred[:,:,n:2*n])
    p0 = 1-r**(.5+.5*p0)
    y_pred = y_pred[:,:,:n]
    y_pred = soft_quantize(y_pred)

    # Earlier rate models, kept for reference. They used to be computed and
    # then immediately overwritten, wasting graph ops every step, so they are
    # disabled here; the active model is the two-sided geometric below.
    #y0 = K.maximum(0., 1. - K.abs(y_pred))**2
    #rate = -y0*safelog2(p0*r**K.abs(y_pred)) - (1-y0)*safelog2(.5*(1-p0)*(1-r)*r**(K.abs(y_pred)-1))
    #rate = -safelog2(-.5*tf.math.log(r)*r**K.abs(y_pred))
    rate = -safelog2((1-r)/(1+r)*r**K.abs(y_pred))
    #rate = -safelog2(- tf.math.sinh(.5*tf.math.log(r))* r**K.abs(y_pred) - tf.math.cosh(K.maximum(0., .5 - K.abs(y_pred))*tf.math.log(r)) + 1)
    rate = lambda_val*K.sum(rate, axis=-1)
    return K.mean(rate)
+
def sq2_rate_loss(y_true,y_pred):
    # "Hard" rate loss on actually-rounded latents.
    # y_pred packs [latents(n) | p0 params(n) | r params(n) | lambda(1)].
    lambda_val = K.sqrt(y_pred[:,:,-1])
    y_pred = y_pred[:,:,:-1]
    log2_e = 1.4427
    n = y_pred.shape[-1]//3
    r = y_pred[:,:,2*n:]
    p0 = y_pred[:,:,n:2*n]
    # Probability of the zero symbol under the learned two-sided model.
    p0 = 1-r**(.5+.5*p0)
    #theta = K.minimum(1., .5 + 0*p0 - 0.04*tf.math.log(r))
    #p0 = 1-r**theta
    y_pred = tf.round(y_pred[:,:,:n])
    # y0 is an indicator-like weight: 1 at the rounded value 0, 0 elsewhere.
    y0 = K.maximum(0., 1. - K.abs(y_pred))**2
    rate = -y0*safelog2(p0*r**K.abs(y_pred)) - (1-y0)*safelog2(.5*(1-p0)*(1-r)*r**(K.abs(y_pred)-1))
    rate = lambda_val*K.sum(rate, axis=-1)
    return K.mean(rate)
+
def sq_rate_metric(y_true,y_pred, reduce=True):
    # Rate metric (bits) on rounded latents; same model as sq2_rate_loss but
    # without the lambda weighting. With reduce=False, returns per-sequence
    # rates instead of the mean.
    y_pred = y_pred[:,:,:-1]
    log2_e = 1.4427
    n = y_pred.shape[-1]//3
    r = y_pred[:,:,2*n:]
    p0 = y_pred[:,:,n:2*n]
    p0 = 1-r**(.5+.5*p0)
    #theta = K.minimum(1., .5 + 0*p0 - 0.04*tf.math.log(r))
    #p0 = 1-r**theta
    y_pred = tf.round(y_pred[:,:,:n])
    y0 = K.maximum(0., 1. - K.abs(y_pred))**2
    rate = -y0*safelog2(p0*r**K.abs(y_pred)) - (1-y0)*safelog2(.5*(1-p0)*(1-r)*r**(K.abs(y_pred)-1))
    rate = K.sum(rate, axis=-1)
    if reduce:
        rate = K.mean(rate)
    return rate
+
def pvq_quant_search(x, k):
    # Search for a PVQ codevector: an integer vector y with sum(|y|) == k
    # that approximates the direction of x. Works by iteratively rescaling
    # x until rounding it yields exactly k unit pulses.
    x = x/tf.reduce_sum(tf.abs(x), axis=-1, keepdims=True)
    kx = k*x
    y = tf.round(kx)
    newk = k

    # Fixed iteration count; empirically enough for convergence.
    for j in range(10):
        #print("y = ", y)
        #print("iteration ", j)
        abs_y = tf.abs(y)
        abs_kx = tf.abs(kx)
        kk=tf.reduce_sum(abs_y, axis=-1)
        #print("sums = ", kk)
        # Smallest upscale that adds at least one pulse / largest downscale
        # that removes at least one pulse (with tiny margins to avoid ties).
        plus = 1.000001*tf.reduce_min((abs_y+.5)/(abs_kx+1e-15), axis=-1)
        minus = .999999*tf.reduce_max((abs_y-.5)/(abs_kx+1e-15), axis=-1)
        #print("plus = ", plus)
        #print("minus = ", minus)
        # Too many pulses -> shrink; too few -> grow; exactly k -> keep.
        factor = tf.where(kk>k, minus, plus)
        factor = tf.where(kk==k, tf.ones_like(factor), factor)
        #print("scale = ", factor)
        factor = tf.expand_dims(factor, axis=-1)
        #newk = newk * (k/kk)**.2
        newk = newk*factor
        kx = newk*x
        #print("newk = ", newk)
        #print("unquantized = ", newk*x)
        y = tf.round(kx)

    #print(y)
    #print(K.mean(K.sum(K.abs(y), axis=-1)))
    return y
+
def pvq_quantize(x, k):
    # PVQ-quantize x onto the unit sphere with k pulses, using a
    # straight-through gradient (forward: quantized, backward: identity).
    x = x/(1e-15+tf.norm(x, axis=-1,keepdims=True))
    quantized = pvq_quant_search(x, k)
    quantized = quantized/(1e-15+tf.norm(quantized, axis=-1,keepdims=True))
    return x + tf.stop_gradient(quantized - x)
+
+
def var_repeat(x):
    # Repeat x[0] along a new time axis to match the (dynamic) time length
    # of x[1]; a variable-length alternative to RepeatVector.
    return tf.repeat(tf.expand_dims(x[0], 1), K.shape(x[1])[1], axis=1)

# Dimension of the decoder's initial-state vector produced by the encoder.
nb_state_dim = 24
+
def new_rdovae_encoder(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batch_size=128, cond_size=128, cond_size2=256, training=False):
    # Build the RDO-VAE encoder: alternating Dense/GRU stack with dense
    # skip connections, producing per-frame latents (enc_out) and a global
    # decoder-initialization state (global_bits).
    # Layer names are load-bearing: rdovae_exchange/import look them up.
    feat = Input(shape=(None, nb_used_features), batch_size=batch_size)

    # CuDNNGRU trains faster on GPU; plain GRU for CPU/inference graphs.
    gru = CuDNNGRU if training else GRU
    enc_dense1 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='enc_dense1')
    enc_dense2 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='enc_dense2')
    enc_dense3 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='enc_dense3')
    enc_dense4 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='enc_dense4')
    enc_dense5 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='enc_dense5')
    enc_dense6 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='enc_dense6')
    enc_dense7 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='enc_dense7')
    enc_dense8 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='enc_dense8')

    #bits_dense = Dense(nb_bits, activation='linear', name='bits_dense')
    # Causal conv over 4 steps: latents may depend on past frames only.
    bits_dense = Conv1D(nb_bits, 4, padding='causal', activation='linear', name='bits_dense')

    zero_out = Lambda(lambda x: 0*x)
    # Pair consecutive frames: one latent vector covers two feature frames.
    inputs = Reshape((-1, 2*nb_used_features))(feat)
    d1 = enc_dense1(inputs)
    d2 = enc_dense2(d1)
    d3 = enc_dense3(d2)
    d4 = enc_dense4(d3)
    d5 = enc_dense5(d4)
    d6 = enc_dense6(d5)
    d7 = enc_dense7(d6)
    d8 = enc_dense8(d7)
    # Concatenate all intermediate activations (dense skip connections).
    pre_out = Concatenate()([d1, d2, d3, d4, d5, d6, d7, d8])
    enc_out = bits_dense(pre_out)
    # Small head producing the decoder's initial GRU state per frame.
    global_dense1 = Dense(128, activation='tanh', name='gdense1')
    global_dense2 = Dense(nb_state_dim, activation='tanh', name='gdense2')
    global_bits = global_dense2(global_dense1(pre_out))

    encoder = Model([feat], [enc_out, global_bits], name='encoder')
    return encoder
+
def new_rdovae_decoder(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batch_size=128, cond_size=128, cond_size2=256, training=False):
    # Build the RDO-VAE decoder: mirrors the encoder stack, runs over the
    # TIME-REVERSED latent sequence, and is seeded with GRU initial states
    # derived from the encoder's global state vector.
    # Layer names are load-bearing: rdovae_exchange/import look them up.
    bits_input = Input(shape=(None, nb_bits), batch_size=batch_size, name="dec_bits")
    gru_state_input = Input(shape=(nb_state_dim,), batch_size=batch_size, name="dec_state")


    gru = CuDNNGRU if training else GRU
    dec_dense1 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense1')
    dec_dense2 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense2')
    dec_dense3 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense3')
    dec_dense4 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense4')
    dec_dense5 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense5')
    dec_dense6 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense6')
    dec_dense7 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='dec_dense7')
    dec_dense8 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='dec_dense8')

    # Each decoder step reconstructs `bunch` feature frames at once.
    dec_final = Dense(bunch*nb_used_features, activation='linear', name='dec_final')

    # Decoding runs backward in time (and the output is reversed back).
    time_reverse = Lambda(lambda x: K.reverse(x, 1))
    #time_reverse = Lambda(lambda x: x)
    #gru_state_rep = RepeatVector(64//bunch)(gru_state_input)

    #gru_state_rep = Lambda(var_repeat, output_shape=(None, nb_state_dim)) ([gru_state_input, bits_input])
    # Expand the compact state vector into one initial state per GRU layer.
    gru_state1 = Dense(cond_size, name="state1", activation='tanh')(gru_state_input)
    gru_state2 = Dense(cond_size, name="state2", activation='tanh')(gru_state_input)
    gru_state3 = Dense(cond_size, name="state3", activation='tanh')(gru_state_input)

    dec1 = dec_dense1(time_reverse(bits_input))
    dec2 = dec_dense2(dec1, initial_state=gru_state1)
    dec3 = dec_dense3(dec2)
    dec4 = dec_dense4(dec3, initial_state=gru_state2)
    dec5 = dec_dense5(dec4)
    dec6 = dec_dense6(dec5, initial_state=gru_state3)
    dec7 = dec_dense7(dec6)
    dec8 = dec_dense8(dec7)
    # Dense skip connections, then un-bunch back to per-frame features.
    output = Reshape((-1, nb_used_features))(dec_final(Concatenate()([dec1, dec2, dec3, dec4, dec5, dec6, dec7, dec8])))
    decoder = Model([bits_input, gru_state_input], time_reverse(output), name='decoder')
    # Stash config on the model for new_split_decoder().
    decoder.nb_bits = nb_bits
    decoder.bunch = bunch
    return decoder
+
def new_split_decoder(decoder):
    # Wrap the decoder so it decodes a long sequence in independent
    # 100-frame chunks, each seeded from the encoder state at that chunk's
    # last frame -- emulating decode-from-anywhere at inference time.
    nb_bits = decoder.nb_bits
    bunch = decoder.bunch
    bits_input = Input(shape=(None, nb_bits), name="split_bits")
    gru_state_input = Input(shape=(None,nb_state_dim), name="split_state")

    range_select = Lambda(lambda x: x[0][:,x[1]:x[2],:])
    elem_select = Lambda(lambda x: x[0][:,x[1],:])
    # Chunk boundaries in feature frames (fixed 400-frame sequences).
    points = [0, 100, 200, 300, 400]
    outputs = []
    for i in range(len(points)-1):
        begin = points[i]//bunch
        end = points[i+1]//bunch
        # State captured at the end of the chunk (decoding runs backward).
        state = elem_select([gru_state_input, end-1])
        bits = range_select([bits_input, begin, end])
        outputs.append(decoder([bits, state]))
    output = Concatenate(axis=1)(outputs)
    split = Model([bits_input, gru_state_input], output, name="split")
    return split
+
def tensor_concat(x):
    # Stack the bunch-offset decoder outputs into one tensor along a new
    # leading axis, time-shifting the earlier outputs so all entries cover
    # the same frames as the last one.
    # NOTE(review): n is hard-coded to 2, so only x[0] (shifted) and x[-1]
    # are combined -- presumably matching bunch//2 == 2; confirm if bunch
    # ever changes.
    #n = x[1]//2
    #x = x[0]
    n=2
    y = []
    for i in range(n-1):
        offset = 2 * (n-1-i)
        tmp = K.concatenate([x[i][:, offset:, :], x[-1][:, -offset:, :]], axis=-2)
        y.append(tf.expand_dims(tmp, axis=0))
    y.append(tf.expand_dims(x[-1], axis=0))
    return Concatenate(axis=0)(y)
+
+
def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batch_size=128, cond_size=128, cond_size2=256, training=False):
    # Assemble the end-to-end RDO-VAE training model:
    # encoder -> (scale, dead zone, quantize) -> split decoder, with rate
    # estimation heads. Returns (model, encoder, decoder, qembedding).
    #
    # Inputs: features, a quantizer id selecting the operating point, and
    # the per-frame rate-distortion lambda.
    feat = Input(shape=(None, nb_used_features), batch_size=batch_size)
    quant_id = Input(shape=(None,), batch_size=batch_size)
    lambda_val = Input(shape=(None, 1), batch_size=batch_size)
    # NOTE(review): lambda_bunched is computed but never used below.
    lambda_bunched = AveragePooling1D(pool_size=bunch//2, strides=bunch//2, padding="valid")(lambda_val)
    lambda_up = Lambda(lambda x: K.repeat_elements(x, 2, axis=-2))(lambda_val)

    # Per-operating-point parameters (scale, dead zone, entropy-model
    # params), all learned as one embedding table of width 6*nb_bits.
    qembedding = Embedding(nb_quant, 6*nb_bits, name='quant_embed', embeddings_initializer='zeros')
    quant_embed_dec = qembedding(quant_id)
    quant_scale = Activation('softplus')(Lambda(lambda x: x[:,:,:nb_bits], name='quant_scale_embed')(quant_embed_dec))

    encoder = new_rdovae_encoder(nb_used_features, nb_bits, bunch, nb_quant, batch_size, cond_size, cond_size2, training=training)
    ze, gru_state_dec = encoder([feat])
    ze = Multiply()([ze, quant_scale])

    decoder = new_rdovae_decoder(nb_used_features, nb_bits, bunch, nb_quant, batch_size, cond_size, cond_size2, training=training)
    split_decoder = new_split_decoder(decoder)

    dead_zone = Activation('softplus')(Lambda(lambda x: x[:,:,nb_bits:2*nb_bits], name='dead_zone_embed')(quant_embed_dec))
    soft_distr_embed = Activation('sigmoid')(Lambda(lambda x: x[:,:,2*nb_bits:4*nb_bits], name='soft_distr_embed')(quant_embed_dec))
    hard_distr_embed = Activation('sigmoid')(Lambda(lambda x: x[:,:,4*nb_bits:], name='hard_distr_embed')(quant_embed_dec))

    # Two quantization paths: noise-quantized (training proxy) and
    # hard-quantized (straight-through), both after the dead zone.
    noisequant = UniformNoise()
    hardquant = Lambda(hard_quantize)
    dzone = Lambda(apply_dead_zone)
    dze = dzone([ze,dead_zone])
    ndze = noisequant(dze)
    dze_quant = hardquant(dze)

    # Undo the encoder-side scaling before decoding.
    div = Lambda(lambda x: x[0]/x[1])
    dze_quant = div([dze_quant,quant_scale])
    ndze_unquant = div([ndze,quant_scale])

    # Decode at bunch//2 phase offsets; the decoder state is PVQ-quantized
    # with 82 pulses, matching what the bitstream can carry.
    mod_select = Lambda(lambda x: x[0][:,x[1]::bunch//2,:])
    gru_state_dec = Lambda(lambda x: pvq_quantize(x, 82))(gru_state_dec)
    combined_output = []
    unquantized_output = []
    cat = Concatenate(name="out_cat")
    for i in range(bunch//2):
        dze_select = mod_select([dze_quant, i])
        ndze_select = mod_select([ndze_unquant, i])
        state_select = mod_select([gru_state_dec, i])

        tmp = split_decoder([dze_select, state_select])
        tmp = cat([tmp, lambda_up])
        combined_output.append(tmp)

        tmp = split_decoder([ndze_select, state_select])
        tmp = cat([tmp, lambda_up])
        unquantized_output.append(tmp)

    concat = Lambda(tensor_concat, name="output")
    combined_output = concat(combined_output)
    unquantized_output = concat(unquantized_output)

    # Rate heads: latents + entropy-model params + lambda, consumed by
    # sq1_rate_loss (soft) and sq2_rate_loss (hard).
    e2 = Concatenate(name="hard_bits")([dze, hard_distr_embed, lambda_val])
    e = Concatenate(name="soft_bits")([dze, soft_distr_embed, lambda_val])


    model = Model([feat, quant_id, lambda_val], [combined_output, unquantized_output, e, e2], name="end2end")
    model.nb_used_features = nb_used_features

    return model, encoder, decoder, qembedding
diff --git a/dnn/training_tf2/rdovae_exchange.py b/dnn/training_tf2/rdovae_exchange.py
new file mode 100644
index 00000000..ecebf707
--- /dev/null
+++ b/dnn/training_tf2/rdovae_exchange.py
@@ -0,0 +1,138 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
# Export RDO-VAE model weights to the exchange-folder format, one file per
# layer, for conversion to the C inference code.
import argparse
import os
import sys

# Weight export is CPU-only; hide GPUs before TF is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = ""

parser = argparse.ArgumentParser()

parser.add_argument('weights', metavar="<weight file>", type=str, help='model weight file in hdf5 format')
parser.add_argument('output', metavar="<output folder>", type=str, help='output exchange folder')
parser.add_argument('--cond-size', type=int, help="conditioning size (default: 256)", default=256)
parser.add_argument('--latent-dim', type=int, help="dimension of latent space (default: 80)", default=80)
parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 16)", default=16)

args = parser.parse_args()

# now import the heavy stuff
from rdovae import new_rdovae_model
from wexchange.tf import dump_tf_weights, load_tf_weights


# Map Keras layer names (as built in rdovae.py) to the stable exchange
# names used by the C side. Must stay in sync with rdovae_import.py.
exchange_name = {
    'enc_dense1' : 'encoder_stack_layer1_dense',
    'enc_dense3' : 'encoder_stack_layer3_dense',
    'enc_dense5' : 'encoder_stack_layer5_dense',
    'enc_dense7' : 'encoder_stack_layer7_dense',
    'enc_dense8' : 'encoder_stack_layer8_dense',
    'gdense1' : 'encoder_state_layer1_dense',
    'gdense2' : 'encoder_state_layer2_dense',
    'enc_dense2' : 'encoder_stack_layer2_gru',
    'enc_dense4' : 'encoder_stack_layer4_gru',
    'enc_dense6' : 'encoder_stack_layer6_gru',
    'bits_dense' : 'encoder_stack_layer9_conv',
    'qembedding' : 'statistical_model_embedding',
    'state1' : 'decoder_state1_dense',
    'state2' : 'decoder_state2_dense',
    'state3' : 'decoder_state3_dense',
    'dec_dense1' : 'decoder_stack_layer1_dense',
    'dec_dense3' : 'decoder_stack_layer3_dense',
    'dec_dense5' : 'decoder_stack_layer5_dense',
    'dec_dense7' : 'decoder_stack_layer7_dense',
    'dec_dense8' : 'decoder_stack_layer8_dense',
    'dec_final' : 'decoder_stack_layer9_dense',
    'dec_dense2' : 'decoder_stack_layer2_gru',
    'dec_dense4' : 'decoder_stack_layer4_gru',
    'dec_dense6' : 'decoder_stack_layer6_gru'
}
+
+
if __name__ == "__main__":

    # Rebuild the model with the same hyperparameters as training, then load
    # the trained weights before exporting layer by layer.
    model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)
    model.load_weights(args.weights)

    os.makedirs(args.output, exist_ok=True)

    # encoder
    encoder_dense_names = [
        'enc_dense1',
        'enc_dense3',
        'enc_dense5',
        'enc_dense7',
        'enc_dense8',
        'gdense1',
        'gdense2'
    ]

    encoder_gru_names = [
        'enc_dense2',
        'enc_dense4',
        'enc_dense6'
    ]

    encoder_conv1d_names = [
        'bits_dense'
    ]


    for name in encoder_dense_names + encoder_gru_names + encoder_conv1d_names:
        print(f"writing layer {exchange_name[name]}...")
        dump_tf_weights(os.path.join(args.output, exchange_name[name]), encoder.get_layer(name))

    # qembedding
    # The embedding is a standalone layer (not inside encoder/decoder).
    print(f"writing layer {exchange_name['qembedding']}...")
    dump_tf_weights(os.path.join(args.output, exchange_name['qembedding']), qembedding)

    # decoder
    decoder_dense_names = [
        'state1',
        'state2',
        'state3',
        'dec_dense1',
        'dec_dense3',
        'dec_dense5',
        'dec_dense7',
        'dec_dense8',
        'dec_final'
    ]

    decoder_gru_names = [
        'dec_dense2',
        'dec_dense4',
        'dec_dense6'
    ]

    for name in decoder_dense_names + decoder_gru_names:
        print(f"writing layer {exchange_name[name]}...")
        dump_tf_weights(os.path.join(args.output, exchange_name[name]), decoder.get_layer(name))
diff --git a/dnn/training_tf2/rdovae_import.py b/dnn/training_tf2/rdovae_import.py
new file mode 100644
index 00000000..f6aa1759
--- /dev/null
+++ b/dnn/training_tf2/rdovae_import.py
@@ -0,0 +1,123 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
# Import RDO-VAE model weights from an exchange folder (the inverse of
# rdovae_exchange.py) and save them as a Keras model.
import argparse
import os
import sys

# Weight import is CPU-only; hide GPUs before TF is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = ""

parser = argparse.ArgumentParser()

parser.add_argument('input', metavar="<input folder>", type=str, help='input exchange folder')
parser.add_argument('weights', metavar="<weight file>", type=str, help='model weight file in hdf5 format')
parser.add_argument('--cond-size', type=int, help="conditioning size (default: 256)", default=256)
parser.add_argument('--latent-dim', type=int, help="dimension of latent space (default: 80)", default=80)
parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 16)", default=16)

args = parser.parse_args()

# now import the heavy stuff
from rdovae import new_rdovae_model
from wexchange.tf import load_tf_weights


# Map Keras layer names (as built in rdovae.py) to the stable exchange
# names used on disk. Must stay in sync with rdovae_exchange.py.
exchange_name = {
    'enc_dense1' : 'encoder_stack_layer1_dense',
    'enc_dense3' : 'encoder_stack_layer3_dense',
    'enc_dense5' : 'encoder_stack_layer5_dense',
    'enc_dense7' : 'encoder_stack_layer7_dense',
    'enc_dense8' : 'encoder_stack_layer8_dense',
    'gdense1' : 'encoder_state_layer1_dense',
    'gdense2' : 'encoder_state_layer2_dense',
    'enc_dense2' : 'encoder_stack_layer2_gru',
    'enc_dense4' : 'encoder_stack_layer4_gru',
    'enc_dense6' : 'encoder_stack_layer6_gru',
    'bits_dense' : 'encoder_stack_layer9_conv',
    'qembedding' : 'statistical_model_embedding',
    'state1' : 'decoder_state1_dense',
    'state2' : 'decoder_state2_dense',
    'state3' : 'decoder_state3_dense',
    'dec_dense1' : 'decoder_stack_layer1_dense',
    'dec_dense3' : 'decoder_stack_layer3_dense',
    'dec_dense5' : 'decoder_stack_layer5_dense',
    'dec_dense7' : 'decoder_stack_layer7_dense',
    'dec_dense8' : 'decoder_stack_layer8_dense',
    'dec_final' : 'decoder_stack_layer9_dense',
    'dec_dense2' : 'decoder_stack_layer2_gru',
    'dec_dense4' : 'decoder_stack_layer4_gru',
    'dec_dense6' : 'decoder_stack_layer6_gru'
}
+
if __name__ == "__main__":

    # Rebuild the model architecture, then load each layer's weights from
    # the exchange folder and save the assembled model.
    model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)

    encoder_layers = [
        'enc_dense1',
        'enc_dense3',
        'enc_dense5',
        'enc_dense7',
        'enc_dense8',
        'gdense1',
        'gdense2',
        'enc_dense2',
        'enc_dense4',
        'enc_dense6',
        'bits_dense'
    ]

    decoder_layers = [
        'state1',
        'state2',
        'state3',
        'dec_dense1',
        'dec_dense3',
        'dec_dense5',
        'dec_dense7',
        'dec_dense8',
        'dec_final',
        'dec_dense2',
        'dec_dense4',
        'dec_dense6'
    ]

    for name in encoder_layers:
        print(f"loading weight for layer {name}...")
        load_tf_weights(os.path.join(args.input, exchange_name[name]), encoder.get_layer(name))

    # qembedding is a standalone layer (not inside encoder/decoder).
    print(f"loading weight for layer qembedding...")
    load_tf_weights(os.path.join(args.input, exchange_name['qembedding']), qembedding)

    for name in decoder_layers:
        print(f"loading weight for layer {name}...")
        load_tf_weights(os.path.join(args.input, exchange_name[name]), decoder.get_layer(name))

    model.save(args.weights)
diff --git a/dnn/training_tf2/test_lpcnet.py b/dnn/training_tf2/test_lpcnet.py
new file mode 100755
index 00000000..fe09016c
--- /dev/null
+++ b/dnn/training_tf2/test_lpcnet.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python3
+'''Copyright (c) 2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
# Synthesize speech from a feature file with a trained LPCNet model.
# Usage: test_lpcnet.py <model.h5> <features.f32> <output.pcm>
import argparse
import sys

import h5py
import numpy as np

import lpcnet
from ulaw import ulaw2lin, lin2ulaw


parser = argparse.ArgumentParser()
# NOTE: argparse does not convert '-' to '_' for POSITIONAL arguments, so the
# original name 'model-file' made args.model_file raise AttributeError; use an
# underscore name (the CLI is unchanged since positionals are matched by
# position). The feature/output files were read from sys.argv directly, which
# argparse rejected as unrecognized arguments -- declare them properly.
parser.add_argument('model_file', metavar='<model file>', type=str, help='model weight h5 file')
parser.add_argument('feature_file', metavar='<feature file>', type=str, help='binary feature file (float32)')
parser.add_argument('output', metavar='<output file>', type=str, help='16-bit PCM output file')
parser.add_argument('--lpc-gamma', type=float, help='LPC weighting factor. WARNING: giving an inconsistent value here will severely degrade performance', default=1)

args = parser.parse_args()

filename = args.model_file
# Infer the architecture hyperparameters from the weight file itself.
with h5py.File(filename, "r") as f:
    units = min(f['model_weights']['gru_a']['gru_a']['recurrent_kernel:0'].shape)
    units2 = min(f['model_weights']['gru_b']['gru_b']['recurrent_kernel:0'].shape)
    cond_size = min(f['model_weights']['feature_dense1']['feature_dense1']['kernel:0'].shape)
    # End-to-end models carry an rc2lpc layer.
    e2e = 'rc2lpc' in f['model_weights']


model, enc, dec = lpcnet.new_lpcnet_model(training = False, rnn_units1=units, rnn_units2=units2, flag_e2e = e2e, cond_size=cond_size, batch_size=1)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
#model.summary()


feature_file = args.feature_file
out_file = args.output
frame_size = model.frame_size
nb_features = 36
nb_used_features = model.nb_used_features

features = np.fromfile(feature_file, dtype='float32')
# np.reshape supports the -1 wildcard (np.resize does not accept negative
# dimensions) and fails loudly if the file size isn't a multiple of 36.
features = np.reshape(features, (-1, nb_features))
nb_frames = 1
feature_chunk_size = features.shape[0]
pcm_chunk_size = frame_size*feature_chunk_size

features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
# Decode the pitch period from feature 18 (stored normalized).
periods = (.1 + 50*features[:,:,18:19]+100).astype('int16')



model.load_weights(filename)

order = 16

pcm = np.zeros((nb_frames*pcm_chunk_size, ))
# [last sample, prediction, excitation] in mu-law, initialized to mu-law 0.
fexc = np.zeros((1, 1, 3), dtype='int16')+128
state1 = np.zeros((1, model.rnn_units1), dtype='float32')
state2 = np.zeros((1, model.rnn_units2), dtype='float32')

# First-order de-emphasis filter state.
mem = 0
coef = 0.85

# Bandwidth-expansion weights matching the --lpc-gamma used at training time.
lpc_weights = np.array([args.lpc_gamma ** (i + 1) for i in range(16)])

fout = open(out_file, 'wb')

# Skip the first LPC order+1 samples of the first frame (no valid history).
skip = order + 1
for c in range(0, nb_frames):
    if not e2e:
        cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
    else:
        cfeat,lpcs = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
    for fr in range(0, feature_chunk_size):
        f = c*feature_chunk_size + fr
        if not e2e:
            a = features[c, fr, nb_features-order:] * lpc_weights
        else:
            a = lpcs[c,fr]
        for i in range(skip, frame_size):
            # Linear prediction from the last `order` output samples.
            pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
            fexc[0, 0, 1] = lin2ulaw(pred)

            p, state1, state2 = dec.predict([fexc, cfeat[:, fr:fr+1, :], state1, state2])
            #Lower the temperature for voiced frames to reduce noisiness
            p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 19] - .5))
            p = p/(1e-18 + np.sum(p))
            #Cut off the tail of the remaining distribution
            p = np.maximum(p-0.002, 0).astype('float64')
            p = p/(1e-8 + np.sum(p))

            # Sample the excitation and reconstruct the output sample.
            fexc[0, 0, 2] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))
            pcm[f*frame_size + i] = pred + ulaw2lin(fexc[0, 0, 2])
            fexc[0, 0, 0] = lin2ulaw(pcm[f*frame_size + i])
            mem = coef*mem + pcm[f*frame_size + i]
            #print(mem)
            np.array([np.round(mem)], dtype='int16').tofile(fout)
        skip = 0
diff --git a/dnn/training_tf2/test_plc.py b/dnn/training_tf2/test_plc.py
new file mode 100644
index 00000000..0c0ac25f
--- /dev/null
+++ b/dnn/training_tf2/test_plc.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
# Run a trained PLC model over a feature file with loss flags and write the
# reconstructed features (lost frames filled in by the model).

import argparse
from plc_loader import PLCLoader

parser = argparse.ArgumentParser(description='Test a PLC model')

parser.add_argument('weights', metavar='<weights file>', help='weights file (.h5)')
parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
parser.add_argument('output', metavar='<output>', help='reconstructed file (float32)')
parser.add_argument('--model', metavar='<model>', default='lpcnet_plc', help='PLC model python definition (without .py)')
# NOTE(review): group1 is created but no arguments are ever added to it.
group1 = parser.add_mutually_exclusive_group()

parser.add_argument('--gru-size', metavar='<units>', default=256, type=int, help='number of units in GRU (default 256)')
parser.add_argument('--cond-size', metavar='<units>', default=128, type=int, help='number of units in conditioning network (default 128)')


args = parser.parse_args()

# The model definition module is selectable on the command line.
import importlib
lpcnet = importlib.import_module(args.model)

import sys
import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
import tensorflow.keras.backend as K
import h5py

import tensorflow as tf
#gpus = tf.config.experimental.list_physical_devices('GPU')
#if gpus:
#    try:
#        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
#    except RuntimeError as e:
#        print(e)

model = lpcnet.new_lpcnet_plc_model(rnn_units=args.gru_size, batch_size=1, training=False, quantize=False, cond_size=args.cond_size)
model.compile()

lpc_order = 16

feature_file = args.features
nb_features = model.nb_used_features + lpc_order
nb_used_features = model.nb_used_features

# u for unquantised, load 16 bit PCM samples and convert to mu-law

# Features are text here (np.loadtxt), one frame per row; the last column
# is the loss flag (1 = received, 0 = lost).
features = np.loadtxt(feature_file)
print(features.shape)
sequence_size = features.shape[0]
lost = np.reshape(features[:,-1:], (1, sequence_size, 1))
features = features[:,:nb_used_features]
features = np.reshape(features, (1, sequence_size, nb_used_features))


model.load_weights(args.weights)

# Zero features of lost frames, then let the model fill them in; received
# frames keep their original features.
features = features*lost
out = model.predict([features, lost])

out = features + (1-lost)*out

np.savetxt(args.output, out[0,:,:])
diff --git a/dnn/training_tf2/tf_funcs.py b/dnn/training_tf2/tf_funcs.py
new file mode 100644
index 00000000..5e065012
--- /dev/null
+++ b/dnn/training_tf2/tf_funcs.py
@@ -0,0 +1,70 @@
+"""
+Tensorflow/Keras helper functions to do the following:
+ 1. \mu law <-> Linear domain conversion
+ 2. Differentiable prediction from the input signal and LP coefficients
+ 3. Differentiable transformations Reflection Coefficients (RCs) <-> LP Coefficients
+"""
+from tensorflow.keras.layers import Lambda, Multiply, Layer, Concatenate
+from tensorflow.keras import backend as K
+import tensorflow as tf
+
+# \mu law <-> Linear conversion functions
+scale = 255.0/32768.0
+scale_1 = 32768.0/255.0
+def tf_l2u(x):
+    # Linear -> mu-law (mu = 255), differentiable (Keras backend ops only).
+    # Input is in the 16-bit linear range; output is clipped to [0, 255]
+    # with 128 as the zero point. Inverse of tf_u2l().
+    s = K.sign(x)
+    x = K.abs(x)
+    u = (s*(128*K.log(1+scale*x)/K.log(256.0)))
+    u = K.clip(128 + u, 0, 255)
+    return u
+
+def tf_u2l(u):
+    # mu-law -> linear, differentiable. Accepts mu-law values in [0, 255]
+    # (offset-128 encoding, as produced by tf_l2u) and returns floats in the
+    # 16-bit linear range. Inverse of tf_l2u().
+    u = tf.cast(u,"float32")
+    u = u - 128.0
+    s = K.sign(u)
+    u = K.abs(u)
+    return s*scale_1*(K.exp(u/128.*K.log(256.0))-1)
+
+# Differentiable Prediction Layer
+# Computes the LP prediction from the input lag signal and the LP coefficients
+# The inputs xt and lpc conform with the shapes in lpcnet.py (the '2400' is coded keeping this in mind)
+class diff_pred(Layer):
+    # Differentiable LP prediction: given the lagged signal xt and per-frame
+    # LP coefficients, computes -sum_i a_i * x[t-i] for each sample.
+    def call(self, inputs, lpcoeffs_N = 16, frame_size = 160):
+        # inputs[0]: signal (batch, time, 1); inputs[1]: LPCs at frame rate.
+        # The hard-coded 2400 is frame_size * feature_chunk_size
+        # (160 * 15) from lpcnet.py -- assumes that sequence length;
+        # TODO confirm if chunking ever changes.
+        xt = inputs[0]
+        lpc = inputs[1]
+
+        # Upsample LPCs from frame rate to sample rate.
+        rept = Lambda(lambda x: K.repeat_elements(x , frame_size, 1))
+        # Zero-pad lpcoeffs_N samples of history at the start of the signal.
+        zpX = Lambda(lambda x: K.concatenate([0*x[:,0:lpcoeffs_N,:], x],axis = 1))
+        # Stack the lpcoeffs_N lagged copies of the signal along the channel
+        # axis so each channel i holds x[t-(i+1)].
+        cX = Lambda(lambda x: K.concatenate([x[:,(lpcoeffs_N - i):(lpcoeffs_N - i + 2400),:] for i in range(lpcoeffs_N)],axis = 2))
+
+        # Elementwise a_i * x[t-i]; the minus sign gives the LP prediction.
+        pred = -Multiply()([rept(lpc),cX(zpX(xt))])
+
+        # Sum over the lag axis -> (batch, time, 1) prediction.
+        return K.sum(pred,axis = 2,keepdims = True)
+
+# Differentiable Transformations (RC <-> LPC) computed using the Levinson Durbin Recursion
+class diff_rc2lpc(Layer):
+    # Differentiable reflection-coefficients -> LP-coefficients conversion
+    # via the (forward) Levinson-Durbin recursion. Operates on the last axis
+    # of a (batch, time, >=lpcoeffs_N) tensor.
+    def call(self, inputs, lpcoeffs_N = 16):
+        # One Levinson step: a_new = a + k * reverse(a), then append k.
+        # input[0]: current order-i LPC set; input[1]: reflection coeff k_i.
+        def pred_lpc_recursive(input):
+            temp = (input[0] + K.repeat_elements(input[1],input[0].shape[2],2)*K.reverse(input[0],axes = 2))
+            temp = Concatenate(axis = 2)([temp,input[1]])
+            return temp
+        Llpc = Lambda(pred_lpc_recursive)
+        inputs = inputs[:,:,:lpcoeffs_N]
+        lpc_init = inputs
+        # Grow the filter order one coefficient at a time.
+        for i in range(1,lpcoeffs_N):
+            lpc_init = Llpc([lpc_init[:,:,:i],K.expand_dims(inputs[:,:,i],axis = -1)])
+        return lpc_init
+
+class diff_lpc2rc(Layer):
+    # Differentiable LP-coefficients -> reflection-coefficients conversion:
+    # the backward Levinson-Durbin recursion (inverse of diff_rc2lpc).
+    def call(self, inputs, lpcoeffs_N = 16):
+        # One backward step: with k_i the last coefficient,
+        # a_prev = (a - k_i * reverse(a)) / (1 - k_i^2), then keep k_i.
+        # NOTE(review): divides by (1 - k_i^2) -- assumes |k_i| < 1
+        # (a stable filter); confirm upstream guarantees this.
+        def pred_rc_recursive(input):
+            ki = K.repeat_elements(K.expand_dims(input[1][:,:,0],axis = -1),input[0].shape[2],2)
+            temp = (input[0] - ki*K.reverse(input[0],axes = 2))/(1 - ki*ki)
+            temp = Concatenate(axis = 2)([temp,input[1]])
+            return temp
+        Lrc = Lambda(pred_rc_recursive)
+        rc_init = inputs
+        # Reduce the filter order one coefficient at a time, from the top.
+        for i in range(1,lpcoeffs_N):
+            j = (lpcoeffs_N - i + 1)
+            rc_init = Lrc([rc_init[:,:,:(j - 1)],rc_init[:,:,(j - 1):]])
+        return rc_init
diff --git a/dnn/training_tf2/train_lpcnet.py b/dnn/training_tf2/train_lpcnet.py
new file mode 100755
index 00000000..bc3f4373
--- /dev/null
+++ b/dnn/training_tf2/train_lpcnet.py
@@ -0,0 +1,214 @@
+#!/usr/bin/python3
+'''Copyright (c) 2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
+import argparse
+import os
+
+from dataloader import LPCNetLoader
+
+parser = argparse.ArgumentParser(description='Train an LPCNet model')
+
+parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
+parser.add_argument('data', metavar='<audio data file>', help='binary audio data file (uint8)')
+parser.add_argument('output', metavar='<output>', help='trained model file (.h5)')
+parser.add_argument('--model', metavar='<model>', default='lpcnet', help='LPCNet model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--quantize', metavar='<input weights>', help='quantize model')
+group1.add_argument('--retrain', metavar='<input weights>', help='continue training model')
+parser.add_argument('--density', metavar='<global density>', type=float, help='average density of the recurrent weights (default 0.1)')
+parser.add_argument('--density-split', nargs=3, metavar=('<update>', '<reset>', '<state>'), type=float, help='density of each recurrent gate (default 0.05, 0.05, 0.2)')
+parser.add_argument('--grub-density', metavar='<global GRU B density>', type=float, help='average density of the recurrent weights (default 1.0)')
+parser.add_argument('--grub-density-split', nargs=3, metavar=('<update>', '<reset>', '<state>'), type=float, help='density of each GRU B input gate (default 1.0, 1.0, 1.0)')
+parser.add_argument('--grua-size', metavar='<units>', default=384, type=int, help='number of units in GRU A (default 384)')
+parser.add_argument('--grub-size', metavar='<units>', default=16, type=int, help='number of units in GRU B (default 16)')
+parser.add_argument('--cond-size', metavar='<units>', default=128, type=int, help='number of units in conditioning network, aka frame rate network (default 128)')
+parser.add_argument('--epochs', metavar='<epochs>', default=120, type=int, help='number of epochs to train for (default 120)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=128, type=int, help='batch size to use (default 128)')
+parser.add_argument('--end2end', dest='flag_e2e', action='store_true', help='Enable end-to-end training (with differentiable LPC computation')
+parser.add_argument('--lr', metavar='<learning rate>', type=float, help='learning rate')
+parser.add_argument('--decay', metavar='<decay>', type=float, help='learning rate decay')
+parser.add_argument('--gamma', metavar='<gamma>', type=float, help='adjust u-law compensation (default 2.0, should not be less than 1.0)')
+parser.add_argument('--lookahead', metavar='<nb frames>', default=2, type=int, help='Number of look-ahead frames (default 2)')
+parser.add_argument('--logdir', metavar='<log dir>', help='directory for tensorboard log files')
+parser.add_argument('--lpc-gamma', type=float, default=1, help='gamma for LPC weighting')
+parser.add_argument('--cuda-devices', metavar='<cuda devices>', type=str, default=None, help='string with comma separated cuda device ids')
+
+args = parser.parse_args()
+
+# set visible cuda devices
+if args.cuda_devices != None:
+ os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices
+
+density = (0.05, 0.05, 0.2)
+if args.density_split is not None:
+ density = args.density_split
+elif args.density is not None:
+ density = [0.5*args.density, 0.5*args.density, 2.0*args.density];
+
+grub_density = (1., 1., 1.)
+if args.grub_density_split is not None:
+ grub_density = args.grub_density_split
+elif args.grub_density is not None:
+ grub_density = [0.5*args.grub_density, 0.5*args.grub_density, 2.0*args.grub_density];
+
+gamma = 2.0 if args.gamma is None else args.gamma
+
+import importlib
+lpcnet = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+from ulaw import ulaw2lin, lin2ulaw
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+from tf_funcs import *
+from lossfuncs import *
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+# try:
+# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+# except RuntimeError as e:
+# print(e)
+
+nb_epochs = args.epochs
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+quantize = args.quantize is not None
+retrain = args.retrain is not None
+
+lpc_order = 16
+
+if quantize:
+ lr = 0.00003
+ decay = 0
+ input_model = args.quantize
+else:
+ lr = 0.001
+ decay = 5e-5
+
+if args.lr is not None:
+ lr = args.lr
+
+if args.decay is not None:
+ decay = args.decay
+
+if retrain:
+ input_model = args.retrain
+
+flag_e2e = args.flag_e2e
+
+opt = Adam(lr, decay=decay, beta_1=0.5, beta_2=0.8)
+strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+
+with strategy.scope():
+ model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=args.grua_size,
+ rnn_units2=args.grub_size,
+ batch_size=batch_size, training=True,
+ quantize=quantize,
+ flag_e2e=flag_e2e,
+ cond_size=args.cond_size,
+ lpc_gamma=args.lpc_gamma,
+ lookahead=args.lookahead
+ )
+ if not flag_e2e:
+ model.compile(optimizer=opt, loss=metric_cel, metrics=metric_cel)
+ else:
+ model.compile(optimizer=opt, loss = [interp_mulaw(gamma=gamma), loss_matchlar()], loss_weights = [1.0, 2.0], metrics={'pdf':[metric_cel,metric_icel,metric_exc_sd,metric_oginterploss]})
+ model.summary()
+
+feature_file = args.features
+pcm_file = args.data # 16 bit unsigned short PCM samples
+frame_size = model.frame_size
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+feature_chunk_size = 15
+pcm_chunk_size = frame_size*feature_chunk_size
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+# Memory-map the raw training data so arbitrarily large files can be used
+# without loading them into RAM.
+data = np.memmap(pcm_file, dtype='int16', mode='r')
+# Drop one chunk of slack and round down to a whole number of batches.
+# The factor of 2 matches the two int16 values stored per sample position
+# (see the reshape to (..., 2) below) -- presumably produced by dump_data;
+# TODO confirm the channel meaning against dump_data.c.
+nb_frames = (len(data)//(2*pcm_chunk_size)-1)//batch_size*batch_size
+
+features = np.memmap(feature_file, dtype='float32', mode='r')
+
+# limit to discrete number of frames
+# Skip leading samples to align audio with features for the configured
+# look-ahead, then truncate to a whole number of chunks.
+data = data[(4-args.lookahead)*2*frame_size:]
+data = data[:nb_frames*2*pcm_chunk_size]
+
+
+data = np.reshape(data, (nb_frames, pcm_chunk_size, 2))
+
+#print("ulaw std = ", np.std(out_exc))
+
+# Build an overlapping, zero-copy view of the feature stream: consecutive
+# chunks share 4 extra context frames (feature_chunk_size+4) by striding
+# the flat memmap directly.
+sizeof = features.strides[-1]
+features = np.lib.stride_tricks.as_strided(features, shape=(nb_frames, feature_chunk_size+4, nb_features),
+                                           strides=(feature_chunk_size*nb_features*sizeof, nb_features*sizeof, sizeof))
+#features = features[:, :, :nb_used_features]
+
+
+# Integer pitch period derived from the float pitch feature stored at
+# column nb_used_features-2 -- presumably maps the normalized feature to a
+# sample count; TODO confirm scaling against the feature extractor.
+periods = (.1 + 50*features[:,:,nb_used_features-2:nb_used_features-1]+100).astype('int16')
+#periods = np.minimum(periods, 255)
+
+# dump models to disk as we go
+checkpoint = ModelCheckpoint('{}_{}_{}.h5'.format(args.output, args.grua_size, '{epoch:02d}'))
+
+if args.retrain is not None:
+ model.load_weights(args.retrain)
+
+if quantize or retrain:
+ #Adapting from an existing model
+ model.load_weights(input_model)
+ if quantize:
+ sparsify = lpcnet.Sparsify(10000, 30000, 100, density, quantize=True)
+ grub_sparsify = lpcnet.SparsifyGRUB(10000, 30000, 100, args.grua_size, grub_density, quantize=True)
+ else:
+ sparsify = lpcnet.Sparsify(0, 0, 1, density)
+ grub_sparsify = lpcnet.SparsifyGRUB(0, 0, 1, args.grua_size, grub_density)
+else:
+ #Training from scratch
+ sparsify = lpcnet.Sparsify(2000, 20000, 400, density)
+ grub_sparsify = lpcnet.SparsifyGRUB(2000, 40000, 400, args.grua_size, grub_density)
+
+model.save_weights('{}_{}_initial.h5'.format(args.output, args.grua_size))
+
+loader = LPCNetLoader(data, features, periods, batch_size, e2e=flag_e2e, lookahead=args.lookahead)
+
+callbacks = [checkpoint, sparsify, grub_sparsify]
+if args.logdir is not None:
+ logdir = '{}/{}_{}_logs'.format(args.logdir, args.output, args.grua_size)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
+ callbacks.append(tensorboard_callback)
+
+model.fit(loader, epochs=nb_epochs, validation_split=0.0, callbacks=callbacks)
diff --git a/dnn/training_tf2/train_plc.py b/dnn/training_tf2/train_plc.py
new file mode 100644
index 00000000..ca30c457
--- /dev/null
+++ b/dnn/training_tf2/train_plc.py
@@ -0,0 +1,197 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
+import argparse
+from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Train a PLC model')
+
+parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
+parser.add_argument('lost_file', metavar='<packet loss file>', help='packet loss traces (int8)')
+parser.add_argument('output', metavar='<output>', help='trained model file (.h5)')
+parser.add_argument('--model', metavar='<model>', default='lpcnet_plc', help='PLC model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--quantize', metavar='<input weights>', help='quantize model')
+group1.add_argument('--retrain', metavar='<input weights>', help='continue training model')
+parser.add_argument('--gru-size', metavar='<units>', default=256, type=int, help='number of units in GRU (default 256)')
+parser.add_argument('--cond-size', metavar='<units>', default=128, type=int, help='number of units in conditioning network (default 128)')
+parser.add_argument('--epochs', metavar='<epochs>', default=120, type=int, help='number of epochs to train for (default 120)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=128, type=int, help='batch size to use (default 128)')
+parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')
+parser.add_argument('--lr', metavar='<learning rate>', type=float, help='learning rate')
+parser.add_argument('--decay', metavar='<decay>', type=float, help='learning rate decay')
+parser.add_argument('--band-loss', metavar='<weight>', default=1.0, type=float, help='weight of band loss (default 1.0)')
+parser.add_argument('--loss-bias', metavar='<bias>', default=0.0, type=float, help='loss bias towards low energy (default 0.0)')
+parser.add_argument('--logdir', metavar='<log dir>', help='directory for tensorboard log files')
+
+
+args = parser.parse_args()
+
+import importlib
+lpcnet = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+# try:
+# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+# except RuntimeError as e:
+# print(e)
+
+nb_epochs = args.epochs
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+quantize = args.quantize is not None
+retrain = args.retrain is not None
+
+if quantize:
+ lr = 0.00003
+ decay = 0
+ input_model = args.quantize
+else:
+ lr = 0.001
+ decay = 2.5e-5
+
+if args.lr is not None:
+ lr = args.lr
+
+if args.decay is not None:
+ decay = args.decay
+
+if retrain:
+ input_model = args.retrain
+
+def plc_loss(alpha=1.0, bias=0.):
+    # Combined PLC training loss factory.
+    # y_true carries the target features plus a trailing mask channel
+    # (last column) that selects which frames contribute to the loss.
+    # alpha weights the band-domain term (IDCT of the first columns --
+    # presumably cepstral coefficients mapped back to band energies;
+    # TODO confirm feature layout); bias adds an asymmetric penalty on
+    # positive band errors, gated by bias_mask, to bias predictions toward
+    # lower energy. Columns 18:19 get extra (clipped) weight -- presumably
+    # the pitch feature (cf. plc_pitch_loss).
+    def loss(y_true,y_pred):
+        mask = y_true[:,:,-1:]
+        y_true = y_true[:,:,:-1]
+        # Masked prediction error; lost/ignored frames contribute zero.
+        e = (y_pred - y_true)*mask
+        e_bands = tf.signal.idct(e[:,:,:-2], norm='ortho')
+        # Soft 0..1 gate derived from the last target feature.
+        bias_mask = K.minimum(1., K.maximum(0., 4*y_true[:,:,-1:]))
+        l1_loss = K.mean(K.abs(e)) + 0.1*K.mean(K.maximum(0., -e[:,:,-1:])) + alpha*K.mean(K.abs(e_bands) + bias*bias_mask*K.maximum(0., e_bands)) + K.mean(K.minimum(K.abs(e[:,:,18:19]),1.)) + 8*K.mean(K.minimum(K.abs(e[:,:,18:19]),.4))
+        return l1_loss
+    return loss
+
+def plc_l1_loss():
+    # Metric factory: plain masked L1 error over all feature columns.
+    # The trailing column of y_true is a mask selecting the frames that
+    # count toward the loss.
+    def L1_loss(y_true,y_pred):
+        mask = y_true[:,:,-1:]
+        y_true = y_true[:,:,:-1]
+        e = (y_pred - y_true)*mask
+        l1_loss = K.mean(K.abs(e))
+        return l1_loss
+    return L1_loss
+
+def plc_ceps_loss():
+    # Metric factory: masked L1 error over all but the last two feature
+    # columns -- presumably the cepstral part of the feature vector;
+    # TODO confirm feature layout.
+    def ceps_loss(y_true,y_pred):
+        mask = y_true[:,:,-1:]
+        y_true = y_true[:,:,:-1]
+        e = (y_pred - y_true)*mask
+        l1_loss = K.mean(K.abs(e[:,:,:-2]))
+        return l1_loss
+    return ceps_loss
+
+def plc_band_loss():
+    # Metric factory: masked L1 error in the band domain -- the first
+    # columns of the error are mapped through an orthonormal IDCT
+    # (cepstrum -> band energies, presumably; TODO confirm).
+    def L1_band_loss(y_true,y_pred):
+        mask = y_true[:,:,-1:]
+        y_true = y_true[:,:,:-1]
+        e = (y_pred - y_true)*mask
+        e_bands = tf.signal.idct(e[:,:,:-2], norm='ortho')
+        l1_loss = K.mean(K.abs(e_bands))
+        return l1_loss
+    return L1_band_loss
+
+def plc_pitch_loss():
+    # Metric factory: masked, clipped L1 error on feature column 18 --
+    # the pitch parameter, going by this metric's name. Clipping at .4
+    # caps the penalty for gross pitch errors.
+    def pitch_loss(y_true,y_pred):
+        mask = y_true[:,:,-1:]
+        y_true = y_true[:,:,:-1]
+        e = (y_pred - y_true)*mask
+        l1_loss = K.mean(K.minimum(K.abs(e[:,:,18:19]),.4))
+        return l1_loss
+    return pitch_loss
+
+opt = Adam(lr, decay=decay, beta_2=0.99)
+strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+
+with strategy.scope():
+ model = lpcnet.new_lpcnet_plc_model(rnn_units=args.gru_size, batch_size=batch_size, training=True, quantize=quantize, cond_size=args.cond_size)
+ model.compile(optimizer=opt, loss=plc_loss(alpha=args.band_loss, bias=args.loss_bias), metrics=[plc_l1_loss(), plc_ceps_loss(), plc_band_loss(), plc_pitch_loss()])
+ model.summary()
+
+lpc_order = 16
+
+feature_file = args.features
+nb_features = model.nb_used_features + lpc_order + model.nb_burg_features
+nb_used_features = model.nb_used_features
+nb_burg_features = model.nb_burg_features
+sequence_size = args.seq_length
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+
+features = np.memmap(feature_file, dtype='float32', mode='r')
+nb_sequences = len(features)//(nb_features*sequence_size)//batch_size*batch_size
+features = features[:nb_sequences*sequence_size*nb_features]
+
+features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
+
+features = features[:, :, :nb_used_features+model.nb_burg_features]
+
+lost = np.memmap(args.lost_file, dtype='int8', mode='r')
+
+# dump models to disk as we go
+checkpoint = ModelCheckpoint('{}_{}_{}.h5'.format(args.output, args.gru_size, '{epoch:02d}'))
+
+if args.retrain is not None:
+ model.load_weights(args.retrain)
+
+if quantize or retrain:
+ #Adapting from an existing model
+ model.load_weights(input_model)
+
+model.save_weights('{}_{}_initial.h5'.format(args.output, args.gru_size))
+
+loader = PLCLoader(features, lost, nb_burg_features, batch_size)
+
+callbacks = [checkpoint]
+if args.logdir is not None:
+ logdir = '{}/{}_{}_logs'.format(args.logdir, args.output, args.gru_size)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
+ callbacks.append(tensorboard_callback)
+
+model.fit(loader, epochs=nb_epochs, validation_split=0.0, callbacks=callbacks)
diff --git a/dnn/training_tf2/train_rdovae.py b/dnn/training_tf2/train_rdovae.py
new file mode 100644
index 00000000..b474b5f6
--- /dev/null
+++ b/dnn/training_tf2/train_rdovae.py
@@ -0,0 +1,151 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+import tensorflow as tf
+strategy = tf.distribute.MultiWorkerMirroredStrategy()
+
+
+import argparse
+#from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Train a quantization model')
+
+parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
+parser.add_argument('output', metavar='<output>', help='trained model file (.h5)')
+parser.add_argument('--model', metavar='<model>', default='rdovae', help='PLC model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--quantize', metavar='<input weights>', help='quantize model')
+group1.add_argument('--retrain', metavar='<input weights>', help='continue training model')
+parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+parser.add_argument('--epochs', metavar='<epochs>', default=120, type=int, help='number of epochs to train for (default 120)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=128, type=int, help='batch size to use (default 128)')
+parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')
+parser.add_argument('--lr', metavar='<learning rate>', type=float, help='learning rate')
+parser.add_argument('--decay', metavar='<decay>', type=float, help='learning rate decay')
+parser.add_argument('--logdir', metavar='<log dir>', help='directory for tensorboard log files')
+
+
+args = parser.parse_args()
+
+import importlib
+rdovae = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+# try:
+# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+# except RuntimeError as e:
+# print(e)
+
+nb_epochs = args.epochs
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+quantize = args.quantize is not None
+retrain = args.retrain is not None
+
+if quantize:
+ lr = 0.00003
+ decay = 0
+ input_model = args.quantize
+else:
+ lr = 0.001
+ decay = 2.5e-5
+
+if args.lr is not None:
+ lr = args.lr
+
+if args.decay is not None:
+ decay = args.decay
+
+if retrain:
+ input_model = args.retrain
+
+
+opt = Adam(lr, decay=decay, beta_2=0.99)
+
+with strategy.scope():
+ model, encoder, decoder, _ = rdovae.new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=batch_size, cond_size=args.cond_size, nb_quant=16)
+ model.compile(optimizer=opt, loss=[rdovae.feat_dist_loss, rdovae.feat_dist_loss, rdovae.sq1_rate_loss, rdovae.sq2_rate_loss], loss_weights=[.5, .5, 1., .1], metrics={'hard_bits':rdovae.sq_rate_metric})
+ model.summary()
+
+lpc_order = 16
+
+feature_file = args.features
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+sequence_size = args.seq_length
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+
+features = np.memmap(feature_file, dtype='float32', mode='r')
+nb_sequences = len(features)//(nb_features*sequence_size)//batch_size*batch_size
+features = features[:nb_sequences*sequence_size*nb_features]
+
+features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
+print(features.shape)
+features = features[:, :, :nb_used_features]
+
+#lambda_val = np.repeat(np.random.uniform(.0007, .002, (features.shape[0], 1, 1)), features.shape[1]//2, axis=1)
+#quant_id = np.round(10*np.log(lambda_val/.0007)).astype('int16')
+#quant_id = quant_id[:,:,0]
+quant_id = np.repeat(np.random.randint(16, size=(features.shape[0], 1, 1), dtype='int16'), features.shape[1]//2, axis=1)
+lambda_val = .0002*np.exp(quant_id/3.8)
+quant_id = quant_id[:,:,0]
+
+# dump models to disk as we go
+checkpoint = ModelCheckpoint('{}_{}_{}.h5'.format(args.output, args.cond_size, '{epoch:02d}'))
+
+if args.retrain is not None:
+ model.load_weights(args.retrain)
+
+if quantize or retrain:
+ #Adapting from an existing model
+ model.load_weights(input_model)
+
+model.save_weights('{}_{}_initial.h5'.format(args.output, args.cond_size))
+
+callbacks = [checkpoint]
+#callbacks = []
+
+if args.logdir is not None:
+ logdir = '{}/{}_{}_logs'.format(args.logdir, args.output, args.cond_size)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
+ callbacks.append(tensorboard_callback)
+
+model.fit([features, quant_id, lambda_val], [features, features, features, features], batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=callbacks)
diff --git a/dnn/training_tf2/ulaw.py b/dnn/training_tf2/ulaw.py
new file mode 100644
index 00000000..b79d4315
--- /dev/null
+++ b/dnn/training_tf2/ulaw.py
@@ -0,0 +1,19 @@
+
+import numpy as np
+import math
+
+scale = 255.0/32768.0
+scale_1 = 32768.0/255.0
+def ulaw2lin(u):
+    # mu-law (mu = 255) -> linear. Accepts values in [0, 255] with 128 as
+    # the zero point and returns floats in the 16-bit linear range.
+    # Inverse of lin2ulaw().
+    u = u - 128
+    s = np.sign(u)
+    u = np.abs(u)
+    return s*scale_1*(np.exp(u/128.*math.log(256))-1)
+
+
+def lin2ulaw(x):
+    # Linear -> mu-law (mu = 255). Input is in the 16-bit linear range;
+    # output is rounded and clipped to int16 values in [0, 255], with 128
+    # as the zero point. Inverse of ulaw2lin() (up to rounding).
+    s = np.sign(x)
+    x = np.abs(x)
+    u = (s*(128*np.log(1+scale*x)/math.log(256)))
+    u = np.clip(128 + np.round(u), 0, 255)
+    return u.astype('int16')
diff --git a/dnn/training_tf2/uniform_noise.py b/dnn/training_tf2/uniform_noise.py
new file mode 100644
index 00000000..6197dd5f
--- /dev/null
+++ b/dnn/training_tf2/uniform_noise.py
@@ -0,0 +1,78 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the UniformNoise layer."""
+
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow.keras import backend
+
+from tensorflow.keras.layers import Layer
+
+class UniformNoise(Layer):
+    """Apply additive zero-centered uniform noise.
+
+    Noise is drawn uniformly from [-stddev, stddev] and added to the input.
+    This is useful to mitigate overfitting
+    (you could see it as a form of random data augmentation).
+
+    As it is a regularization layer, it is only active at training time.
+
+    Args:
+      stddev: Float, half-width of the uniform noise interval. The name is
+        kept for API parity with keras.layers.GaussianNoise; the actual
+        standard deviation of the noise is stddev/sqrt(3).
+      seed: Integer, optional random seed. NOTE(review): accepted but not
+        stored or used by call(); confirm whether deterministic noise is
+        required.
+
+    Call arguments:
+      inputs: Input tensor (of any rank).
+      training: Python boolean indicating whether the layer should behave in
+        training mode (adding noise) or in inference mode (doing nothing).
+
+    Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+    Output shape:
+      Same shape as input.
+    """
+
+
+
+
+    def __init__(self, stddev=0.5, seed=None, **kwargs):
+        super().__init__(**kwargs)
+        # Passes masks through unchanged, like other noise layers.
+        self.supports_masking = True
+        self.stddev = stddev
+
+
+    def call(self, inputs, training=None):
+        # Uniform noise in [-stddev, stddev]; only applied in the training
+        # phase (in_train_phase returns `inputs` untouched at inference).
+        def noised():
+            return inputs + backend.random_uniform(
+                shape=tf.shape(inputs),
+                minval=-self.stddev,
+                maxval=self.stddev,
+                dtype=inputs.dtype,
+            )
+
+        return backend.in_train_phase(noised, inputs, training=training)
+
+    def get_config(self):
+        # Serialize stddev so the layer round-trips through model save/load.
+        config = {"stddev": self.stddev}
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
diff --git a/dnn/update_version b/dnn/update_version
new file mode 100755
index 00000000..a9999918
--- /dev/null
+++ b/dnn/update_version
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# Creates and updates the package_version information used by configure.ac
+# (or other makefiles). When run inside a git repository it will use the
+# version information that can be queried from it unless AUTO_UPDATE is set
+# to 'no'. If no version is currently known it will be set to 'unknown'.
+#
+# If called with the argument 'release', the PACKAGE_VERSION will be updated
+# even if AUTO_UPDATE=no, but the value of AUTO_UPDATE shall be preserved.
+# This is used to force a version update whenever `make dist` is run.
+#
+# The exit status is 1 if package_version is not modified, else 0 is returned.
+#
+# This script should NOT be included in distributed tarballs, because if a
+# parent directory contains a git repository we do not want to accidentally
+# retrieve the version information from it instead. Tarballs should ship
+# with only the package_version file.
+#
+# Ron <ron@debian.org>, 2012.
+
+SRCDIR=$(dirname $0)
+
+# Source any existing version info (sets AUTO_UPDATE and PACKAGE_VERSION).
+if [ -e "$SRCDIR/package_version" ]; then
+  . "$SRCDIR/package_version"
+fi
+
+if [ "$AUTO_UPDATE" = no ]; then
+  [ "$1" = release ] || exit 1
+else
+  AUTO_UPDATE=yes
+fi
+
+# We run `git status` before describe here to ensure that we don't get a false
+# -dirty from files that have been touched but are not actually altered in the
+# working dir.
+GIT_VERSION=$(cd "$SRCDIR" && git status > /dev/null 2>&1 \
+                           && git describe --tags --match 'v*' --dirty 2> /dev/null)
+# Strip the leading 'v' from the tag name (tags look like vX.Y.Z).
+GIT_VERSION=${GIT_VERSION#v}
+
+if [ -n "$GIT_VERSION" ]; then
+
+    # Exit 1 (unmodified) if git reports the version we already recorded.
+    [ "$GIT_VERSION" != "$PACKAGE_VERSION" ] || exit 1
+    PACKAGE_VERSION="$GIT_VERSION"
+
+elif [ -z "$PACKAGE_VERSION" ]; then
+    # No current package_version and no git ...
+    # We really shouldn't ever get here, because this script should only be
+    # included in the git repository, and should usually be export-ignored.
+    PACKAGE_VERSION="unknown"
+else
+    exit 1
+fi
+
+# Rewrite the version file; reaching this point means it was modified,
+# so the script exits 0 via the cat below.
+cat > "$SRCDIR/package_version" <<-EOF
+	# Automatically generated by update_version.
+	# This file may be sourced into a shell script or makefile.
+
+	# Set this to 'no' if you do not wish the version information
+	# to be checked and updated for every build.  Most people will
+	# never want to change this, it is an option for developers
+	# making frequent changes that they know will not be released.
+	AUTO_UPDATE=$AUTO_UPDATE
+
+	PACKAGE_VERSION="$PACKAGE_VERSION"
+EOF
diff --git a/dnn/vec.h b/dnn/vec.h
new file mode 100644
index 00000000..c7f2e695
--- /dev/null
+++ b/dnn/vec.h
@@ -0,0 +1,409 @@
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef VEC_H
+#define VEC_H
+
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+
+#if defined(__AVX__) || defined(__SSE2__)
+#include "vec_avx.h"
+#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && !defined(DISABLE_NEON)
+#include "vec_neon.h"
+#else
+
+#define MAX_INPUTS (2048)
+
+#define NO_OPTIMIZATIONS
+
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+/*#define USE_SU_BIAS*/
+#endif
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+
+/* No AVX2/FMA support */
+#ifndef LPCNET_TEST
+static inline float lpcnet_exp2(float x)
+{
+ int integer;
+ float frac;
+ union {
+ float f;
+ opus_uint32 i;
+ } res;
+ integer = floor(x);
+ if (integer < -50)
+ return 0;
+ frac = x-integer;
+ /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+ res.f = 0.99992522f + frac * (0.69583354f
+ + frac * (0.22606716f + 0.078024523f*frac));
+ res.i = (res.i + (integer<<23)) & 0x7fffffff;
+ return res.f;
+}
+#define lpcnet_exp(x) lpcnet_exp2((x)*1.44269504f)
+
+static inline float tanh_approx(float x)
+{
+ int i;
+ float y, dy;
+ float sign=1;
+ if (x<0)
+ {
+ x=-x;
+ sign=-1;
+ }
+ i = (int)floor(.5f+25*x);
+ i = IMAX(0, IMIN(200, i));
+ x -= .04f*i;
+ y = tansig_table[i];
+ dy = 1-y*y;
+ y = y + x*dy*(1 - y*x);
+ return sign*y;
+}
+
+static inline float sigmoid_approx(float x)
+{
+ return .5f + .5f*tanh_approx(.5f*x);
+}
+
+static inline void softmax(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ y[i] = lpcnet_exp(x[i]);
+}
+
+static inline void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = tanh_approx(x[i]);
+ }
+}
+
+static inline void vec_sigmoid(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = sigmoid_approx(x[i]);
+ }
+}
+#endif
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ for (j=0;j<cols;j++)
+ {
+ const float * restrict w;
+ float * restrict y;
+ float xj;
+ w = &weights[j*col_stride + i];
+ xj = x[j];
+ y = &out[i];
+ y[0] += w[0]*xj;
+ y[1] += w[1]*xj;
+ y[2] += w[2]*xj;
+ y[3] += w[3]*xj;
+ y[4] += w[4]*xj;
+ y[5] += w[5]*xj;
+ y[6] += w[6]*xj;
+ y[7] += w[7]*xj;
+ y[8] += w[8]*xj;
+ y[9] += w[9]*xj;
+ y[10] += w[10]*xj;
+ y[11] += w[11]*xj;
+ y[12] += w[12]*xj;
+ y[13] += w[13]*xj;
+ y[14] += w[14]*xj;
+ y[15] += w[15]*xj;
+ }
+ }
+}
+
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ int cols;
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ float * restrict y;
+ float xj;
+ xj = x[*idx++];
+ y = &out[i];
+ y[0] += w[0]*xj;
+ y[1] += w[1]*xj;
+ y[2] += w[2]*xj;
+ y[3] += w[3]*xj;
+ y[4] += w[4]*xj;
+ y[5] += w[5]*xj;
+ y[6] += w[6]*xj;
+ y[7] += w[7]*xj;
+ y[8] += w[8]*xj;
+ y[9] += w[9]*xj;
+ y[10] += w[10]*xj;
+ y[11] += w[11]*xj;
+ y[12] += w[12]*xj;
+ y[13] += w[13]*xj;
+ y[14] += w[14]*xj;
+ y[15] += w[15]*xj;
+ w += 16;
+ }
+ }
+}
+
+#ifdef DOT_PROD
+
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+
+#ifdef USE_SU_BIAS
+
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#else /*USE_SU_BIAS*/
+
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif /*USE_SU_BIAS*/
+
+#else /*DOT_PROD*/
+
+#define sgemv_accum8x4 sgemv_accum
+
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
+{
+ int i, j;
+ (void)ignore;
+ for (i=0;i<rows;i+=8)
+ {
+ int cols;
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int pos;
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ pos = (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += w[0]*xj0;
+ y[1] += w[1]*xj0;
+ y[2] += w[2]*xj0;
+ y[3] += w[3]*xj0;
+ y[4] += w[4]*xj0;
+ y[5] += w[5]*xj0;
+ y[6] += w[6]*xj0;
+ y[7] += w[7]*xj0;
+
+ y[0] += w[8]*xj1;
+ y[1] += w[9]*xj1;
+ y[2] += w[10]*xj1;
+ y[3] += w[11]*xj1;
+ y[4] += w[12]*xj1;
+ y[5] += w[13]*xj1;
+ y[6] += w[14]*xj1;
+ y[7] += w[15]*xj1;
+
+ y[0] += w[16]*xj2;
+ y[1] += w[17]*xj2;
+ y[2] += w[18]*xj2;
+ y[3] += w[19]*xj2;
+ y[4] += w[20]*xj2;
+ y[5] += w[21]*xj2;
+ y[6] += w[22]*xj2;
+ y[7] += w[23]*xj2;
+
+ y[0] += w[24]*xj3;
+ y[1] += w[25]*xj3;
+ y[2] += w[26]*xj3;
+ y[3] += w[27]*xj3;
+ y[4] += w[28]*xj3;
+ y[5] += w[29]*xj3;
+ y[6] += w[30]*xj3;
+ y[7] += w[31]*xj3;
+ w += 32;
+ }
+ }
+}
+#endif /*DOT_PROD*/
+
+
+#endif /*no optimizations*/
+#endif /*VEC_H*/
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
new file mode 100644
index 00000000..733cf6a9
--- /dev/null
+++ b/dnn/vec_avx.h
@@ -0,0 +1,906 @@
+/* Copyright (c) 2018 Mozilla
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/*
+ AVX implementation of vector operations, compile with -mavx
+ AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
+*/
+
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
+#include <immintrin.h>
+#include <math.h>
+
+/* Use 8-bit dot products unless disabled or if stuck with SSE2. */
+#if (defined(__AVX2__) || defined(__SSSE3__)) && !defined(DISABLE_DOT_PROD)
+#define DOT_PROD
+#define USE_SU_BIAS
+
+#else
+
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+
+#endif
+
+
+#ifndef __SSE_4_1__
+static inline __m128 mm_floor_ps(__m128 x) {
+ __m128 half = _mm_set1_ps(0.5);
+ return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(x, half)));
+}
+#undef _mm_floor_ps
+#define _mm_floor_ps(x) mm_floor_ps(x)
+#endif
+
+
+/* If we don't have AVX available, emulate what we need with SSE up to 4.1. */
+#ifndef __AVX__
+
+typedef struct {
+ __m128 lo;
+ __m128 hi;
+} mm256_emu;
+#define __m256 mm256_emu
+
+static inline mm256_emu mm256_loadu_ps(const float *src) {
+ mm256_emu ret;
+ ret.lo = _mm_loadu_ps(&src[0]);
+ ret.hi = _mm_loadu_ps(&src[4]);
+ return ret;
+}
+#define _mm256_loadu_ps(src) mm256_loadu_ps(src)
+
+
+static inline void mm256_storeu_ps(float *dst, mm256_emu src) {
+ _mm_storeu_ps(dst, src.lo);
+ _mm_storeu_ps(&dst[4], src.hi);
+}
+#define _mm256_storeu_ps(dst, src) mm256_storeu_ps(dst, src)
+
+
+static inline mm256_emu mm256_setzero_ps(void) {
+ mm256_emu ret;
+ ret.lo = _mm_setzero_ps();
+ ret.hi = ret.lo;
+ return ret;
+}
+#define _mm256_setzero_ps mm256_setzero_ps
+
+static inline mm256_emu mm256_broadcast_ss(const float *x) {
+ mm256_emu ret;
+ ret.lo = _mm_set1_ps(*x);
+ ret.hi = ret.lo;
+ return ret;
+}
+#define _mm256_broadcast_ss(x) mm256_broadcast_ss(x)
+
+static inline mm256_emu mm256_set1_ps(float x) {
+ mm256_emu ret;
+ ret.lo = _mm_set1_ps(x);
+ ret.hi = ret.lo;
+ return ret;
+}
+#define _mm256_set1_ps(x) mm256_set1_ps(x)
+
+
+
+static inline mm256_emu mm256_mul_ps(mm256_emu a, mm256_emu b) {
+ mm256_emu ret;
+ ret.lo = _mm_mul_ps(a.lo, b.lo);
+ ret.hi = _mm_mul_ps(a.hi, b.hi);
+ return ret;
+}
+#define _mm256_mul_ps(a,b) mm256_mul_ps(a,b)
+
+static inline mm256_emu mm256_add_ps(mm256_emu a, mm256_emu b) {
+ mm256_emu ret;
+ ret.lo = _mm_add_ps(a.lo, b.lo);
+ ret.hi = _mm_add_ps(a.hi, b.hi);
+ return ret;
+}
+#define _mm256_add_ps(a,b) mm256_add_ps(a,b)
+
+
+static inline mm256_emu mm256_max_ps(mm256_emu a, mm256_emu b) {
+ mm256_emu ret;
+ ret.lo = _mm_max_ps(a.lo, b.lo);
+ ret.hi = _mm_max_ps(a.hi, b.hi);
+ return ret;
+}
+#define _mm256_max_ps(a,b) mm256_max_ps(a,b)
+
+static inline mm256_emu mm256_min_ps(mm256_emu a, mm256_emu b) {
+ mm256_emu ret;
+ ret.lo = _mm_min_ps(a.lo, b.lo);
+ ret.hi = _mm_min_ps(a.hi, b.hi);
+ return ret;
+}
+#define _mm256_min_ps(a,b) mm256_min_ps(a,b)
+
+static inline mm256_emu mm256_rcp_ps(mm256_emu a) {
+ mm256_emu ret;
+ ret.lo = _mm_rcp_ps(a.lo);
+ ret.hi = _mm_rcp_ps(a.hi);
+ return ret;
+}
+#define _mm256_rcp_ps(a) mm256_rcp_ps(a)
+
+
+static inline __m128 mm256_extractf128_ps(mm256_emu x, int i) {
+ return (i==0) ? x.lo : x.hi;
+}
+#undef _mm256_extractf128_ps
+#define _mm256_extractf128_ps(x,i) mm256_extractf128_ps(x,i)
+
+static inline mm256_emu mm256_insertf128_ps(mm256_emu dst, __m128 src, int i) {
+ if (i==0) dst.lo = src;
+ else dst.hi = src;
+ return dst;
+}
+#undef _mm256_insertf128_ps
+#define _mm256_insertf128_ps(dst,src,i) mm256_insertf128_ps(dst,src,i)
+
+#endif /* __AVX__ */
+
+
+
+/* If we don't have AVX2 available, emulate what we need with SSE up to 4.1. */
+#ifndef __AVX2__
+
+typedef struct {
+ __m128i lo;
+ __m128i hi;
+} mm256i_emu;
+typedef __m256i real_m256i;
+#define __m256i mm256i_emu
+
+
+static inline mm256i_emu mm256_loadu_si256(const mm256i_emu *src) {
+ mm256i_emu ret;
+ ret.lo = _mm_loadu_si128((const __m128i*)src);
+ ret.hi = _mm_loadu_si128((const __m128i*)(&((const char *)src)[16]));
+ return ret;
+}
+#define _mm256_loadu_si256(src) mm256_loadu_si256(src)
+
+
+static inline void mm256_storeu_si256(mm256i_emu *dst, mm256i_emu src) {
+ _mm_storeu_si128((__m128i*)dst, src.lo);
+ _mm_storeu_si128((__m128i*)(&((char *)dst)[16]), src.hi);
+}
+#define _mm256_storeu_si256(dst, src) mm256_storeu_si256(dst, src)
+
+
+static inline mm256i_emu mm256_set1_epi32(int x) {
+ mm256i_emu ret;
+ ret.lo = _mm_set1_epi32(x);
+ ret.hi = ret.lo;
+ return ret;
+}
+#define _mm256_set1_epi32(x) mm256_set1_epi32(x)
+
+static inline mm256i_emu mm256_set1_epi16(int x) {
+ mm256i_emu ret;
+ ret.lo = _mm_set1_epi16(x);
+ ret.hi = ret.lo;
+ return ret;
+}
+#define _mm256_set1_epi16(x) mm256_set1_epi16(x)
+
+
+static inline mm256i_emu mm256_add_epi32(mm256i_emu a, mm256i_emu b) {
+ mm256i_emu ret;
+ ret.lo = _mm_add_epi32(a.lo, b.lo);
+ ret.hi = _mm_add_epi32(a.hi, b.hi);
+ return ret;
+}
+#define _mm256_add_epi32(a,b) mm256_add_epi32(a,b)
+
+static inline mm256i_emu mm256_madd_epi16(mm256i_emu a, mm256i_emu b) {
+ mm256i_emu ret;
+ ret.lo = _mm_madd_epi16(a.lo, b.lo);
+ ret.hi = _mm_madd_epi16(a.hi, b.hi);
+ return ret;
+}
+#define _mm256_madd_epi16(a,b) mm256_madd_epi16(a,b)
+
+static inline mm256i_emu mm256_maddubs_epi16(mm256i_emu a, mm256i_emu b) {
+ mm256i_emu ret;
+ ret.lo = _mm_maddubs_epi16(a.lo, b.lo);
+ ret.hi = _mm_maddubs_epi16(a.hi, b.hi);
+ return ret;
+}
+#define _mm256_maddubs_epi16(a,b) mm256_maddubs_epi16(a,b)
+
+
+
+/* Emulating the conversion functions is tricky because they use __m256i but are defined in AVX.
+ So we need to make a special when only AVX is available. */
+#ifdef __AVX__
+
+typedef union {
+ mm256i_emu fake;
+ real_m256i real;
+} mm256_union;
+
+static inline __m256 mm256_cvtepi32_ps(mm256i_emu a) {
+ mm256_union src;
+ src.fake = a;
+ return _mm256_cvtepi32_ps(src.real);
+}
+#define _mm256_cvtepi32_ps(a) mm256_cvtepi32_ps(a)
+
+static inline mm256i_emu mm256_cvtps_epi32(__m256 a) {
+ mm256_union ret;
+ ret.real = _mm256_cvtps_epi32(a);
+ return ret.fake;
+}
+#define _mm256_cvtps_epi32(a) mm256_cvtps_epi32(a)
+
+
+#else
+
+static inline mm256_emu mm256_cvtepi32_ps(mm256i_emu a) {
+ mm256_emu ret;
+ ret.lo = _mm_cvtepi32_ps(a.lo);
+ ret.hi = _mm_cvtepi32_ps(a.hi);
+ return ret;
+}
+#define _mm256_cvtepi32_ps(a) mm256_cvtepi32_ps(a)
+
+static inline mm256i_emu mm256_cvtps_epi32(mm256_emu a) {
+ mm256i_emu ret;
+ ret.lo = _mm_cvtps_epi32(a.lo);
+ ret.hi = _mm_cvtps_epi32(a.hi);
+ return ret;
+}
+#define _mm256_cvtps_epi32(a) mm256_cvtps_epi32(a)
+
+#endif /* __AVX__ */
+
+
+#endif /* __AVX2__ */
+
+/* In case we don't have FMA, make it a mul and an add. */
+#if !(defined(__FMA__) && defined(__AVX__))
+#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
+#endif
+
+#ifdef __AVX2__
+static inline __m256 exp8_approx(__m256 X)
+{
+ const __m256 K0 = _mm256_set1_ps(0.99992522f);
+ const __m256 K1 = _mm256_set1_ps(0.69583354f);
+ const __m256 K2 = _mm256_set1_ps(0.22606716f);
+ const __m256 K3 = _mm256_set1_ps(0.078024523f);
+ const __m256 log2_E = _mm256_set1_ps(1.44269504f);
+ const __m256 max_in = _mm256_set1_ps(50.f);
+ const __m256 min_in = _mm256_set1_ps(-50.f);
+ __m256 XF, Y;
+ __m256i I;
+ X = _mm256_mul_ps(X, log2_E);
+ X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
+ XF = _mm256_floor_ps(X);
+ I = _mm256_cvtps_epi32(XF);
+ X = _mm256_sub_ps(X, XF);
+ Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
+ I = _mm256_slli_epi32(I, 23);
+ Y = _mm256_castsi256_ps(_mm256_add_epi32(I, _mm256_castps_si256(Y)));
+ return Y;
+}
+
+static inline void vector_ps_to_epi8(unsigned char *x, const float *_x, int len) {
+ int i;
+ __m256 const127 = _mm256_set1_ps(127.f);
+ for (i=0;i<len;i+=8) {
+ __m256 xf;
+ __m256i xi;
+ xf = _mm256_loadu_ps(&_x[i]);
+ xf = _mm256_fmadd_ps(xf, const127, const127);
+ xi = _mm256_cvtps_epi32(xf);
+ xi = _mm256_packus_epi32(xi, _mm256_setzero_si256());
+ xi = _mm256_permute4x64_epi64(xi, 0xD8);
+ xi = _mm256_packus_epi16(xi, _mm256_setzero_si256());
+ xi = _mm256_permutevar8x32_epi32(xi, _mm256_setr_epi32(0,1, 0,0, 0,0, 0,0));
+ _mm256_storeu_si256 ((__m256i *)&x[i], xi);
+ }
+}
+
+#else
+static inline __m128 exp4_approx(__m128 X)
+{
+ const __m128 K0 = _mm_set1_ps(0.99992522f);
+ const __m128 K1 = _mm_set1_ps(0.69583354f);
+ const __m128 K2 = _mm_set1_ps(0.22606716f);
+ const __m128 K3 = _mm_set1_ps(0.078024523f);
+ const __m128 log2_E = _mm_set1_ps(1.44269504);
+ const __m128 max_in = _mm_set1_ps(50.f);
+ const __m128 min_in = _mm_set1_ps(-50.f);
+ const __m128i mask = _mm_set1_epi32(0x7fffffff);
+ __m128 XF, Y;
+ __m128i I;
+ X = _mm_mul_ps(X, log2_E);
+ X = _mm_max_ps(min_in, _mm_min_ps(max_in, X));
+ XF = _mm_floor_ps(X);
+ I = _mm_cvtps_epi32(XF);
+ X = _mm_sub_ps(X, XF);
+ Y = _mm_fmadd_ps(_mm_fmadd_ps(_mm_fmadd_ps(K3, X, K2), X, K1), X, K0);
+ I = _mm_slli_epi32(I, 23);
+ Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
+ return Y;
+}
+static inline __m256 exp8_approx(__m256 X)
+{
+ __m256 Y;
+ __m128 Xhi, Xlo, Yhi, Ylo;
+ Xhi = _mm256_extractf128_ps(X, 1);
+ Xlo = _mm256_extractf128_ps(X, 0);
+ Yhi = exp4_approx(Xhi);
+ Ylo = exp4_approx(Xlo);
+ Y = _mm256_insertf128_ps(_mm256_setzero_ps(), Yhi, 1);
+ Y = _mm256_insertf128_ps(Y, Ylo, 0);
+ return Y;
+}
+
+static inline void vector_ps_to_epi8(unsigned char *x, const float *_x, int len) {
+ int i;
+ for (i=0;i<len;i++) x[i] = 127+floor(.5+127*_x[i]);
+}
+
+#endif
+
+
+#ifdef __AVX__
+
+/* Approximating tanh() using a Padé-like rational function:
+ tanh(x) ~= x * (N0 + N1*x^2 + N2*x^4)/(D0 + D1*x^2 + D2*x^4)
+ subject to the +/- 1 bounds.
+ The coefficients were determined by gradient descent trying to minimize
+ the maximum deviation over the whole range (this is only possible because
+ of the bounds). The max error is around 3e-4 and is dominated by the
+ reciprocal approximation (the max error of the rational function is
+ around 6e-5).
+ */
+static inline __m256 tanh8_approx(__m256 X)
+{
+ const __m256 N0 = _mm256_set1_ps(952.52801514f);
+ const __m256 N1 = _mm256_set1_ps(96.39235687f);
+ const __m256 N2 = _mm256_set1_ps(0.60863042f);
+ const __m256 D0 = _mm256_set1_ps(952.72399902f);
+ const __m256 D1 = _mm256_set1_ps(413.36801147f);
+ const __m256 D2 = _mm256_set1_ps(11.88600922f);
+ const __m256 max_out = _mm256_set1_ps(1.f);
+ const __m256 min_out = _mm256_set1_ps(-1.f);
+ __m256 X2, num, den;
+ X2 = _mm256_mul_ps(X, X);
+ num = _mm256_fmadd_ps(_mm256_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm256_fmadd_ps(_mm256_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm256_mul_ps(num, X);
+ den = _mm256_rcp_ps(den);
+ num = _mm256_mul_ps(num, den);
+ return _mm256_max_ps(min_out, _mm256_min_ps(max_out, num));
+}
+
+/* Sigmoid approximation using a Padé-like rational function:
+ 1/(1+exp(-x)) ~= 0.5 + x * (N0 + N1*x^2 + N2*x^4)/(D0 + D1*x^2 + D2*x^4)
+ subject to the [0, 1] bounds.
+ The coefficients are directly derived by dividing the tanh() coefficients
+ by powers of two to get the correct scaling. The max error is around 1.5e-4
+ and is dominated by the reciprocal approximation (the max error of the
+ rational function is around 3e-5).
+ */
+static inline __m256 sigmoid8_approx(__m256 X)
+{
+ const __m256 N0 = _mm256_set1_ps(238.13200378f);
+ const __m256 N1 = _mm256_set1_ps(6.02452230f);
+ const __m256 N2 = _mm256_set1_ps(0.00950985f);
+ const __m256 D0 = _mm256_set1_ps(952.72399902f);
+ const __m256 D1 = _mm256_set1_ps(103.34200287f);
+ const __m256 D2 = _mm256_set1_ps(0.74287558f);
+ const __m256 half = _mm256_set1_ps(0.5);
+ const __m256 max_out = _mm256_set1_ps(1.f);
+ const __m256 min_out = _mm256_set1_ps(0.f);
+ __m256 X2, num, den;
+ X2 = _mm256_mul_ps(X, X);
+ num = _mm256_fmadd_ps(_mm256_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm256_fmadd_ps(_mm256_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm256_mul_ps(num, X);
+ den = _mm256_rcp_ps(den);
+ num = _mm256_fmadd_ps(num, den, half);
+ return _mm256_max_ps(min_out, _mm256_min_ps(max_out, num));
+}
+
+static inline float tanh_approx(float x)
+{
+ float out[8];
+ __m256 X, Y;
+ X = _mm256_set1_ps(x);
+ Y = tanh8_approx(X);
+ _mm256_storeu_ps(out, Y);
+ return out[0];
+}
+
+static inline float sigmoid_approx(float x)
+{
+ float out[8];
+ __m256 X, Y;
+ X = _mm256_set1_ps(x);
+ Y = sigmoid8_approx(X);
+ _mm256_storeu_ps(out, Y);
+ return out[0];
+}
+
+#else
+
+static inline __m128 tanh4_approx(__m128 X)
+{
+ const __m128 N0 = _mm_set1_ps(952.52801514f);
+ const __m128 N1 = _mm_set1_ps(96.39235687f);
+ const __m128 N2 = _mm_set1_ps(0.60863042f);
+ const __m128 D0 = _mm_set1_ps(952.72399902f);
+ const __m128 D1 = _mm_set1_ps(413.36801147f);
+ const __m128 D2 = _mm_set1_ps(11.88600922f);
+ const __m128 max_out = _mm_set1_ps(1.f);
+ const __m128 min_out = _mm_set1_ps(-1.f);
+ __m128 X2, num, den;
+ X2 = _mm_mul_ps(X, X);
+ num = _mm_fmadd_ps(_mm_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm_fmadd_ps(_mm_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm_mul_ps(num, X);
+ den = _mm_rcp_ps(den);
+ num = _mm_mul_ps(num, den);
+ return _mm_max_ps(min_out, _mm_min_ps(max_out, num));
+}
+
+static inline __m128 sigmoid4_approx(__m128 X)
+{
+ const __m128 N0 = _mm_set1_ps(238.13200378f);
+ const __m128 N1 = _mm_set1_ps(6.02452230f);
+ const __m128 N2 = _mm_set1_ps(0.00950985f);
+ const __m128 D0 = _mm_set1_ps(952.72399902f);
+ const __m128 D1 = _mm_set1_ps(103.34200287f);
+ const __m128 D2 = _mm_set1_ps(0.74287558f);
+ const __m128 half = _mm_set1_ps(0.5);
+ const __m128 max_out = _mm_set1_ps(1.f);
+ const __m128 min_out = _mm_set1_ps(0.f);
+ __m128 X2, num, den;
+ X2 = _mm_mul_ps(X, X);
+ num = _mm_fmadd_ps(_mm_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm_fmadd_ps(_mm_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm_mul_ps(num, X);
+ den = _mm_rcp_ps(den);
+ num = _mm_fmadd_ps(num, den, half);
+ return _mm_max_ps(min_out, _mm_min_ps(max_out, num));
+}
+
+static inline float tanh_approx(float x)
+{
+ float out[4];
+ __m128 X, Y;
+ X = _mm_set1_ps(x);
+ Y = tanh4_approx(X);
+ _mm_storeu_ps(out, Y);
+ return out[0];
+}
+
+static inline float sigmoid_approx(float x)
+{
+ float out[4];
+ __m128 X, Y;
+ X = _mm_set1_ps(x);
+ Y = sigmoid4_approx(X);
+ _mm_storeu_ps(out, Y);
+ return out[0];
+}
+
+#endif
+
+static inline float lpcnet_exp(float x)
+{
+ float out[8];
+ __m256 X, Y;
+ X = _mm256_set1_ps(x);
+ Y = exp8_approx(X);
+ _mm256_storeu_ps(out, Y);
+ return out[0];
+}
+
+static inline void softmax(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-7;i+=8)
+ {
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ Y = exp8_approx(X);
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ y[i] = lpcnet_exp(x[i]);
+}
+
+#ifdef __AVX__
+static inline void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-7;i+=8)
+ {
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ Y = tanh8_approx(X);
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ y[i] = tanh_approx(x[i]);
+ }
+}
+
+static inline void vec_sigmoid(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-7;i+=8)
+ {
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ Y = sigmoid8_approx(X);
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ y[i] = sigmoid_approx(x[i]);
+ }
+}
+#else
+static inline void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 X, Y;
+ X = _mm_loadu_ps(&x[i]);
+ Y = tanh4_approx(X);
+ _mm_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ y[i] = tanh_approx(x[i]);
+ }
+}
+
+static inline void vec_sigmoid(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 X, Y;
+ X = _mm_loadu_ps(&x[i]);
+ Y = sigmoid4_approx(X);
+ _mm_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ y[i] = sigmoid_approx(x[i]);
+ }
+}
+
+#endif
+
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float *y;
+ __m256 vy0, vy8;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ vy8 = _mm256_loadu_ps(&y[8]);
+ for (j=0;j<cols;j++)
+ {
+ __m256 vxj;
+ __m256 vw;
+ vxj = _mm256_broadcast_ss(&x[j]);
+
+ vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
+ vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ _mm256_storeu_ps (&y[8], vy8);
+ }
+}
+static inline void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float *y;
+ int cols;
+ __m256 vy0, vy8;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ vy8 = _mm256_loadu_ps(&y[8]);
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int id;
+ __m256 vxj;
+ __m256 vw;
+ id = *idx++;
+ vxj = _mm256_broadcast_ss(&x[id]);
+
+ vw = _mm256_loadu_ps(&weights[0]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vw = _mm256_loadu_ps(&weights[8]);
+ vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+ weights += 16;
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ _mm256_storeu_ps (&y[8], vy8);
+ }
+}
+
+#ifdef DOT_PROD
+#define USE_SU_BIAS
+
+typedef signed char qweight;
+
+
+#define MAX_INPUTS (2048)
+#define MAX_OUTPUTS (8192)
+
+
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+#if 1
+static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ __m256i ones;
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ (void)col_stride;
+ ones = _mm256_set1_epi16(1);
+ /*for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);*/
+ vector_ps_to_epi8(x, _x, cols);
+ for (i=0;i<rows;i+=8)
+ {
+ __m256i vy0;
+ __m256 vout;
+ vout = _mm256_loadu_ps(&_out[i]);
+ vout = _mm256_mul_ps(vout, _mm256_set1_ps(SCALE));
+ vy0 = _mm256_cvtps_epi32(vout);
+ j=0;
+#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
+ for (;j<cols-12;j+=16)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ vxj = _mm256_set1_epi32(*(int*)&x[j]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[j+4]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[j+8]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[j+12]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+#endif
+ for (;j<cols;j+=4)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ vxj = _mm256_set1_epi32(*(int*)&x[j]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+ vout = _mm256_cvtepi32_ps(vy0);
+ vout = _mm256_mul_ps(vout, _mm256_set1_ps(SCALE_1));
+ _mm256_storeu_ps(&_out[i], vout);
+ }
+}
+#else
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float *y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif
+
+static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ __m256i ones;
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ ones = _mm256_set1_epi16(1);
+ /*for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);*/
+ vector_ps_to_epi8(x, _x, cols);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ __m256i vy0;
+ __m256 vout;
+ colblocks = *idx++;
+ vout = _mm256_loadu_ps(&_out[i]);
+ vout = _mm256_mul_ps(vout, _mm256_set1_ps(SCALE));
+ vy0 = _mm256_cvtps_epi32(vout);
+ j=0;
+#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
+ for (;j<colblocks-3;j+=4)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+#endif
+ for (;j<colblocks;j++)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ int pos;
+ pos = (*idx++);
+ vxj = _mm256_set1_epi32(*(int*)&x[pos]);
+ vw = _mm256_loadu_si256((const __m256i *)w);
+ tmp = _mm256_maddubs_epi16(vxj, vw);
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+ vout = _mm256_cvtepi32_ps(vy0);
+ vout = _mm256_mul_ps(vout, _mm256_set1_ps(SCALE_1));
+ _mm256_storeu_ps(&_out[i], vout);
+ }
+}
+
+
+#else /*DOT_PROD*/
+typedef float qweight;
+#define sgemv_accum8x4 sgemv_accum
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
+{
+ int i, j;
+ (void)ignore;
+ for (i=0;i<rows;i+=8)
+ {
+ float *y;
+ int cols;
+ __m256 vy0;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int id;
+ __m256 vxj;
+ __m256 vw;
+ id = *idx++;
+ vxj = _mm256_broadcast_ss(&x[id]);
+ vw = _mm256_loadu_ps(&weights[0]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[id+1]);
+ vw = _mm256_loadu_ps(&weights[8]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[id+2]);
+ vw = _mm256_loadu_ps(&weights[16]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[id+3]);
+ vw = _mm256_loadu_ps(&weights[24]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ weights += 32;
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ }
+}
+#endif /*DOT_PROD*/
+
+#endif /*VEC_AVX_H*/
diff --git a/dnn/vec_neon.h b/dnn/vec_neon.h
new file mode 100644
index 00000000..b21d3896
--- /dev/null
+++ b/dnn/vec_neon.h
@@ -0,0 +1,373 @@
+/* Copyright (c) 2018 David Rowe
+ 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* NEON support for ARM machines */
+
+#ifndef VEC_NEON_H
+#define VEC_NEON_H
+
+#include <arm_neon.h>
+
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+#endif
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+
+#ifndef LPCNET_TEST
+static inline OPUS_INLINE float32x4_t exp4_approx(float32x4_t x) {
+ int32x4_t i;
+ float32x4_t xf;
+
+ x = vmaxq_f32(vminq_f32(x, vdupq_n_f32(88.f)), vdupq_n_f32(-88.f));
+
+ /* express exp(x) as exp2(x/log(2)), add 127 for the exponent later */
+ x = vmlaq_f32(vdupq_n_f32(127.f), x, vdupq_n_f32(1.44269504f));
+
+ /* split into integer and fractional parts */
+ i = vcvtq_s32_f32(x);
+ xf = vcvtq_f32_s32(i);
+ x = vsubq_f32(x, xf);
+
+ float32x4_t K0 = vdupq_n_f32(0.99992522f);
+ float32x4_t K1 = vdupq_n_f32(0.69583354f);
+ float32x4_t K2 = vdupq_n_f32(0.22606716f);
+ float32x4_t K3 = vdupq_n_f32(0.078024523f);
+ float32x4_t Y = vmlaq_f32(K0, x, vmlaq_f32(K1, x, vmlaq_f32(K2, K3, x)));
+
+ /* compute 2^i */
+ float32x4_t exponent = vreinterpretq_f32_s32(vshlq_n_s32(i, 23));
+
+ Y = vmulq_f32(Y, exponent);
+ return Y;
+}
+
+static inline float32x4_t tanh4_approx(float32x4_t X)
+{
+ const float32x4_t N0 = vdupq_n_f32(952.52801514f);
+ const float32x4_t N1 = vdupq_n_f32(96.39235687f);
+ const float32x4_t N2 = vdupq_n_f32(0.60863042f);
+ const float32x4_t D0 = vdupq_n_f32(952.72399902f);
+ const float32x4_t D1 = vdupq_n_f32(413.36801147f);
+ const float32x4_t D2 = vdupq_n_f32(11.88600922f);
+ const float32x4_t max_out = vdupq_n_f32(1.f);
+ const float32x4_t min_out = vdupq_n_f32(-1.f);
+ float32x4_t X2, num, den;
+ X2 = vmulq_f32(X, X);
+ num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2));
+ den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2));
+ num = vmulq_f32(num, X);
+ den = vrecpeq_f32(den);
+ num = vmulq_f32(num, den);
+ return vmaxq_f32(min_out, vminq_f32(max_out, num));
+}
+
+static inline float32x4_t sigmoid4_approx(float32x4_t X)
+{
+ const float32x4_t N0 = vdupq_n_f32(238.13200378f);
+ const float32x4_t N1 = vdupq_n_f32(6.02452230f);
+ const float32x4_t N2 = vdupq_n_f32(0.00950985f);
+ const float32x4_t D0 = vdupq_n_f32(952.72399902f);
+ const float32x4_t D1 = vdupq_n_f32(103.34200287f);
+ const float32x4_t D2 = vdupq_n_f32(0.74287558f);
+ const float32x4_t half = vdupq_n_f32(0.5f);
+ const float32x4_t max_out = vdupq_n_f32(1.f);
+ const float32x4_t min_out = vdupq_n_f32(0.f);
+ float32x4_t X2, num, den;
+ X2 = vmulq_f32(X, X);
+ num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2));
+ den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2));
+ num = vmulq_f32(num, X);
+ den = vrecpeq_f32(den);
+ num = vmlaq_f32(half, num, den);
+ return vmaxq_f32(min_out, vminq_f32(max_out, num));
+}
+
+static inline float lpcnet_exp(float x)
+{
+ float out[4];
+ float32x4_t X, Y;
+ X = vdupq_n_f32(x);
+ Y = exp4_approx(X);
+ vst1q_f32(out, Y);
+ return out[0];
+}
+
+static inline float tanh_approx(float x)
+{
+ float out[4];
+ float32x4_t X, Y;
+ X = vdupq_n_f32(x);
+ Y = tanh4_approx(X);
+ vst1q_f32(out, Y);
+ return out[0];
+}
+
+static inline float sigmoid_approx(float x)
+{
+ float out[4];
+ float32x4_t X, Y;
+ X = vdupq_n_f32(x);
+ Y = sigmoid4_approx(X);
+ vst1q_f32(out, Y);
+ return out[0];
+}
+
+static inline void softmax(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {
+ float32x4_t X, Y;
+ X = vld1q_f32(&x[i]);
+ Y = exp4_approx(X);
+ vst1q_f32(&y[i], Y);
+ }
+ for (;i<N;i++)
+ y[i] = lpcnet_exp(x[i]);
+}
+
+static inline void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {
+ float32x4_t X, Y;
+ X = vld1q_f32(&x[i]);
+ Y = tanh4_approx(X);
+ vst1q_f32(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ float ex2;
+ ex2 = lpcnet_exp(2*x[i]);
+ y[i] = (ex2-1)/(ex2+1);
+ }
+}
+
+static inline void vec_sigmoid(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {
+ float32x4_t X, Y;
+ X = vld1q_f32(&x[i]);
+ Y = sigmoid4_approx(X);
+ vst1q_f32(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ float ex;
+ ex = lpcnet_exp(x[i]);
+ y[i] = (ex)/(ex+1);
+ }
+}
+#endif
+
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float * restrict y = &out[i];
+
+ /* keep y[0..15] in registers for duration of inner loop */
+
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+
+ for (j=0;j<cols;j++)
+ {
+ const float * restrict w;
+ float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15;
+ float32x4_t xj;
+
+ w = &weights[j*col_stride + i];
+ wvec0_3 = vld1q_f32(&w[0]);
+ wvec4_7 = vld1q_f32(&w[4]);
+ wvec8_11 = vld1q_f32(&w[8]);
+ wvec12_15 = vld1q_f32(&w[12]);
+
+ xj = vld1q_dup_f32(&x[j]);
+
+ y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
+ y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
+ y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
+ y12_15 = vmlaq_f32(y12_15, wvec12_15, xj);
+ }
+
+ /* save y[0..15] back to memory */
+
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+
+ }
+}
+
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ int cols;
+ cols = *idx++;
+ float * restrict y;
+ y = &out[i];
+
+ /* keep y[0..15] in registers for duration of inner loop */
+
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+
+ for (j=0;j<cols;j++)
+ {
+ float32x4_t xj= vld1q_dup_f32(&x[*idx++]);
+ float32x4_t wvec;
+
+ wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_f32(y0_3, wvec, xj);
+ wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_f32(y4_7, wvec, xj);
+ wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_f32(y8_11, wvec, xj);
+ wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_f32(y12_15, wvec, xj);
+
+ w += 16;
+ }
+
+ /* save y[0..15] back to memory */
+
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+
+ }
+}
+
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+#define MAX_INPUTS 2048
+#define MAX_OUTPUTS 8192
+
+#if __ARM_FEATURE_DOTPROD
+static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b) {
+ return vdotq_s32(acc, a, b);
+}
+#else
+static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b)
+{
+ return vpadalq_s16(acc, vpaddq_s16(vmull_s8(vget_low_s8(a), vget_low_s8(b)), vmull_high_s8(a, b)));
+}
+#endif
+
+static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ const float32x4_t scale = vdupq_n_f32(SCALE);
+ const float32x4_t scale_1 = vdupq_n_f32(SCALE_1);
+ const float32x4_t const127 = vdupq_n_f32(127.);
+ (void)col_stride;
+ for (i=0;i<cols;i+=8) {
+ int32x4_t xi0, xi4;
+ int16x8_t x_short;
+ xi0 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i])));
+ xi4 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i+4])));
+ x_short = vcombine_s16(vmovn_s32(xi0), vmovn_s32(xi4));
+ vst1_s8(&x[i], vmovn_s16(x_short));
+ }
+ for (i=0;i<rows;i+=8)
+ {
+ int32x4_t acc0, acc1;
+ acc0 = vcvtnq_s32_f32(vmulq_f32(scale, vld1q_f32(&_out[i])));
+ acc1 = vcvtnq_s32_f32(vmulq_f32(scale, vld1q_f32(&_out[i+4])));
+ for (j=0;j<cols;j+=4)
+ {
+ int8x16_t vw0, vw1, vx;
+ vx = (int8x16_t)vld1q_dup_s32((int*)&x[j]);
+ vw0 = vld1q_s8(w);
+ vw1 = vld1q_s8(&w[16]);
+ acc0 = vdotprod(acc0, vw0, vx);
+ acc1 = vdotprod(acc1, vw1, vx);
+ w += 32;
+ }
+ vst1q_f32(&_out[i], vmulq_f32(scale_1, vcvtq_f32_s32(acc0)));
+ vst1q_f32(&_out[i+4], vmulq_f32(scale_1, vcvtq_f32_s32(acc1)));
+ }
+}
+
+static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ const float32x4_t scale = vdupq_n_f32(SCALE);
+ const float32x4_t scale_1 = vdupq_n_f32(SCALE_1);
+ const float32x4_t const127 = vdupq_n_f32(127.);
+ for (i=0;i<cols;i+=8) {
+ int32x4_t xi0, xi4;
+ int16x8_t x_short;
+ xi0 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i])));
+ xi4 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i+4])));
+ x_short = vcombine_s16(vmovn_s32(xi0), vmovn_s32(xi4));
+ vst1_s8(&x[i], vmovn_s16(x_short));
+ }
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ int32x4_t acc0, acc1;
+ acc0 = vcvtnq_s32_f32(vmulq_f32(scale, vld1q_f32(&_out[i])));
+ acc1 = vcvtnq_s32_f32(vmulq_f32(scale, vld1q_f32(&_out[i+4])));
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ pos = (*idx++);
+ int8x16_t vw0, vw1, vx;
+ vx = (int8x16_t)vld1q_dup_s32((int*)&x[pos]);
+ vw0 = vld1q_s8(w);
+ vw1 = vld1q_s8(&w[16]);
+ acc0 = vdotprod(acc0, vw0, vx);
+ acc1 = vdotprod(acc1, vw1, vx);
+ w += 32;
+ }
+ vst1q_f32(&_out[i], vmulq_f32(scale_1, vcvtq_f32_s32(acc0)));
+ vst1q_f32(&_out[i+4], vmulq_f32(scale_1, vcvtq_f32_s32(acc1)));
+ }
+}
+
+#endif
diff --git a/dnn/write_lpcnet_weights.c b/dnn/write_lpcnet_weights.c
new file mode 100644
index 00000000..15c20837
--- /dev/null
+++ b/dnn/write_lpcnet_weights.c
@@ -0,0 +1,78 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include "nnet.h"
+#include "common.h"
+
+/* This is a bit of a hack because we need to build nnet_data.c and plc_data.c without USE_WEIGHTS_FILE,
+ but USE_WEIGHTS_FILE is defined in config.h. */
+#undef HAVE_CONFIG_H
+#ifdef USE_WEIGHTS_FILE
+#undef USE_WEIGHTS_FILE
+#endif
+#include "nnet_data.c"
+#include "plc_data.c"
+#include "dred_rdovae_enc_data.c"
+#include "dred_rdovae_dec_data.c"
+
+void write_weights(const WeightArray *list, FILE *fout)
+{
+ int i=0;
+ unsigned char zeros[WEIGHT_BLOCK_SIZE] = {0};
+ while (list[i].name != NULL) {
+ WeightHead h;
+ memcpy(h.head, "DNNw", 4);
+ h.version = WEIGHT_BLOB_VERSION;
+ h.type = list[i].type;
+ h.size = list[i].size;
+ h.block_size = (h.size+WEIGHT_BLOCK_SIZE-1)/WEIGHT_BLOCK_SIZE*WEIGHT_BLOCK_SIZE;
+ RNN_CLEAR(h.name, sizeof(h.name));
+ strncpy(h.name, list[i].name, sizeof(h.name));
+ h.name[sizeof(h.name)-1] = 0;
+ celt_assert(sizeof(h) == WEIGHT_BLOCK_SIZE);
+ fwrite(&h, 1, WEIGHT_BLOCK_SIZE, fout);
+ fwrite(list[i].data, 1, h.size, fout);
+ fwrite(zeros, 1, h.block_size-h.size, fout);
+ i++;
+ }
+}
+
+int main(void)
+{
+ FILE *fout = fopen("weights_blob.bin", "w");
+ write_weights(lpcnet_arrays, fout);
+ write_weights(lpcnet_plc_arrays, fout);
+ write_weights(rdovae_enc_arrays, fout);
+ write_weights(rdovae_dec_arrays, fout);
+ fclose(fout);
+ return 0;
+}