Big update in the multi-channel AEC to bring it up-to-date with the single
channel AEC. Mainly this means:
1) dual-path adaptive filter
2) Adaptive (pseudo-proportional) learning rate for different taps
3) API change
4) Other minor details

Merge commit 'd2cddf7e2f3c1a75265c43cabaa391037c830745' into stereo

Conflicts:
include/speex/speex_echo.h
libspeex/mdf.c
libspeex/testecho.c
author: Jean-Marc Valin <Jean-Marc.Valin@csiro.au> 2007-05-04 09:11:18 +0400
committer: Jean-Marc Valin <Jean-Marc.Valin@csiro.au> 2008-05-19 08:53:14 +0400
commit: 3d7a6f0bd0a60145d8ac3a2f4037da623f407fba (patch)
tree: f90d32540ec3269ef8a405a2e97daaf2f83ffcab /libspeex
parent: 6bd022014a21ecca9c27d6041397009a5933ac39 (diff)
parent: d2cddf7e2f3c1a75265c43cabaa391037c830745 (diff)
52 files changed, 4726 insertions, 3019 deletions
diff --git a/libspeex/Makefile.am b/libspeex/Makefile.am
index a784002..ff6d4bc 100644
--- a/libspeex/Makefile.am
+++ b/libspeex/Makefile.am
@@ -2,7 +2,7 @@
 #AUTOMAKE_OPTIONS = no-dependencies
 
 
-EXTRA_DIST=testenc.c testenc_wb.c testenc_uwb.c testdenoise.c testecho.c
+EXTRA_DIST=echo_diagnostic.m
 
 INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_builddir) @OGG_CFLAGS@
 
@@ -16,7 +16,7 @@ libspeex_la_SOURCES = nb_celp.c 	sb_celp.c 	lpc.c 	ltp.c 	lsp.c 	quant_lsp.c \
 				exc_10_16_table.c 	exc_20_32_table.c 	hexc_10_32_table.c 	misc.c 	speex_header.c \
 				speex_callbacks.c 	math_approx.c 	stereo.c 	preprocess.c 	smallft.c 	lbr_48k_tables.c \
 				jitter.c 	mdf.c vorbis_psy.c fftwrap.c kiss_fft.c _kiss_fft_guts.h kiss_fft.h \
-	kiss_fftr.c kiss_fftr.h window.c
+	kiss_fftr.c kiss_fftr.h window.c filterbank.c resample.c
 
 noinst_HEADERS = lsp.h 	nb_celp.h 	lpc.h 	lpc_bfin.h 	ltp.h 	quant_lsp.h \
 				cb_search.h 	filters.h 	stack_alloc.h 	vq.h 	vq_sse.h 	vq_arm4.h 	vq_bfin.h \
@@ -24,19 +24,19 @@ noinst_HEADERS = lsp.h 	nb_celp.h 	lpc.h 	lpc_bfin.h 	ltp.h 	quant_lsp.h \
 				ltp_bfin.h 	filters_sse.h 	filters_arm4.h 	filters_bfin.h 	math_approx.h \
 				smallft.h 	arch.h 	fixed_arm4.h 	fixed_arm5e.h 	fixed_bfin.h 	fixed_debug.h \
 				fixed_generic.h 	cb_search_sse.h 	cb_search_arm4.h 	cb_search_bfin.h vorbis_psy.h \
-		fftwrap.h pseudofloat.h lsp_bfin.h quant_lsp_bfin.h
+		fftwrap.h pseudofloat.h lsp_bfin.h quant_lsp_bfin.h filterbank.h
 
 
 libspeex_la_LDFLAGS = -no-undefined -version-info @SPEEX_LT_CURRENT@:@SPEEX_LT_REVISION@:@SPEEX_LT_AGE@
 
 noinst_PROGRAMS = testenc testenc_wb testenc_uwb testdenoise testecho
 testenc_SOURCES = testenc.c
-testenc_LDADD = $(top_builddir)/libspeex/libspeex.la
+testenc_LDADD = libspeex.la
 testenc_wb_SOURCES = testenc_wb.c
-testenc_wb_LDADD = $(top_builddir)/libspeex/libspeex.la
+testenc_wb_LDADD = libspeex.la
 testenc_uwb_SOURCES = testenc_uwb.c
-testenc_uwb_LDADD = $(top_builddir)/libspeex/libspeex.la
+testenc_uwb_LDADD = libspeex.la
 testdenoise_SOURCES = testdenoise.c
-testdenoise_LDADD = $(top_builddir)/libspeex/libspeex.la
+testdenoise_LDADD = libspeex.la
 testecho_SOURCES = testecho.c
-testecho_LDADD = $(top_builddir)/libspeex/libspeex.la
+testecho_LDADD = libspeex.la
diff --git a/libspeex/_kiss_fft_guts.h b/libspeex/_kiss_fft_guts.h
index 72acee1..526a73b 100644
--- a/libspeex/_kiss_fft_guts.h
+++ b/libspeex/_kiss_fft_guts.h
@@ -20,6 +20,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
    and defines
    typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
 #include "kiss_fft.h"
+#include "math_approx.h"
 
 #define MAXFACTORS 32
 /* e.g. an fft of length 128 has 4 factors 
@@ -67,6 +68,10 @@ struct kiss_fft_state{
       do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
           (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0)
 
+#   define C_MUL4(m,a,b) \
+               do{ (m).r = PSHR32( smul((a).r,(b).r) - smul((a).i,(b).i),17 ); \
+               (m).i = PSHR32( smul((a).r,(b).i) + smul((a).i,(b).r),17 ); }while(0)
+
 #   define DIVSCALAR(x,k) \
 	(x) = sround( smul(  x, SAMP_MAX/k ) )
 
@@ -84,6 +89,9 @@ struct kiss_fft_state{
 #define C_MUL(m,a,b) \
     do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
         (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
+
+#define C_MUL4(m,a,b) C_MUL(m,a,b)
+
 #   define C_FIXDIV(c,div) /* NOOP */
 #   define C_MULBYSCALAR( c, s ) \
     do{ (c).r *= (s);\
@@ -140,6 +148,11 @@ struct kiss_fft_state{
 		(x)->r = KISS_FFT_COS(phase);\
 		(x)->i = KISS_FFT_SIN(phase);\
 	}while(0)
+#define  kf_cexp2(x,phase) \
+               do{ \
+               (x)->r = spx_cos_norm((phase));\
+               (x)->i = spx_cos_norm((phase)-32768);\
+}while(0)
 
 
 /* a debugging function */
diff --git a/libspeex/arch.h b/libspeex/arch.h
index 0500437..e2d731a 100644
--- a/libspeex/arch.h
+++ b/libspeex/arch.h
@@ -35,12 +35,16 @@
 #ifndef ARCH_H
 #define ARCH_H
 
+#ifndef OUTSIDE_SPEEX
 #include "speex/speex_types.h"
+#endif
 
 #define ABS(x) ((x) < 0 ? (-(x)) : (x))      /**< Absolute integer value. */
 #define ABS16(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 16-bit value.  */
+#define MIN16(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 16-bit value.   */
 #define MAX16(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 16-bit value.   */
 #define ABS32(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 32-bit value.  */
+#define MIN32(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 32-bit value.   */
 #define MAX32(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 32-bit value.   */
 
 #ifdef FIXED_POINT
@@ -68,6 +72,7 @@ typedef spx_word32_t spx_sig_t;
 #define VERY_SMALL 0
 #define VERY_LARGE32 ((spx_word32_t)2147483647)
 #define VERY_LARGE16 ((spx_word16_t)32767)
+#define Q15_ONE ((spx_word16_t)32767)
 
 
 #ifdef FIXED_DEBUG
@@ -113,6 +118,7 @@ typedef float spx_word32_t;
 #define VERY_SMALL 1e-15f
 #define VERY_LARGE32 1e15f
 #define VERY_LARGE16 1e15f
+#define Q15_ONE ((spx_word16_t)1.f)
 
 #define QCONST16(x,bits) (x)
 #define QCONST32(x,bits) (x)
@@ -127,6 +133,7 @@ typedef float spx_word32_t;
 #define SHL32(a,shift) (a)
 #define PSHR16(a,shift) (a)
 #define PSHR32(a,shift) (a)
+#define VSHR32(a,shift) (a)
 #define SATURATE16(x,a) (x)
 #define SATURATE32(x,a) (x)
 
@@ -147,6 +154,7 @@ typedef float spx_word32_t;
 #define MULT16_32_Q13(a,b)     ((a)*(b))
 #define MULT16_32_Q14(a,b)     ((a)*(b))
 #define MULT16_32_Q15(a,b)     ((a)*(b))
+#define MULT16_32_P15(a,b)     ((a)*(b))
 
 #define MAC16_32_Q11(c,a,b)     ((c)+(a)*(b))
 #define MAC16_32_Q15(c,a,b)     ((c)+(a)*(b))
diff --git a/libspeex/bits.c b/libspeex/bits.c
index 376e804..5c4cb0e 100644
--- a/libspeex/bits.c
+++ b/libspeex/bits.c
@@ -76,6 +76,7 @@ void speex_bits_destroy(SpeexBits *bits)
 
 void speex_bits_reset(SpeexBits *bits)
 {
+   /* We only need to clear the first byte now */
    bits->chars[0]=0;
    bits->nbBits=0;
    bits->charPtr=0;
@@ -96,7 +97,7 @@ void speex_bits_read_from(SpeexBits *bits, char *chars, int len)
    int nchars = len / BYTES_PER_CHAR;
    if (nchars > bits->buf_size)
    {
-      speex_warning_int("Packet is larger than allocated buffer: ", len);
+      speex_notify("Packet is larger than allocated buffer");
       if (bits->owner)
       {
          char *tmp = (char*)speex_realloc(bits->chars, nchars);
@@ -109,7 +110,7 @@ void speex_bits_read_from(SpeexBits *bits, char *chars, int len)
             speex_warning("Could not resize input buffer: truncating input");
          }
       } else {
-         speex_warning("Do not own input buffer: truncating input");
+         speex_warning("Do not own input buffer: truncating oversize input");
          nchars=bits->buf_size;
       }
    }
@@ -158,10 +159,10 @@ void speex_bits_read_whole_bytes(SpeexBits *bits, char *chars, int nbytes)
             bits->chars=tmp;
          } else {
             nchars=bits->buf_size-(bits->nbBits>>LOG2_BITS_PER_CHAR)-1;
-            speex_warning("Could not resize input buffer: truncating input");
+            speex_warning("Could not resize input buffer: truncating oversize input");
          }
       } else {
-         speex_warning("Do not own input buffer: truncating input");
+         speex_warning("Do not own input buffer: truncating oversize input");
          nchars=bits->buf_size;
       }
    }
@@ -222,14 +223,13 @@ void speex_bits_pack(SpeexBits *bits, int data, int nbBits)
 
    if (bits->charPtr+((nbBits+bits->bitPtr)>>LOG2_BITS_PER_CHAR) >= bits->buf_size)
    {
-      speex_warning("Buffer too small to pack bits");
+      speex_notify("Buffer too small to pack bits");
       if (bits->owner)
       {
-	int new_nchars = ((bits->buf_size+5)*3)>>1;
+         int new_nchars = ((bits->buf_size+5)*3)>>1;
          char *tmp = (char*)speex_realloc(bits->chars, new_nchars);
          if (tmp)
          {
-	    speex_memset_bytes(tmp, 0, new_nchars);
             bits->buf_size=new_nchars;
             bits->chars=tmp;
          } else {
diff --git a/libspeex/cb_search.c b/libspeex/cb_search.c
index 5c68826..cab2b71 100644
--- a/libspeex/cb_search.c
+++ b/libspeex/cb_search.c
@@ -181,7 +181,7 @@ int   update_target
                t[subvect_size*i+m] = ADD16(t[subvect_size*i+m], res[m]);
 
 #ifdef FIXED_POINT
-         if (sign)
+         if (sign==1)
          {
             for (j=0;j<subvect_size;j++)
                e[subvect_size*i+j]=SHL32(EXTEND32(shape_cb[rind*subvect_size+j]),SIG_SHIFT-5);
@@ -226,11 +226,13 @@ int   update_target
    /* Update target: only update target if necessary */
    if (update_target)
    {
-      VARDECL(spx_sig_t *r2);
-      ALLOC(r2, nsf, spx_sig_t);
-      syn_percep_zero(e, ak, awk1, awk2, r2, nsf,p, stack);
+      VARDECL(spx_word16_t *r2);
+      ALLOC(r2, nsf, spx_word16_t);
       for (j=0;j<nsf;j++)
-         target[j]=SUB16(target[j],EXTRACT16(PSHR32(r2[j],8)));
+         r2[j] = EXTRACT16(PSHR32(e[j] ,6));
+      syn_percep_zero16(r2, ak, awk1, awk2, r2, nsf,p, stack);
+      for (j=0;j<nsf;j++)
+         target[j]=SUB16(target[j],PSHR16(r2[j],2));
    }
 }
 
@@ -263,7 +265,6 @@ int   update_target
 #endif
    VARDECL(spx_word16_t *t);
    VARDECL(spx_sig_t *e);
-   VARDECL(spx_sig_t *r2);
    VARDECL(spx_word16_t *tmp);
    VARDECL(spx_word32_t *ndist);
    VARDECL(spx_word32_t *odist);
@@ -316,7 +317,6 @@ int   update_target
 #endif
    ALLOC(t, nsf, spx_word16_t);
    ALLOC(e, nsf, spx_sig_t);
-   ALLOC(r2, nsf, spx_sig_t);
    ALLOC(ind, nb_subvect, int);
 
    ALLOC(tmp, 2*N*nsf, spx_word16_t);
@@ -495,9 +495,13 @@ int   update_target
    /* Update target: only update target if necessary */
    if (update_target)
    {
-      syn_percep_zero(e, ak, awk1, awk2, r2, nsf,p, stack);
+      VARDECL(spx_word16_t *r2);
+      ALLOC(r2, nsf, spx_word16_t);
+      for (j=0;j<nsf;j++)
+         r2[j] = EXTRACT16(PSHR32(e[j] ,6));
+      syn_percep_zero16(r2, ak, awk1, awk2, r2, nsf,p, stack);
       for (j=0;j<nsf;j++)
-         target[j]=SUB16(target[j],EXTRACT16(PSHR32(r2[j],8)));
+         target[j]=SUB16(target[j],PSHR16(r2[j],2));
    }
 }
 
@@ -577,14 +581,12 @@ int   update_target
 )
 {
    int i;
-   VARDECL(spx_sig_t *tmp);
-   ALLOC(tmp, nsf, spx_sig_t);
-   for (i=0;i<nsf;i++)
-      tmp[i]=PSHR32(EXTEND32(target[i]),SIG_SHIFT);
-   residue_percep_zero(tmp, ak, awk1, awk2, tmp, nsf, p, stack);
+   VARDECL(spx_word16_t *tmp);
+   ALLOC(tmp, nsf, spx_word16_t);
+   residue_percep_zero16(target, ak, awk1, awk2, tmp, nsf, p, stack);
 
    for (i=0;i<nsf;i++)
-      exc[i]+=tmp[i];
+      exc[i]+=SHL32(EXTEND32(tmp[i]),8);
    for (i=0;i<nsf;i++)
       target[i]=0;
 }
diff --git a/libspeex/cb_search_bfin.h b/libspeex/cb_search_bfin.h
index 52cc4b3..ae9cf83 100644
--- a/libspeex/cb_search_bfin.h
+++ b/libspeex/cb_search_bfin.h
@@ -73,7 +73,10 @@ void compute_weighted_codebook(const signed char *shape_cb, const spx_word16_t *
          :
       : "m" (subvect_size), "m" (shape_cb), "m" (r), "m" (resp), "m" (E)
       : "A0", "P0", "P1", "P2", "P3", "P4", "R0", "R1", "R2", "I0", "I1", "L0", 
-        "L1", "A0", "A1", "memory", "LC0", "LC1"
+        "L1", "A0", "A1", "memory"
+#if !(__GNUC__ == 3)
+         , "LC0", "LC1" /* gcc 3.4 doesn't know about LC registers */
+#endif
       );
       shape_cb += subvect_size;
       resp += subvect_size;
diff --git a/libspeex/echo_diagnostic.m b/libspeex/echo_diagnostic.m
new file mode 100644
index 0000000..aebf390
--- /dev/null
+++ b/libspeex/echo_diagnostic.m
@@ -0,0 +1,72 @@
+% Attempts to diagnose AEC problems from recorded samples
+%
+% out = echo_diagnostic(rec_file, play_file, out_file, tail_length)
+%
+% Computes the full matrix inversion to cancel echo from the 
+% recording 'rec_file' using the far end signal 'play_file' using 
+% a filter length of 'tail_length'. The output is saved to 'out_file'.
+function out = echo_diagnostic(rec_file, play_file, out_file, tail_length)
+
+F=fopen(rec_file,'rb');
+rec=fread(F,Inf,'short');
+fclose (F);
+F=fopen(play_file,'rb');
+play=fread(F,Inf,'short');
+fclose (F);
+
+rec = [rec; zeros(1024,1)];
+play = [play; zeros(1024,1)];
+
+N = length(rec);
+corr = real(ifft(fft(rec).*conj(fft(play))));
+acorr = real(ifft(fft(play).*conj(fft(play))));
+
+[a,b] = max(corr);
+
+if b > N/2
+      b = b-N;
+end
+printf ("Far end to near end delay is %d samples\n", b);
+if (b > .3*tail_length)
+      printf ('This is too much delay, try delaying the far-end signal a bit\n');
+else if (b < 0)
+      printf ('You have a negative delay, the echo canceller has no chance to cancel anything!\n');
+   else
+      printf ('Delay looks OK.\n');
+      end
+   end
+end
+N2 = round(N/2);
+corr1 = real(ifft(fft(rec(1:N2)).*conj(fft(play(1:N2)))));
+corr2 = real(ifft(fft(rec(N2+1:end)).*conj(fft(play(N2+1:end)))));
+
+[a,b1] = max(corr1);
+if b1 > N2/2
+      b1 = b1-N2;
+end
+[a,b2] = max(corr2);
+if b2 > N2/2
+      b2 = b2-N2;
+end
+drift = (b1-b2)/N2;
+printf ('Drift estimate is %f%% (%d samples)\n', 100*drift, b1-b2);
+if abs(b1-b2) < 10
+   printf ('A drift of a few (+-10) samples is normal.\n');
+else
+   if abs(b1-b2) < 30
+      printf ('There may be (not sure) excessive clock drift. Is the capture and playback done on the same soundcard?\n');
+   else
+      printf ('Your clock is drifting! No way the AEC will be able to do anything with that. Most likely, you''re doing capture and playback from two different cards.\n');
+      end
+   end
+end
+acorr(1) = .001+1.00001*acorr(1);
+AtA = toeplitz(acorr(1:tail_length));
+bb = corr(1:tail_length);
+h = AtA\bb;
+
+out = (rec - filter(h, 1, play));
+
+F=fopen(out_file,'w');
+fwrite(F,out,'short');
+fclose (F);
diff --git a/libspeex/fftwrap.c b/libspeex/fftwrap.c
index 43a9b18..35e2d05 100644
--- a/libspeex/fftwrap.c
+++ b/libspeex/fftwrap.c
@@ -64,7 +64,7 @@ static int maximize_range(spx_word16_t *in, spx_word16_t *out, spx_word16_t boun
    }
    for (i=0;i<len;i++)
    {
-      out[i] = in[i] << shift;
+      out[i] = SHL16(in[i], shift);
    }   
    return shift;
 }
@@ -74,7 +74,7 @@ static void renorm_range(spx_word16_t *in, spx_word16_t *out, int shift, int len
    int i;
    for (i=0;i<len;i++)
    {
-      out[i] = (in[i] + (1<<(shift-1))) >> shift;
+      out[i] = PSHR16(in[i], shift);
    }
 }
 #endif
@@ -103,8 +103,8 @@ void spx_fft(void *table, float *in, float *out)
    if (in==out)
    {
       int i;
-      speex_warning("FFT should not be done in-place");
       float scale = 1./((struct drft_lookup *)table)->n;
+      speex_warning("FFT should not be done in-place");
       for (i=0;i<((struct drft_lookup *)table)->n;i++)
          out[i] = scale*in[i];
    } else {
@@ -120,7 +120,6 @@ void spx_ifft(void *table, float *in, float *out)
 {
    if (in==out)
    {
-      int i;
       speex_warning("FFT should not be done in-place");
    } else {
       int i;
@@ -138,7 +137,6 @@ void spx_ifft(void *table, float *in, float *out)
 struct kiss_config {
    kiss_fftr_cfg forward;
    kiss_fftr_cfg backward;
-   kiss_fft_cpx *freq_data;
    int N;
 };
 
@@ -146,7 +144,6 @@ void *spx_fft_init(int size)
 {
    struct kiss_config *table;
    table = (struct kiss_config*)speex_alloc(sizeof(struct kiss_config));
-   table->freq_data = (kiss_fft_cpx*)speex_alloc(sizeof(kiss_fft_cpx)*((size>>1)+1));
    table->forward = kiss_fftr_alloc(size,0,NULL,NULL);
    table->backward = kiss_fftr_alloc(size,1,NULL,NULL);
    table->N = size;
@@ -158,7 +155,6 @@ void spx_fft_destroy(void *table)
    struct kiss_config *t = (struct kiss_config *)table;
    kiss_fftr_free(t->forward);
    kiss_fftr_free(t->backward);
-   speex_free(t->freq_data);
    speex_free(table);
 }
 
@@ -166,18 +162,10 @@ void spx_fft_destroy(void *table)
 
 void spx_fft(void *table, spx_word16_t *in, spx_word16_t *out)
 {
-   int i;
    int shift;
    struct kiss_config *t = (struct kiss_config *)table;
    shift = maximize_range(in, in, 32000, t->N);
-   kiss_fftr(t->forward, in, t->freq_data);
-   out[0] = t->freq_data[0].r;
-   for (i=1;i<t->N>>1;i++)
-   {
-      out[(i<<1)-1] = t->freq_data[i].r;
-      out[(i<<1)] = t->freq_data[i].i;
-   }
-   out[(i<<1)-1] = t->freq_data[i].r;
+   kiss_fftr2(t->forward, in, out);
    renorm_range(in, in, shift, t->N);
    renorm_range(out, out, shift, t->N);
 }
@@ -190,32 +178,16 @@ void spx_fft(void *table, spx_word16_t *in, spx_word16_t *out)
    float scale;
    struct kiss_config *t = (struct kiss_config *)table;
    scale = 1./t->N;
-   kiss_fftr(t->forward, in, t->freq_data);
-   out[0] = scale*t->freq_data[0].r;
-   for (i=1;i<t->N>>1;i++)
-   {
-      out[(i<<1)-1] = scale*t->freq_data[i].r;
-      out[(i<<1)] = scale*t->freq_data[i].i;
-   }
-   out[(i<<1)-1] = scale*t->freq_data[i].r;
+   kiss_fftr2(t->forward, in, out);
+   for (i=0;i<t->N;i++)
+      out[i] *= scale;
 }
 #endif
 
 void spx_ifft(void *table, spx_word16_t *in, spx_word16_t *out)
 {
-   int i;
    struct kiss_config *t = (struct kiss_config *)table;
-   t->freq_data[0].r = in[0];
-   t->freq_data[0].i = 0;
-   for (i=1;i<t->N>>1;i++)
-   {
-      t->freq_data[i].r = in[(i<<1)-1];
-      t->freq_data[i].i = in[(i<<1)];
-   }
-   t->freq_data[i].r = in[(i<<1)-1];
-   t->freq_data[i].i = 0;
-
-   kiss_fftri(t->backward, t->freq_data, out);
+   kiss_fftri2(t->backward, in, out);
 }
 
 
diff --git a/libspeex/filterbank.c b/libspeex/filterbank.c
new file mode 100644
index 0000000..187d5ee
--- /dev/null
+++ b/libspeex/filterbank.c
@@ -0,0 +1,226 @@
+/* Copyright (C) 2006 Jean-Marc Valin */
+/**
+   @file filterbank.c
+   @brief Converting between psd and filterbank
+ */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "filterbank.h"
+#include "misc.h"
+#include <math.h>
+#include "math_approx.h"
+      
+#ifdef FIXED_POINT
+
+#define toBARK(n)   (MULT16_16(26829,spx_atan(SHR32(MULT16_16(97,n),2))) + MULT16_16(4588,spx_atan(MULT16_32_Q15(20,MULT16_16(n,n)))) + MULT16_16(3355,n))
+      
+#else
+#define toBARK(n)   (13.1f*atan(.00074f*(n))+2.24f*atan((n)*(n)*1.85e-8f)+1e-4f*(n))
+#endif
+       
+#define toMEL(n)    (2595.f*log10(1.f+(n)/700.f))
+
+FilterBank *filterbank_new(int banks, spx_word32_t sampling, int len, int type)
+{
+   FilterBank *bank;
+   spx_word32_t df;
+   spx_word32_t max_mel, mel_interval;
+   int i;
+   int id1;
+   int id2;
+   df = DIV32(SHL32(sampling,15),MULT16_16(2,len));
+   max_mel = toBARK(EXTRACT16(MULT16_16_Q15(QCONST16(.5f,15),sampling)));
+   mel_interval = PDIV32(max_mel,banks-1);
+   
+   bank = (FilterBank*)speex_alloc(sizeof(FilterBank));
+   bank->nb_banks = banks;
+   bank->len = len;
+   bank->bank_left = (int*)speex_alloc(len*sizeof(int));
+   bank->bank_right = (int*)speex_alloc(len*sizeof(int));
+   bank->filter_left = (spx_word16_t*)speex_alloc(len*sizeof(spx_word16_t));
+   bank->filter_right = (spx_word16_t*)speex_alloc(len*sizeof(spx_word16_t));
+   /* Think I can safely disable normalisation for fixed-point (and probably float as well) */
+#ifndef FIXED_POINT
+   bank->scaling = (float*)speex_alloc(banks*sizeof(float));
+#endif
+   for (i=0;i<len;i++)
+   {
+      spx_word16_t curr_freq;
+      spx_word32_t mel;
+      spx_word16_t val;
+      curr_freq = EXTRACT16(MULT16_32_P15(i,df));
+      mel = toBARK(curr_freq);
+      if (mel > max_mel)
+         break;
+#ifdef FIXED_POINT
+      id1 = DIV32(mel,mel_interval);
+#else      
+      id1 = (int)(floor(mel/mel_interval));
+#endif
+      if (id1>banks-2)
+      {
+         id1 = banks-2;
+         val = Q15_ONE;
+      } else {
+         val = DIV32_16(mel - id1*mel_interval,EXTRACT16(PSHR32(mel_interval,15)));
+      }
+      id2 = id1+1;
+      bank->bank_left[i] = id1;
+      bank->filter_left[i] = SUB16(Q15_ONE,val);
+      bank->bank_right[i] = id2;
+      bank->filter_right[i] = val;
+   }
+   
+   /* Think I can safely disable normalisation for fixed-point (and probably float as well) */
+#ifndef FIXED_POINT
+   for (i=0;i<bank->nb_banks;i++)
+      bank->scaling[i] = 0;
+   for (i=0;i<bank->len;i++)
+   {
+      int id = bank->bank_left[i];
+      bank->scaling[id] += bank->filter_left[i];
+      id = bank->bank_right[i];
+      bank->scaling[id] += bank->filter_right[i];
+   }
+   for (i=0;i<bank->nb_banks;i++)
+      bank->scaling[i] = Q15_ONE/(bank->scaling[i]);
+#endif
+   return bank;
+}
+
+void filterbank_destroy(FilterBank *bank)
+{
+   speex_free(bank->bank_left);
+   speex_free(bank->bank_right);
+   speex_free(bank->filter_left);
+   speex_free(bank->filter_right);
+#ifndef FIXED_POINT
+   speex_free(bank->scaling);
+#endif
+   speex_free(bank);
+}
+
+void filterbank_compute_bank32(FilterBank *bank, spx_word32_t *ps, spx_word32_t *mel)
+{
+   int i;
+   for (i=0;i<bank->nb_banks;i++)
+      mel[i] = 0;
+
+   for (i=0;i<bank->len;i++)
+   {
+      int id;
+      id = bank->bank_left[i];
+      mel[id] += MULT16_32_P15(bank->filter_left[i],ps[i]);
+      id = bank->bank_right[i];
+      mel[id] += MULT16_32_P15(bank->filter_right[i],ps[i]);
+   }
+   /* Think I can safely disable normalisation for fixed-point (and probably float as well) */
+#ifndef FIXED_POINT
+   /*for (i=0;i<bank->nb_banks;i++)
+      mel[i] = MULT16_32_P15(Q15(bank->scaling[i]),mel[i]);
+   */
+#endif
+}
+
+void filterbank_compute_psd16(FilterBank *bank, spx_word16_t *mel, spx_word16_t *ps)
+{
+   int i;
+   for (i=0;i<bank->len;i++)
+   {
+      spx_word32_t tmp;
+      int id1, id2;
+      id1 = bank->bank_left[i];
+      id2 = bank->bank_right[i];
+      tmp = MULT16_16(mel[id1],bank->filter_left[i]);
+      tmp += MULT16_16(mel[id2],bank->filter_right[i]);
+      ps[i] = EXTRACT16(PSHR32(tmp,15));
+   }
+}
+
+
+#ifndef FIXED_POINT
+void filterbank_compute_bank(FilterBank *bank, float *ps, float *mel)
+{
+   int i;
+   for (i=0;i<bank->nb_banks;i++)
+      mel[i] = 0;
+
+   for (i=0;i<bank->len;i++)
+   {
+      int id = bank->bank_left[i];
+      mel[id] += bank->filter_left[i]*ps[i];
+      id = bank->bank_right[i];
+      mel[id] += bank->filter_right[i]*ps[i];
+   }
+   for (i=0;i<bank->nb_banks;i++)
+      mel[i] *= bank->scaling[i];
+}
+
+void filterbank_compute_psd(FilterBank *bank, float *mel, float *ps)
+{
+   int i;
+   for (i=0;i<bank->len;i++)
+   {
+      int id = bank->bank_left[i];
+      ps[i] = mel[id]*bank->filter_left[i];
+      id = bank->bank_right[i];
+      ps[i] += mel[id]*bank->filter_right[i];
+   }
+}
+
+void filterbank_psy_smooth(FilterBank *bank, float *ps, float *mask)
+{
+   /* Low freq slope: 14 dB/Bark*/
+   /* High freq slope: 9 dB/Bark*/
+   /* Noise vs tone: 5 dB difference */
+   /* FIXME: Temporary kludge */
+   float bark[100];
+   int i;
+   /* Assumes 1/3 Bark resolution */
+   float decay_low = 0.34145f;
+   float decay_high = 0.50119f;
+   filterbank_compute_bank(bank, ps, bark);
+   for (i=1;i<bank->nb_banks;i++)
+   {
+      /*float decay_high = 13-1.6*log10(bark[i-1]);
+      decay_high = pow(10,(-decay_high/30.f));*/
+      bark[i] = bark[i] + decay_high*bark[i-1];
+   }
+   for (i=bank->nb_banks-2;i>=0;i--)
+   {
+      bark[i] = bark[i] + decay_low*bark[i+1];
+   }
+   filterbank_compute_psd(bank, bark, mask);
+}
+
+#endif
diff --git a/libspeex/filterbank.h b/libspeex/filterbank.h
new file mode 100644
index 0000000..5ded6b9
--- /dev/null
+++ b/libspeex/filterbank.h
@@ -0,0 +1,66 @@
+/* Copyright (C) 2006 Jean-Marc Valin */
+/**
+   @file filterbank.h
+   @brief Converting between psd and filterbank
+ */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FILTERBANK_H
+#define FILTERBANK_H
+
+#include "misc.h"
+
+typedef struct {
+   int *bank_left;
+   int *bank_right;
+   spx_word16_t *filter_left;
+   spx_word16_t *filter_right;
+#ifndef FIXED_POINT
+   float *scaling;
+#endif
+   int nb_banks;
+   int len;
+} FilterBank;
+
+
+FilterBank *filterbank_new(int banks, spx_word32_t sampling, int len, int type);
+
+void filterbank_destroy(FilterBank *bank);
+
+void filterbank_compute_bank32(FilterBank *bank, spx_word32_t *ps, spx_word32_t *mel);
+
+void filterbank_compute_psd16(FilterBank *bank, spx_word16_t *mel, spx_word16_t *psd);
+
+#ifndef FIXED_POINT
+void filterbank_compute_bank(FilterBank *bank, float *psd, float *mel);
+void filterbank_compute_psd(FilterBank *bank, float *mel, float *psd);
+#endif
+
+
+#endif
diff --git a/libspeex/filters.c b/libspeex/filters.c
index a1111ee..48b4753 100644
--- a/libspeex/filters.c
+++ b/libspeex/filters.c
@@ -62,6 +62,24 @@ void bw_lpc(spx_word16_t gamma, const spx_coef_t *lpc_in, spx_coef_t *lpc_out, i
    }
 }
 
+void sanitize_values32(spx_word32_t *vec, spx_word32_t min_val, spx_word32_t max_val, int len)
+{
+   int i;
+   for (i=0;i<len;i++)
+   {
+      /* It's important we do the test that way so we can catch NaNs, which are neither greater nor smaller */
+      if (!(vec[i]>=min_val && vec[i] <= max_val))
+      {
+         if (vec[i] < min_val)
+            vec[i] = min_val;
+         else if (vec[i] > max_val)
+            vec[i] = max_val;
+         else /* Has to be NaN */
+            vec[i] = 0;
+      }
+   }
+}
+
 void highpass(const spx_word16_t *x, spx_word16_t *y, int len, int filtID, spx_mem_t *mem)
 {
    int i;
@@ -83,8 +101,8 @@ void highpass(const spx_word16_t *x, spx_word16_t *y, int len, int filtID, spx_m
       spx_word16_t yi;
       spx_word32_t vout = ADD32(MULT16_16(num[0], x[i]),mem[0]);
       yi = EXTRACT16(SATURATE(PSHR32(vout,14),32767));
-      mem[0] = ADD32(MAC16_16(mem[1], num[1],x[i]), MULT16_32_Q14(-den[1],vout));
-      mem[1] = ADD32(MULT16_16(num[2],x[i]), MULT16_32_Q14(-den[2],vout));
+      mem[0] = ADD32(MAC16_16(mem[1], num[1],x[i]), SHL32(MULT16_32_Q15(-den[1],vout),1));
+      mem[1] = ADD32(MULT16_16(num[2],x[i]), SHL32(MULT16_32_Q15(-den[2],vout),1));
       y[i] = yi;
    }
 }
@@ -218,10 +236,10 @@ spx_word16_t compute_rms16(const spx_word16_t *x, int len)
       for (i=0;i<len;i+=4)
       {
          spx_word32_t sum2=0;
-         sum2 = MAC16_16(sum2,PSHR16(x[i],1),PSHR16(x[i],1));
-         sum2 = MAC16_16(sum2,PSHR16(x[i+1],1),PSHR16(x[i+1],1));
-         sum2 = MAC16_16(sum2,PSHR16(x[i+2],1),PSHR16(x[i+2],1));
-         sum2 = MAC16_16(sum2,PSHR16(x[i+3],1),PSHR16(x[i+3],1));
+         sum2 = MAC16_16(sum2,SHR16(x[i],1),SHR16(x[i],1));
+         sum2 = MAC16_16(sum2,SHR16(x[i+1],1),SHR16(x[i+1],1));
+         sum2 = MAC16_16(sum2,SHR16(x[i+2],1),SHR16(x[i+2],1));
+         sum2 = MAC16_16(sum2,SHR16(x[i+3],1),SHR16(x[i+3],1));
          sum = ADD32(sum,SHR32(sum2,6));
       }
       return SHL16(spx_sqrt(DIV32(sum,len)),4);
@@ -297,53 +315,6 @@ spx_word16_t compute_rms16(const spx_word16_t *x, int len)
 
 
 
-#ifndef OVERRIDE_FILTER_MEM2
-#ifdef PRECISION16
-void filter_mem2(const spx_sig_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_word16_t xi,yi,nyi;
-
-   for (i=0;i<N;i++)
-   {
-      xi= EXTRACT16(PSHR32(SATURATE(x[i],536870911),SIG_SHIFT));
-      yi = EXTRACT16(PSHR32(SATURATE(ADD32(x[i], SHL32(mem[0],1)),536870911),SIG_SHIFT));
-      nyi = NEG16(yi);
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = MAC16_16(MAC16_16(mem[j+1], num[j],xi), den[j],nyi);
-      }
-      mem[ord-1] = ADD32(MULT16_16(num[ord-1],xi), MULT16_16(den[ord-1],nyi));
-      y[i] = SHL32(EXTEND32(yi),SIG_SHIFT);
-   }
-}
-#else
-void filter_mem2(const spx_sig_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_sig_t xi,yi,nyi;
-
-   for (i=0;i<ord;i++)
-      mem[i] = SHR32(mem[i],1);   
-   for (i=0;i<N;i++)
-   {
-      xi=SATURATE(x[i],805306368);
-      yi = SATURATE(ADD32(xi, SHL32(mem[0],2)),805306368);
-      nyi = NEG32(yi);
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = MAC16_32_Q15(MAC16_32_Q15(mem[j+1], num[j],xi), den[j],nyi);
-      }
-      mem[ord-1] = SUB32(MULT16_32_Q15(num[ord-1],xi), MULT16_32_Q15(den[ord-1],yi));
-      y[i] = yi;
-   }
-   for (i=0;i<ord;i++)
-      mem[i] = SHL32(mem[i],1);   
-}
-#endif
-#endif
-
-#ifdef FIXED_POINT
 #ifndef OVERRIDE_FILTER_MEM16
 void filter_mem16(const spx_word16_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
 {
@@ -363,60 +334,7 @@ void filter_mem16(const spx_word16_t *x, const spx_coef_t *num, const spx_coef_t
    }
 }
 #endif
-#else
-void filter_mem16(const spx_word16_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
-{
-   filter_mem2(x, num, den, y, N, ord, mem);
-}
-#endif
-
-
-#ifndef OVERRIDE_IIR_MEM2
-#ifdef PRECISION16
-void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_word16_t yi,nyi;
-
-   for (i=0;i<N;i++)
-   {
-      yi = EXTRACT16(PSHR32(SATURATE(x[i] + SHL32(mem[0],1),536870911),SIG_SHIFT));
-      nyi = NEG16(yi);
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = MAC16_16(mem[j+1],den[j],nyi);
-      }
-      mem[ord-1] = MULT16_16(den[ord-1],nyi);
-      y[i] = SHL32(EXTEND32(yi),SIG_SHIFT);
-   }
-}
-#else
-void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_word32_t xi,yi,nyi;
-
-   for (i=0;i<ord;i++)
-      mem[i] = SHR32(mem[i],1);   
-   for (i=0;i<N;i++)
-   {
-      xi=SATURATE(x[i],805306368);
-      yi = SATURATE(xi + SHL32(mem[0],2),805306368);
-      nyi = NEG32(yi);
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = MAC16_32_Q15(mem[j+1],den[j],nyi);
-      }
-      mem[ord-1] = MULT16_32_Q15(den[ord-1],nyi);
-      y[i] = yi;
-   }
-   for (i=0;i<ord;i++)
-      mem[i] = SHL32(mem[i],1);   
-}
-#endif
-#endif
 
-#ifdef FIXED_POINT
 #ifndef OVERRIDE_IIR_MEM16
 void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
 {
@@ -436,59 +354,7 @@ void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, in
    }
 }
 #endif
-#else
-void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
-{
-   iir_mem2(x, den, y, N, ord, mem);
-}
-#endif
-
-
-#ifndef OVERRIDE_FIR_MEM2
-#ifdef PRECISION16
-void fir_mem2(const spx_sig_t *x, const spx_coef_t *num, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_word16_t xi,yi;
-
-   for (i=0;i<N;i++)
-   {
-      xi= EXTRACT16(PSHR32(SATURATE(x[i],536870911),SIG_SHIFT));
-      yi = EXTRACT16(PSHR32(SATURATE(x[i] + SHL32(mem[0],1),536870911),SIG_SHIFT));
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = MAC16_16(mem[j+1], num[j],xi);
-      }
-      mem[ord-1] = MULT16_16(num[ord-1],xi);
-      y[i] = SHL32(EXTEND32(yi),SIG_SHIFT);
-   }
-}
-#else
-void fir_mem2(const spx_sig_t *x, const spx_coef_t *num, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_word32_t xi,yi;
-
-   for (i=0;i<ord;i++)
-      mem[i] = SHR32(mem[i],1);   
-   for (i=0;i<N;i++)
-   {
-      xi=SATURATE(x[i],805306368);
-      yi = xi + SHL32(mem[0],2);
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = MAC16_32_Q15(mem[j+1], num[j],xi);
-      }
-      mem[ord-1] = MULT16_32_Q15(num[ord-1],xi);
-      y[i] = SATURATE(yi,805306368);
-   }
-   for (i=0;i<ord;i++)
-      mem[i] = SHL32(mem[i],1);   
-}
-#endif
-#endif
 
-#ifdef FIXED_POINT
 #ifndef OVERRIDE_FIR_MEM16
 void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
 {
@@ -508,44 +374,34 @@ void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, in
    }
 }
 #endif
-#else
-void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
-{
-   fir_mem2(x, num, y, N, ord, mem);
-}
-#endif
 
 
-
-
-
-
-void syn_percep_zero(const spx_sig_t *xx, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_sig_t *y, int N, int ord, char *stack)
+void syn_percep_zero16(const spx_word16_t *xx, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack)
 {
    int i;
    VARDECL(spx_mem_t *mem);
    ALLOC(mem, ord, spx_mem_t);
    for (i=0;i<ord;i++)
-     mem[i]=0;
-   iir_mem2(xx, ak, y, N, ord, mem);
+      mem[i]=0;
+   iir_mem16(xx, ak, y, N, ord, mem, stack);
    for (i=0;i<ord;i++)
       mem[i]=0;
-   filter_mem2(y, awk1, awk2, y, N, ord, mem);
+   filter_mem16(y, awk1, awk2, y, N, ord, mem, stack);
 }
-
-void residue_percep_zero(const spx_sig_t *xx, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_sig_t *y, int N, int ord, char *stack)
+void residue_percep_zero16(const spx_word16_t *xx, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack)
 {
    int i;
    VARDECL(spx_mem_t *mem);
    ALLOC(mem, ord, spx_mem_t);
    for (i=0;i<ord;i++)
       mem[i]=0;
-   filter_mem2(xx, ak, awk1, y, N, ord, mem);
+   filter_mem16(xx, ak, awk1, y, N, ord, mem, stack);
    for (i=0;i<ord;i++)
-     mem[i]=0;
-   fir_mem2(y, awk2, y, N, ord, mem);
+      mem[i]=0;
+   fir_mem16(y, awk2, y, N, ord, mem, stack);
 }
 
+
 #ifndef OVERRIDE_COMPUTE_IMPULSE_RESPONSE
 void compute_impulse_response(const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack)
 {
@@ -581,7 +437,8 @@ void compute_impulse_response(const spx_coef_t *ak, const spx_coef_t *awk1, cons
 }
 #endif
 
-void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_sig_t *y1, spx_sig_t *y2, int N, int M, spx_word16_t *mem, char *stack)
+/* Decomposes a signal into low-band and high-band using a QMF */
+void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1, spx_word16_t *y2, int N, int M, spx_word16_t *mem, char *stack)
 {
    int i,j,k,M2;
    VARDECL(spx_word16_t *a);
@@ -594,105 +451,139 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_sig_t *y1, s
    M2=M>>1;
    for (i=0;i<M;i++)
       a[M-i-1]= aa[i];
-
    for (i=0;i<M-1;i++)
       x[i]=mem[M-i-2];
    for (i=0;i<N;i++)
-      x[i+M-1]=SATURATE(PSHR(xx[i],1),16383);
+      x[i+M-1]=SHR16(xx[i],1);
+   for (i=0;i<M-1;i++)
+      mem[i]=SHR16(xx[N-i-1],1);
    for (i=0,k=0;i<N;i+=2,k++)
    {
-      y1[k]=0;
-      y2[k]=0;
+      spx_word32_t y1k=0, y2k=0;
       for (j=0;j<M2;j++)
       {
-         y1[k]=ADD32(y1[k],MULT16_16(a[j],ADD16(x[i+j],x2[i-j])));
-         y2[k]=SUB32(y2[k],MULT16_16(a[j],SUB16(x[i+j],x2[i-j])));
+         y1k=ADD32(y1k,MULT16_16(a[j],ADD16(x[i+j],x2[i-j])));
+         y2k=SUB32(y2k,MULT16_16(a[j],SUB16(x[i+j],x2[i-j])));
          j++;
-         y1[k]=ADD32(y1[k],MULT16_16(a[j],ADD16(x[i+j],x2[i-j])));
-         y2[k]=ADD32(y2[k],MULT16_16(a[j],SUB16(x[i+j],x2[i-j])));
+         y1k=ADD32(y1k,MULT16_16(a[j],ADD16(x[i+j],x2[i-j])));
+         y2k=ADD32(y2k,MULT16_16(a[j],SUB16(x[i+j],x2[i-j])));
       }
-      y1[k] = SHR32(y1[k],1);
-      y2[k] = SHR32(y2[k],1);
+      y1[k] = EXTRACT16(SATURATE(PSHR32(y1k,15),32767));
+      y2[k] = EXTRACT16(SATURATE(PSHR32(y2k,15),32767));
    }
-   for (i=0;i<M-1;i++)
-     mem[i]=SATURATE(PSHR(xx[N-i-1],1),16383);
 }
 
-
-/* By segher */
-void fir_mem_up(const spx_sig_t *x, const spx_word16_t *a, spx_sig_t *y, int N, int M, spx_word32_t *mem, char *stack)
+/* Re-synthesises a signal from the QMF low-band and high-band signals */
+void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
    /* assumptions:
       all odd x[i] are zero -- well, actually they are left out of the array now
       N and M are multiples of 4 */
 {
    int i, j;
-   VARDECL(spx_word16_t *xx);
+   int M2, N2;
+   VARDECL(spx_word16_t *xx1);
+   VARDECL(spx_word16_t *xx2);
    
-   ALLOC(xx, M+N-1, spx_word16_t);
-
-   for (i = 0; i < N/2; i++)
-      xx[2*i] = PSHR32(x[N/2-1-i],SIG_SHIFT);
-   for (i = 0; i < M - 1; i += 2)
-      xx[N+i] = mem[i+1];
-
-   for (i = 0; i < N; i += 4) {
+   M2 = M>>1;
+   N2 = N>>1;
+   ALLOC(xx1, M2+N2, spx_word16_t);
+   ALLOC(xx2, M2+N2, spx_word16_t);
+
+   for (i = 0; i < N2; i++)
+      xx1[i] = x1[N2-1-i];
+   for (i = 0; i < M2; i++)
+      xx1[N2+i] = mem1[2*i+1];
+   for (i = 0; i < N2; i++)
+      xx2[i] = x2[N2-1-i];
+   for (i = 0; i < M2; i++)
+      xx2[N2+i] = mem2[2*i+1];
+
+   for (i = 0; i < N2; i += 2) {
       spx_sig_t y0, y1, y2, y3;
-      spx_word16_t x0;
+      spx_word16_t x10, x20;
 
       y0 = y1 = y2 = y3 = 0;
-      x0 = xx[N-4-i];
+      x10 = xx1[N2-2-i];
+      x20 = xx2[N2-2-i];
 
-      for (j = 0; j < M; j += 4) {
-         spx_word16_t x1;
+      for (j = 0; j < M2; j += 2) {
+         spx_word16_t x11, x21;
          spx_word16_t a0, a1;
 
-         a0 = a[j];
-         a1 = a[j+1];
-         x1 = xx[N-2+j-i];
-
-         y0 = ADD32(y0,SHR(MULT16_16(a0, x1),2));
-         y1 = ADD32(y1,SHR(MULT16_16(a1, x1),2));
-         y2 = ADD32(y2,SHR(MULT16_16(a0, x0),2));
-         y3 = ADD32(y3,SHR(MULT16_16(a1, x0),2));
+         a0 = a[2*j];
+         a1 = a[2*j+1];
+         x11 = xx1[N2-1+j-i];
+         x21 = xx2[N2-1+j-i];
 
-         a0 = a[j+2];
-         a1 = a[j+3];
-         x0 = xx[N+j-i];
+#ifdef FIXED_POINT
+         /* We multiply twice by the same coef to avoid overflows */
+         y0 = MAC16_16(MAC16_16(y0, a0, x11), NEG16(a0), x21);
+         y1 = MAC16_16(MAC16_16(y1, a1, x11), a1, x21);
+         y2 = MAC16_16(MAC16_16(y2, a0, x10), NEG16(a0), x20);
+         y3 = MAC16_16(MAC16_16(y3, a1, x10), a1, x20);
+#else
+         y0 = ADD32(y0,MULT16_16(a0, x11-x21));
+         y1 = ADD32(y1,MULT16_16(a1, x11+x21));
+         y2 = ADD32(y2,MULT16_16(a0, x10-x20));
+         y3 = ADD32(y3,MULT16_16(a1, x10+x20));
+#endif
+         a0 = a[2*j+2];
+         a1 = a[2*j+3];
+         x10 = xx1[N2+j-i];
+         x20 = xx2[N2+j-i];
 
-         y0 = ADD32(y0,SHR(MULT16_16(a0, x0),2));
-         y1 = ADD32(y1,SHR(MULT16_16(a1, x0),2));
-         y2 = ADD32(y2,SHR(MULT16_16(a0, x1),2));
-         y3 = ADD32(y3,SHR(MULT16_16(a1, x1),2));
+#ifdef FIXED_POINT
+         /* We multiply twice by the same coef to avoid overflows */
+         y0 = MAC16_16(MAC16_16(y0, a0, x10), NEG16(a0), x20);
+         y1 = MAC16_16(MAC16_16(y1, a1, x10), a1, x20);
+         y2 = MAC16_16(MAC16_16(y2, a0, x11), NEG16(a0), x21);
+         y3 = MAC16_16(MAC16_16(y3, a1, x11), a1, x21);
+#else
+         y0 = ADD32(y0,MULT16_16(a0, x10-x20));
+         y1 = ADD32(y1,MULT16_16(a1, x10+x20));
+         y2 = ADD32(y2,MULT16_16(a0, x11-x21));
+         y3 = ADD32(y3,MULT16_16(a1, x11+x21));
+#endif
       }
-      y[i] = y0;
-      y[i+1] = y1;
-      y[i+2] = y2;
-      y[i+3] = y3;
+#ifdef FIXED_POINT
+      y[2*i] = EXTRACT16(SATURATE32(PSHR32(y0,15),32767));
+      y[2*i+1] = EXTRACT16(SATURATE32(PSHR32(y1,15),32767));
+      y[2*i+2] = EXTRACT16(SATURATE32(PSHR32(y2,15),32767));
+      y[2*i+3] = EXTRACT16(SATURATE32(PSHR32(y3,15),32767));
+#else
+      /* Normalize up explicitly if we're in float */
+      y[2*i] = 2.f*y0;
+      y[2*i+1] = 2.f*y1;
+      y[2*i+2] = 2.f*y2;
+      y[2*i+3] = 2.f*y3;
+#endif
    }
 
-   for (i = 0; i < M - 1; i += 2)
-      mem[i+1] = xx[i];
+   for (i = 0; i < M2; i++)
+      mem1[2*i+1] = xx1[i];
+   for (i = 0; i < M2; i++)
+      mem2[2*i+1] = xx2[i];
 }
 
 #ifdef FIXED_POINT
 #if 0
-spx_word16_t shift_filt[3][7] = {{-33,    1043,   -4551,   19959,   19959,   -4551,    1043},
+const spx_word16_t shift_filt[3][7] = {{-33,    1043,   -4551,   19959,   19959,   -4551,    1043},
                                  {-98,    1133,   -4425,   29179,    8895,   -2328,     444},
                                  {444,   -2328,    8895,   29179,   -4425,    1133,     -98}};
 #else
-spx_word16_t shift_filt[3][7] = {{-390,    1540,   -4993,   20123,   20123,   -4993,    1540},
+const spx_word16_t shift_filt[3][7] = {{-390,    1540,   -4993,   20123,   20123,   -4993,    1540},
                                 {-1064,    2817,   -6694,   31589,    6837,    -990,    -209},
                                  {-209,    -990,    6837,   31589,   -6694,    2817,   -1064}};
 #endif
 #else
 #if 0
-float shift_filt[3][7] = {{-9.9369e-04, 3.1831e-02, -1.3889e-01, 6.0910e-01, 6.0910e-01, -1.3889e-01, 3.1831e-02},
+const float shift_filt[3][7] = {{-9.9369e-04, 3.1831e-02, -1.3889e-01, 6.0910e-01, 6.0910e-01, -1.3889e-01, 3.1831e-02},
                           {-0.0029937, 0.0345613, -0.1350474, 0.8904793, 0.2714479, -0.0710304, 0.0135403},
                           {0.0135403, -0.0710304, 0.2714479, 0.8904793, -0.1350474, 0.0345613,  -0.0029937}};
 #else
-float shift_filt[3][7] = {{-0.011915, 0.046995, -0.152373, 0.614108, 0.614108, -0.152373, 0.046995},
-                          {-0.0324855, 0.0859768, -0.2042986, 0.9640297, 0.2086420, -0.0302054, -0.0063646},
-                          {-0.0063646, -0.0302054, 0.2086420, 0.9640297, -0.2042986, 0.0859768, -0.0324855}};
+const float shift_filt[3][7] = {{-0.011915f, 0.046995f, -0.152373f, 0.614108f, 0.614108f, -0.152373f, 0.046995f},
+                          {-0.0324855f, 0.0859768f, -0.2042986f, 0.9640297f, 0.2086420f, -0.0302054f, -0.0063646f},
+                          {-0.0063646f, -0.0302054f, 0.2086420f, 0.9640297f, -0.2042986f, 0.0859768f, -0.0324855f}};
 #endif
 #endif
 
@@ -784,7 +675,9 @@ char *stack
    spx_word16_t g1, g2;
    spx_word16_t ngain;
    spx_word16_t gg1, gg2;
-
+#ifdef FIXED_POINT
+   int scaledown=0;
+#endif
 #if 0 /* Set to 1 to enable full pitch search */
    int nol_pitch[6];
    spx_word16_t nol_pitch_coef[6];
@@ -819,6 +712,23 @@ char *stack
    else
       interp_pitch(exc, iexc+nsf, -corr_pitch, 80);
 
+#ifdef FIXED_POINT
+   for (i=0;i<nsf;i++)
+   {
+      if (ABS16(exc[i])>16383)
+      {
+         scaledown = 1;
+         break;
+      }
+   }
+   if (scaledown)
+   {
+      for (i=0;i<nsf;i++)
+         exc[i] = SHR16(exc[i],1);
+      for (i=0;i<2*nsf;i++)
+         iexc[i] = SHR16(iexc[i],1);
+   }
+#endif
    /*interp_pitch(exc, iexc+2*nsf, 2*corr_pitch, 80);*/
    
    /*printf ("%d %d %f\n", pitch, corr_pitch, max_corr*ener_1);*/
@@ -898,5 +808,14 @@ char *stack
    
    for (i=0;i<nsf;i++)
       new_exc[i] = MULT16_16_Q14(ngain, new_exc[i]);
+#ifdef FIXED_POINT
+   if (scaledown)
+   {
+      for (i=0;i<nsf;i++)
+         exc[i] = SHL16(exc[i],1);
+      for (i=0;i<nsf;i++)
+         new_exc[i] = SHL16(SATURATE16(new_exc[i],16383),1);
+   }
+#endif
 }
 
diff --git a/libspeex/filters.h b/libspeex/filters.h
index b29aa21..b363a9a 100644
--- a/libspeex/filters.h
+++ b/libspeex/filters.h
@@ -58,13 +58,8 @@ int normalize16(const spx_sig_t *x, spx_word16_t *y, spx_sig_t max_scale, int le
 void highpass(const spx_word16_t *x, spx_word16_t *y, int len, int filtID, spx_mem_t *mem);
 
 
-void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_sig_t *, spx_sig_t *y2, int N, int M, spx_word16_t *mem, char *stack);
-void fir_mem_up(const spx_sig_t *x, const spx_word16_t *a, spx_sig_t *y, int N, int M, spx_word32_t *mem, char *stack);
-
-
-void filter_mem2(const spx_sig_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem);
-void fir_mem2(const spx_sig_t *x, const spx_coef_t *num, spx_sig_t *y, int N, int ord, spx_mem_t *mem);
-void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem);
+void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *, spx_word16_t *y2, int N, int M, spx_word16_t *mem, char *stack);
+void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack);
 
 void filter_mem16(const spx_word16_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack);
 void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack);
@@ -72,12 +67,11 @@ void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, in
 
 /* Apply bandwidth expansion on LPC coef */
 void bw_lpc(spx_word16_t , const spx_coef_t *lpc_in, spx_coef_t *lpc_out, int order);
+void sanitize_values32(spx_word32_t *vec, spx_word32_t min_val, spx_word32_t max_val, int len);
 
 
-
-void syn_percep_zero(const spx_sig_t *x, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_sig_t *y, int N, int ord, char *stack);
-
-void residue_percep_zero(const spx_sig_t *xx, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_sig_t *y, int N, int ord, char *stack);
+void syn_percep_zero16(const spx_word16_t *xx, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack);
+void residue_percep_zero16(const spx_word16_t *xx, const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack);
 
 void compute_impulse_response(const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack);
 
diff --git a/libspeex/filters_arm4.h b/libspeex/filters_arm4.h
index ac4d7a9..9138610 100644
--- a/libspeex/filters_arm4.h
+++ b/libspeex/filters_arm4.h
@@ -95,295 +95,3 @@ int normalize16(const spx_sig_t *x, spx_word16_t *y, int max_scale, int len)
    return sig_shift;
 }
 
-#define OVERRIDE_FILTER_MEM2
-void filter_mem2(const spx_sig_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_sig_t xi,yi,nyi;
-
-   for (i=0;i<ord;i++)
-      mem[i] = SHR32(mem[i],1);   
-   for (i=0;i<N;i++)
-   {
-      int deadm, deadn, deadd, deadidx, x1, y1, dead1, dead2, dead3, dead4, dead5, dead6;
-      xi=SATURATE(x[i],805306368);
-      yi = SATURATE(ADD32(xi, SHL(mem[0],2)),805306368);
-      nyi = -yi;
-      y[i] = yi;
-      __asm__ __volatile__ (
-            "\tldrsh %6, [%1], #2\n"
-            "\tsmull %8, %9, %4, %6\n"
-#ifdef SHORTCUTS
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-
-#else
-            ".filterloop%=: \n"
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tsubs %3, %3, #1\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-            "\t bne .filterloop%=\n"
-#endif
-            "\tmov %8, %8, lsr #15\n"
-            "\tadd %10, %8, %9, lsl #17\n"
-            "\tldrsh %6, [%2], #2\n"
-            "\tsmull %8, %9, %5, %6\n"
-            "\tadd %10, %10, %8, lsr #15\n"
-            "\tadd %10, %10, %9, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-         : "=r" (deadm), "=r" (deadn), "=r" (deadd), "=r" (deadidx),
-      "=r" (xi), "=r" (nyi), "=r" (dead1), "=r" (dead2),
-      "=r" (dead3), "=r" (dead4), "=r" (dead5), "=r" (dead6)
-         : "0" (mem), "1" (num), "2" (den), "3" (ord-1), "4" (xi), "5" (nyi)
-         : "cc", "memory");
-   
-   }
-   for (i=0;i<ord;i++)
-      mem[i] = SHL32(mem[i],1);   
-}
-
-#define OVERRIDE_IIR_MEM2
-void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_sig_t xi,yi,nyi;
-
-   for (i=0;i<ord;i++)
-      mem[i] = SHR32(mem[i],1);   
-
-   for (i=0;i<N;i++)
-   {
-      int deadm, deadd, deadidx, dead1, dead2, dead3, dead4, dead5, dead6;
-      xi=SATURATE(x[i],805306368);
-      yi = SATURATE(ADD32(xi, SHL(mem[0],2)),805306368);
-      nyi = -yi;
-      y[i] = yi;
-      __asm__ __volatile__ (
-            "\tldrsh %4, [%1], #2\n"
-            "\tsmull %5, %6, %3, %4\n"
-
-#ifdef SHORTCUTS
-                        
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %7, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %7, %7, %8\n"
-            "\tstr %7, [%0], #4 \n"
-
-                 
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %9, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %9, %9, %8\n"
-            "\tstr %9, [%0], #4 \n"
-
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %7, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %7, %7, %8\n"
-            "\tstr %7, [%0], #4 \n"
-
-            
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %9, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %9, %9, %8\n"
-            "\tstr %9, [%0], #4 \n"
-
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %7, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %7, %7, %8\n"
-            "\tstr %7, [%0], #4 \n"
-
-            
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %9, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %9, %9, %8\n"
-            "\tstr %9, [%0], #4 \n"
-
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %7, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %7, %7, %8\n"
-            "\tstr %7, [%0], #4 \n"
-
-            
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %9, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %9, %9, %8\n"
-            "\tstr %9, [%0], #4 \n"
-
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tldr %7, [%0, #4]\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %7, %7, %8\n"
-            "\tstr %7, [%0], #4 \n"
-
-            
-            
-#else
-            ".iirloop%=: \n"
-            "\tldr %7, [%0, #4]\n"
-
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %7, %7, %8\n"
-            "\tstr %7, [%0], #4 \n"
-            "\tsubs %2, %2, #1\n"
-            "\t bne .iirloop%=\n"
-            
-#endif
-            "\tmov %5, %5, lsr #15\n"
-            "\tadd %7, %5, %6, lsl #17\n"
-            "\tstr %7, [%0], #4 \n"
-
-         : "=r" (deadm), "=r" (deadd), "=r" (deadidx), "=r" (nyi),
-      "=r" (dead1), "=r" (dead2), "=r" (dead3), "=r" (dead4),
-      "=r" (dead5), "=r" (dead6)
-         : "0" (mem), "1" (den), "2" (ord-1), "3" (nyi)
-         : "cc", "memory");
-   
-   }
-   for (i=0;i<ord;i++)
-      mem[i] = SHL32(mem[i],1);   
-
-}
diff --git a/libspeex/filters_bfin.h b/libspeex/filters_bfin.h
index 2180ed4..1e433ee 100644
--- a/libspeex/filters_bfin.h
+++ b/libspeex/filters_bfin.h
@@ -79,143 +79,6 @@ int normalize16(const spx_sig_t *x, spx_word16_t *y, spx_sig_t max_scale, int le
    return sig_shift;
 }
 
-#define OVERRIDE_FILTER_MEM2
-void filter_mem2(const spx_sig_t *_x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *_y, int N, int ord, spx_mem_t *mem)
-{
-   spx_word32_t xy2[N+1];
-   spx_word32_t *xy = xy2+1;
-   spx_word32_t numden_a[2*ord+2];
-   spx_word16_t *numden = (spx_word16_t*) numden_a;
-   int i;
-   for (i=0;i<ord;i++)
-   {
-      numden[2*i] = num[i];
-      numden[2*i+1] = den[i];
-   }
-   __asm__ __volatile__
-   (
-   /* Register setup */
-   "R0 = %5;\n\t"      /*ord */
-   
-   "P0 = %3;\n\t"
-   "I0 = P0;\n\t"
-   "B0 = P0;\n\t" /* numden */
-   "L0 = 0;\n\t"
-      
-   "P2 = %0;\n\t" /* Fused xy */
-   "I2 = P2;\n\t"
-   "L2 = 0;\n\t"
-   
-   "P4 = %6;\n\t" /* mem */
-   "P0 = %1;\n\t" /* _x */
-   "P1 = %2;\n\t" /* _y */
-   
-   /* First sample */
-   "R1 = [P4++];\n\t"
-   "R1 <<= 1;\n\t" /* shift mem */
-   "R2 = [P0++];\n\t" /* load x[0] */
-   "R1 = R1 + R2;\n\t"
-   "[P1++] = R1;\n\t" /* store y[0] */
-   "R1 <<= 2;\n\t"
-   "R2 <<= 2;\n\t"
-   "R2 = PACK(R1.H, R2.H);\n\t" /* pack x16 and y16 */
-   "[P2] = R2;\n\t"
-               
-   /* Samples 1 to ord-1 (using memory) */
-   "R0 += -1;\n\t"
-   "R3 = 0;\n\t"
-   "LC0 = R0;\n\t"
-   "LOOP filter_start%= LC0;\n\t"
-   "LOOP_BEGIN filter_start%=;\n\t"
-      "R3 += 1;\n\t"
-      "LC1 = R3;\n\t"
-      
-      "R1 = [P4++];\n\t"
-      "A1 = R1;\n\t"
-      "A0 = 0;\n\t"
-      "I0 = B0;\n\t"
-      "I2 = P2;\n\t"
-      "P2 += 4;\n\t"
-      "R4 = [I0++] || R5 = [I2--];\n\t"
-      "LOOP filter_start_inner%= LC1;\n\t"
-      "LOOP_BEGIN filter_start_inner%=;\n\t"
-         "A1 -= R4.H*R5.H, A0 += R4.L*R5.L (IS) || R4 = [I0++] || R5 = [I2--];\n\t"
-      "LOOP_END filter_start_inner%=;\n\t"
-      "A0 += A1;\n\t"
-      "R4 = A0;\n\t"
-      "R4 <<= 1;\n\t" /* shift mem */
-      "R2 = [P0++];\n\t" /* load x */
-      "R4 = R4 + R2;\n\t"
-      "[P1++] = R4;\n\t" /* store y */
-      "R4 <<= 2;\n\t"
-      "R2 <<= 2;\n\t"
-      "R2 = PACK(R4.H, R2.H);\n\t" /* pack x16 and y16 */
-      "[P2] = R2;\n\t"
-
-   "LOOP_END filter_start%=;\n\t"
-
-   /* Samples ord to N*/   
-   "R0 = %5;\n\t"
-   "R0 <<= 1;\n\t"
-   "I0 = B0;\n\t" /* numden */
-   "R0 <<= 1;\n\t"   
-   "L0 = R0;\n\t"
-   
-   "R0 = %5;\n\t" /* org */
-   "R2 = %4;\n\t" /* N */
-   "R2 = R2 - R0;\n\t"
-   "R4 = [I0++];\n\t" /* numden */
-   "LC0 = R2;\n\t"
-   "P3 = R0;\n\t"
-   "R0 <<= 2;\n\t"
-   "R0 += 8;\n\t"
-   "I2 = P2;\n\t"
-   "M0 = R0;\n\t"
-   "A1 = A0 = 0;\n\t"
-   "R5 = [I2--];\n\t" /* load xy */
-   "LOOP filter_mid%= LC0;\n\t"
-   "LOOP_BEGIN filter_mid%=;\n\t"
-      "LOOP filter_mid_inner%= LC1=P3;\n\t"
-      "LOOP_BEGIN filter_mid_inner%=;\n\t"
-         "A1 -= R4.H*R5.H, A0 += R4.L*R5.L (IS) || R4 = [I0++] || R5 = [I2--];\n\t"
-      "LOOP_END filter_mid_inner%=;\n\t"
-      "R0 = (A0 += A1) || I2 += M0;\n\t"
-      "R0 = R0 << 1 || R5 = [P0++];\n\t" /* load x */
-      "R0 = R0 + R5;\n\t"
-      "R0 = R0 << 2 || [P1++] = R0;\n\t" /* shift y | store y */
-      "R5 = R5 << 2;\n\t"
-      "R5 = PACK(R0.H, R5.H);\n\t"
-      "A1 = A0 = 0 || [I2--] = R5\n\t"
-      "LOOP_END filter_mid%=;\n\t"
-   "I2 += 4;\n\t"
-   "P2 = I2;\n\t"
-   /* Update memory */
-   "P4 = %6;\n\t"
-   "R0 = %5;\n\t"
-   "LC0 = R0;\n\t"
-   "P0 = B0;\n\t"
-   "A1 = A0 = 0;\n\t"
-   "LOOP mem_update%= LC0;\n\t"
-   "LOOP_BEGIN mem_update%=;\n\t"
-      "I2 = P2;\n\t"
-      "I0 = P0;\n\t"
-      "P0 += 4;\n\t"
-      "R0 = LC0;\n\t"
-      "LC1 = R0;\n\t"
-      "R5 = [I2--] || R4 = [I0++];\n\t"
-      "LOOP mem_accum%= LC1;\n\t"
-      "LOOP_BEGIN mem_accum%=;\n\t"
-         "A1 -= R4.H*R5.H, A0 += R4.L*R5.L (IS) || R4 = [I0++] || R5 = [I2--];\n\t"
-      "LOOP_END mem_accum%=;\n\t"
-      "R0 = (A0 += A1);\n\t"
-      "A1 = A0 = 0 || [P4++] = R0;\n\t"
-   "LOOP_END mem_update%=;\n\t"
-   "L0 = 0;\n\t"
-   : : "m" (xy), "m" (_x), "m" (_y), "m" (numden), "m" (N), "m" (ord), "m" (mem)
-   : "A0", "A1", "R0", "R1", "R2", "R3", "R4", "R5", "P0", "P1", "P2", "P3", "P4", "B0", "I0", "I2", "L0", "L2", "M0", "memory"
-   );
-
-}
 
 
 #define OVERRIDE_FILTER_MEM16
@@ -363,130 +226,6 @@ void filter_mem16(const spx_word16_t *_x, const spx_coef_t *num, const spx_coef_
 
 
 
-
-#define OVERRIDE_IIR_MEM2
-void iir_mem2(const spx_sig_t *_x, const spx_coef_t *den, spx_sig_t *_y, int N, int ord, spx_mem_t *mem)
-{
-   spx_word16_t y[N+2];
-   spx_word16_t *yy;
-   yy = y+2;
-   __asm__ __volatile__
-   (
-   /* Register setup */
-   "R0 = %5;\n\t"      /*ord */
-   
-   "P1 = %3;\n\t"
-   "I1 = P1;\n\t"
-   "B1 = P1;\n\t"
-   "L1 = 0;\n\t"
-   
-   "P3 = %0;\n\t"
-   "I3 = P3;\n\t"
-   "L3 = 0;\n\t"
-   
-   "P4 = %6;\n\t"
-   "P0 = %1;\n\t"
-   "P1 = %2;\n\t"
-   
-   /* First sample */
-   "R1 = [P4++];\n\t"
-   "R1 <<= 1;\n\t"
-   "R2 = [P0++];\n\t"
-   "R1 = R1 + R2;\n\t"
-   "[P1++] = R1;\n\t"
-   "R1 <<= 2;\n\t"
-   "W[P3] = R1.H;\n\t"
-   "R2 <<= 2;\n\t"
-
-   /* Samples 1 to ord-1 (using memory) */
-   "R0 += -1;\n\t"
-   "R3 = 0;\n\t"
-   "LC0 = R0;\n\t"
-   "LOOP filter_start%= LC0;\n\t"
-   "LOOP_BEGIN filter_start%=;\n\t"
-      "R3 += 1;\n\t"
-      "LC1 = R3;\n\t"
-      
-      "R1 = [P4++];\n\t"
-      "A1 = R1;\n\t"
-      "I1 = B1;\n\t"
-      "I3 = P3;\n\t"
-      "P3 += 2;\n\t"
-      "LOOP filter_start_inner%= LC1;\n\t"
-      "LOOP_BEGIN filter_start_inner%=;\n\t"
-         "R4.L = W[I1++];\n\t"
-         "R5.L = W[I3--];\n\t"
-         "A1 -= R4.L*R5.L (IS);\n\t"
-      "LOOP_END filter_start_inner%=;\n\t"
-   
-      "R1 = A1;\n\t"
-      "R1 <<= 1;\n\t"
-      "R2 = [P0++];\n\t"
-      "R1 = R1 + R2;\n\t"
-      "[P1++] = R1;\n\t"
-      "R1 <<= 2;\n\t"
-      "W[P3] = R1.H;\n\t"
-      "R2 <<= 2;\n\t"
-   "LOOP_END filter_start%=;\n\t"
-
-   /* Samples ord to N*/   
-   "R0 = %5;\n\t"
-   "R0 <<= 1;\n\t"
-   "I1 = B1;\n\t"
-   "L1 = R0;\n\t"
-   
-   "R0 = %5;\n\t"
-   "R2 = %4;\n\t"
-   "R2 = R2 - R0;\n\t"
-   "R4.L = W[I1++];\n\t"
-   "LC0 = R2;\n\t"
-   "LOOP filter_mid%= LC0;\n\t"
-   "LOOP_BEGIN filter_mid%=;\n\t"
-      "LC1 = R0;\n\t"
-      "A1 = 0;\n\t"
-      "I3 = P3;\n\t"
-      "P3 += 2;\n\t"
-      "R5.L = W[I3--];\n\t"
-      "LOOP filter_mid_inner%= LC1;\n\t"
-      "LOOP_BEGIN filter_mid_inner%=;\n\t"
-         "A1 -= R4.L*R5.L (IS) || R4.L = W[I1++] || R5.L = W[I3--];\n\t"
-      "LOOP_END filter_mid_inner%=;\n\t"
-      "R1 = A1;\n\t"
-      "R1 = R1 << 1 || R2 = [P0++];\n\t"
-      "R1 = R1 + R2;\n\t"
-      "R1 = R1 << 2 || [P1++] = R1;\n\t"
-      "W[P3] = R1.H;\n\t"
-   "LOOP_END filter_mid%=;\n\t"
-     
-   /* Update memory */
-   "P4 = %6;\n\t"
-   "R0 = %5;\n\t"
-   "LC0 = R0;\n\t"
-   "P1 = B1;\n\t"
-   "LOOP mem_update%= LC0;\n\t"
-   "LOOP_BEGIN mem_update%=;\n\t"
-      "A0 = 0;\n\t"
-      "I3 = P3;\n\t"
-      "I1 = P1;\n\t"
-      "P1 += 2;\n\t"
-      "R0 = LC0;\n\t"
-      "LC1=R0;\n\t"
-      "R5.L = W[I3--] || R4.L = W[I1++];\n\t"
-      "LOOP mem_accum%= LC1;\n\t"
-      "LOOP_BEGIN mem_accum%=;\n\t"
-         "A0 -= R4.L*R5.L (IS) || R4.L = W[I1++] || R5.L = W[I3--];\n\t"
-      "LOOP_END mem_accum%=;\n\t"
-      "R0 = A0;\n\t"
-      "[P4++] = R0;\n\t"
-   "LOOP_END mem_update%=;\n\t"
-   "L1 = 0;\n\t"
-   : : "m" (yy), "m" (_x), "m" (_y), "m" (den), "m" (N), "m" (ord), "m" (mem)
-   : "A0", "A1", "R0", "R1", "R2", "R3", "R4", "R5", "P0", "P1", "P2", "P3", "P4", "B1", "I1", "I3", "L1", "L3", "memory"
-   );
-
-}
-
-
 #define OVERRIDE_IIR_MEM16
 void iir_mem16(const spx_word16_t *_x, const spx_coef_t *den, spx_word16_t *_y, int N, int ord, spx_mem_t *mem, char *stack)
 {
@@ -612,18 +351,6 @@ void iir_mem16(const spx_word16_t *_x, const spx_coef_t *den, spx_word16_t *_y,
 }
 
 
-#define OVERRIDE_FIR_MEM2
-void fir_mem2(const spx_sig_t *x, const spx_coef_t *num, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i;
-   spx_coef_t den2[12];
-   spx_coef_t *den;
-   den = (spx_coef_t*)((((int)den2)+4)&0xfffffffc);
-   for (i=0;i<10;i++)
-      den[i] = 0;
-   filter_mem2(x, num, den, y, N, ord, mem);
-}
-
 #define OVERRIDE_FIR_MEM16
 void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
 {
diff --git a/libspeex/filters_sse.h b/libspeex/filters_sse.h
index 2f03747..4bb333d 100644
--- a/libspeex/filters_sse.h
+++ b/libspeex/filters_sse.h
@@ -34,7 +34,7 @@
 
 #include <xmmintrin.h>
 
-void filter_mem2_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
+void filter_mem16_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
 {
    __m128 num[3], den[3], mem[3];
 
@@ -87,7 +87,7 @@ void filter_mem2_10(const float *x, const float *_num, const float *_den, float
    _mm_store_ss(_mem+9, mem[2]);
 }
 
-void filter_mem2_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
+void filter_mem16_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
 {
    __m128 num[2], den[2], mem[2];
 
@@ -130,18 +130,18 @@ void filter_mem2_8(const float *x, const float *_num, const float *_den, float *
 }
 
 
-#define OVERRIDE_FILTER_MEM2
-void filter_mem2(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
+#define OVERRIDE_FILTER_MEM16
+void filter_mem16(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
 {
    if(ord==10)
-      filter_mem2_10(x, _num, _den, y, N, ord, _mem);
+      filter_mem16_10(x, _num, _den, y, N, ord, _mem);
    else if (ord==8)
-      filter_mem2_8(x, _num, _den, y, N, ord, _mem);
+      filter_mem16_8(x, _num, _den, y, N, ord, _mem);
 }
 
 
 
-void iir_mem2_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
+void iir_mem16_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
 {
    __m128 den[3], mem[3];
 
@@ -190,7 +190,7 @@ void iir_mem2_10(const float *x, const float *_den, float *y, int N, int ord, fl
 }
 
 
-void iir_mem2_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
+void iir_mem16_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
 {
    __m128 den[2], mem[2];
 
@@ -229,17 +229,17 @@ void iir_mem2_8(const float *x, const float *_den, float *y, int N, int ord, flo
    _mm_storeu_ps(_mem+4, mem[1]);
 }
 
-#define OVERRIDE_IIR_MEM2
-void iir_mem2(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
+#define OVERRIDE_IIR_MEM16
+void iir_mem16(const float *x, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
 {
    if(ord==10)
-      iir_mem2_10(x, _den, y, N, ord, _mem);
+      iir_mem16_10(x, _den, y, N, ord, _mem);
    else if (ord==8)
-      iir_mem2_8(x, _den, y, N, ord, _mem);
+      iir_mem16_8(x, _den, y, N, ord, _mem);
 }
 
 
-void fir_mem2_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
+void fir_mem16_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
 {
    __m128 num[3], mem[3];
 
@@ -287,7 +287,7 @@ void fir_mem2_10(const float *x, const float *_num, float *y, int N, int ord, fl
    _mm_store_ss(_mem+9, mem[2]);
 }
 
-void fir_mem2_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
+void fir_mem16_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
 {
    __m128 num[2], mem[2];
 
@@ -326,11 +326,11 @@ void fir_mem2_8(const float *x, const float *_num, float *y, int N, int ord, flo
    _mm_storeu_ps(_mem+4, mem[1]);
 }
 
-#define OVERRIDE_FIR_MEM2
-void fir_mem2(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
+#define OVERRIDE_FIR_MEM16
+void fir_mem16(const float *x, const float *_num, float *y, int N, int ord, float *_mem, char *stack)
 {
    if(ord==10)
-      fir_mem2_10(x, _num, y, N, ord, _mem);
+      fir_mem16_10(x, _num, y, N, ord, _mem);
    else if (ord==8)
-      fir_mem2_8(x, _num, y, N, ord, _mem);
+      fir_mem16_8(x, _num, y, N, ord, _mem);
 }
diff --git a/libspeex/fixed_debug.h b/libspeex/fixed_debug.h
index 65c5712..d5c449f 100644
--- a/libspeex/fixed_debug.h
+++ b/libspeex/fixed_debug.h
@@ -74,53 +74,57 @@ static inline int NEG32(long long x)
    return res;
 }
 
-static inline short EXTRACT16(int x)
+#define EXTRACT16(x) _EXTRACT16(x, __FILE__, __LINE__)
+static inline short _EXTRACT16(int x, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(x))
    {
-      fprintf (stderr, "EXTRACT16: input is not short: %d\n", x);
+      fprintf (stderr, "EXTRACT16: input is not short: %d in %s: line %d\n", x, file, line);
    }
    res = x;
    spx_mips++;
    return res;
 }
 
-static inline int EXTEND32(int x)
+#define EXTEND32(x) _EXTEND32(x, __FILE__, __LINE__)
+static inline int _EXTEND32(int x, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(x))
    {
-      fprintf (stderr, "EXTRACT16: input is not short: %d\n", x);
+      fprintf (stderr, "EXTEND32: input is not short: %d in %s: line %d\n", x, file, line);
    }
    res = x;
    spx_mips++;
    return res;
 }
 
-static inline short SHR16(int a, int shift) 
+#define SHR16(a, shift) _SHR16(a, shift, __FILE__, __LINE__)
+static inline short _SHR16(int a, int shift, char *file, int line) 
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
    {
-      fprintf (stderr, "SHR16: inputs are not short: %d %d\n", a, shift);
+      fprintf (stderr, "SHR16: inputs are not short: %d >> %d in %s: line %d\n", a, shift, file, line);
    }
    res = a>>shift;
    if (!VERIFY_SHORT(res))
-      fprintf (stderr, "SHR16: output is not short: %d\n", res);
+      fprintf (stderr, "SHR16: output is not short: %d in %s: line %d\n", res, file, line);
    spx_mips++;
    return res;
 }
-static inline short SHL16(int a, int shift) 
+#define SHL16(a, shift) _SHL16(a, shift, __FILE__, __LINE__)
+static inline short _SHL16(int a, int shift, char *file, int line) 
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
    {
-      fprintf (stderr, "SHR16: inputs are not short: %d %d\n", a, shift);
+      fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
    }
    res = a<<shift;
    if (!VERIFY_SHORT(res))
-      fprintf (stderr, "SHR16: output is not short: %d\n", res);
+      fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
    spx_mips++;
    return res;
 }
@@ -134,7 +138,9 @@ static inline int SHR32(long long a, int shift)
    }
    res = a>>shift;
    if (!VERIFY_INT(res))
+   {
       fprintf (stderr, "SHR32: output is not int: %d\n", (int)res);
+   }
    spx_mips++;
    return res;
 }
@@ -143,62 +149,71 @@ static inline int SHL32(long long a, int shift)
    long long  res;
    if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
    {
-      fprintf (stderr, "SHR32: inputs are not int: %d %d\n", (int)a, shift);
+      fprintf (stderr, "SHL32: inputs are not int: %d %d\n", (int)a, shift);
    }
    res = a<<shift;
    if (!VERIFY_INT(res))
-      fprintf (stderr, "SHR32: output is not int: %d\n", (int)res);
+   {
+      fprintf (stderr, "SHL32: output is not int: %d\n", (int)res);
+   }
    spx_mips++;
    return res;
 }
 
+#define PSHR16(a,shift) (SHR16(ADD16((a),((1<<((shift))>>1))),shift))
+#define PSHR32(a,shift) (SHR32(ADD32((a),((1<<((shift))>>1))),shift))
+#define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift)))
 
-#define PSHR16(a,shift) (SHR16(ADD16(a,(1<<((shift)-1))),shift))
-#define PSHR32(a,shift) (SHR32(ADD32(a,(1<<((shift)-1))),shift))
 #define SATURATE16(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 #define SATURATE32(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 
-#define SHR(a,shift) ((a) >> (shift))
-#define SHL(a,shift) ((a) << (shift))
+//#define SHR(a,shift) ((a) >> (shift))
+//#define SHL(a,shift) ((a) << (shift))
 
-static inline short ADD16(int a, int b) 
+#define ADD16(a, b) _ADD16(a, b, __FILE__, __LINE__)
+static inline short _ADD16(int a, int b, char *file, int line) 
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
    {
-      fprintf (stderr, "ADD16: inputs are not short: %d %d\n", a, b);
+      fprintf (stderr, "ADD16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line);
    }
    res = a+b;
    if (!VERIFY_SHORT(res))
-      fprintf (stderr, "ADD16: output is not short: %d+%d=%d\n", a,b,res);
+   {
+      fprintf (stderr, "ADD16: output is not short: %d+%d=%d in %s: line %d\n", a,b,res, file, line);
+   }
    spx_mips++;
    return res;
 }
-static inline short SUB16(int a, int b) 
+
+#define SUB16(a, b) _SUB16(a, b, __FILE__, __LINE__)
+static inline short _SUB16(int a, int b, char *file, int line) 
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
    {
-      fprintf (stderr, "SUB16: inputs are not short: %d %d\n", a, b);
+      fprintf (stderr, "SUB16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line);
    }
    res = a-b;
    if (!VERIFY_SHORT(res))
-      fprintf (stderr, "SUB16: output is not short: %d\n", res);
+      fprintf (stderr, "SUB16: output is not short: %d in %s: line %d\n", res, file, line);
    spx_mips++;
    return res;
 }
 
-static inline int ADD32(long long a, long long b) 
+#define ADD32(a, b) _ADD32(a, b, __FILE__, __LINE__)
+static inline int _ADD32(long long a, long long b, char *file, int line) 
 {
    long long res;
    if (!VERIFY_INT(a) || !VERIFY_INT(b))
    {
-      fprintf (stderr, "ADD32: inputs are not int: %d %d\n", (int)a, (int)b);
+      fprintf (stderr, "ADD32: inputs are not int: %d %d in %s: line %d\n", (int)a, (int)b, file, line);
    }
    res = a+b;
    if (!VERIFY_INT(res))
    {
-      fprintf (stderr, "ADD32: output is not int: %d\n", (int)res);
+      fprintf (stderr, "ADD32: output is not int: %d in %s: line %d\n", (int)res, file, line);
    }
    spx_mips++;
    return res;
@@ -220,8 +235,6 @@ static inline int SUB32(long long a, long long b)
 
 #define ADD64(a,b) (MIPS_INC(a)+(b))
 
-#define PSHR(a,shift) (SHR((a)+(1<<((shift)-1)),shift))
-
 /* result fits in 16 bits */
 static inline short MULT16_16_16(int a, int b) 
 {
@@ -237,36 +250,56 @@ static inline short MULT16_16_16(int a, int b)
    return res;
 }
 
-static inline int MULT16_16(int a, int b) 
+#define MULT16_16(a, b) _MULT16_16(a, b, __FILE__, __LINE__)
+static inline int _MULT16_16(int a, int b, char *file, int line) 
 {
    long long res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
    {
-      fprintf (stderr, "MULT16_16: inputs are not short: %d %d\n", a, b);
+      fprintf (stderr, "MULT16_16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line);
    }
    res = ((long long)a)*b;
    if (!VERIFY_INT(res))
-      fprintf (stderr, "MULT16_16: output is not int: %d\n", (int)res);
+      fprintf (stderr, "MULT16_16: output is not int: %d in %s: line %d\n", (int)res, file, line);
    spx_mips++;
    return res;
 }
 
 #define MAC16_16(c,a,b)     (spx_mips--,ADD32((c),MULT16_16((a),(b))))
-#define MAC16_16_Q11(c,a,b)     (ADD16((c),EXTRACT16(SHR32(MULT16_16((a),(b)),11))))
-#define MAC16_16_Q13(c,a,b)     (ADD16((c),EXTRACT16(SHR32(MULT16_16((a),(b)),13))))
-#define MAC16_16_P13(c,a,b)     (ADD32((c),SHR(ADD32(4096,MULT16_16((a),(b))),13)))
+#define MAC16_16_Q11(c,a,b)     (EXTRACT16(ADD16((c),EXTRACT16(SHR32(MULT16_16((a),(b)),11)))))
+#define MAC16_16_Q13(c,a,b)     (EXTRACT16(ADD16((c),EXTRACT16(SHR32(MULT16_16((a),(b)),13)))))
+#define MAC16_16_P13(c,a,b)     (EXTRACT16(ADD32((c),SHR32(ADD32(4096,MULT16_16((a),(b))),13))))
 
 
-static inline int MULT16_32_QX(int a, long long b, int Q)
+#define MULT16_32_QX(a, b, Q) _MULT16_32_QX(a, b, Q, __FILE__, __LINE__)
+static inline int _MULT16_32_QX(int a, long long b, int Q, char *file, int line)
 {
    long long res;
    if (!VERIFY_SHORT(a) || !VERIFY_INT(b))
    {
-      fprintf (stderr, "MULT16_32_Q%d: inputs are not short+int: %d %d\n", Q, (int)a, (int)b);
+      fprintf (stderr, "MULT16_32_Q%d: inputs are not short+int: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);
    }
+   if (ABS32(b)>=(1<<(15+Q)))
+      fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);      
    res = (((long long)a)*(long long)b) >> Q;
    if (!VERIFY_INT(res))
-      fprintf (stderr, "MULT16_32_Q%d: output is not int: %d*%d=%d\n", Q, (int)a, (int)b,(int)res);
+      fprintf (stderr, "MULT16_32_Q%d: output is not int: %d*%d=%d in %s: line %d\n", Q, (int)a, (int)b,(int)res, file, line);
+   spx_mips+=5;
+   return res;
+}
+
+static inline int MULT16_32_PX(int a, long long b, int Q) /* checked (a*b)>>Q with rounding; range violations are reported on stderr */
+{
+   long long res;
+   if (!VERIFY_SHORT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT16_32_P%d: inputs are not short+int: %d %d\n", Q, (int)a, (int)b);
+   }
+   if (ABS32(b)>=(1<<(15+Q)))
+      fprintf (stderr, "MULT16_32_P%d: second operand too large: %d %d\n", Q, (int)a, (int)b); /* tagged P%d like the other messages here, so the warning is not attributed to MULT16_32_QX */
+   res = ((((long long)a)*(long long)b) + ((1<<Q)>>1))>> Q; /* (1<<Q)>>1 is the rounding offset added before the shift */
+   if (!VERIFY_INT(res))
+      fprintf (stderr, "MULT16_32_P%d: output is not int: %d*%d=%d\n", Q, (int)a, (int)b,(int)res);
    spx_mips+=5;
    return res;
 }
@@ -278,6 +311,7 @@ static inline int MULT16_32_QX(int a, long long b, int Q)
 #define MULT16_32_Q13(a,b) MULT16_32_QX(a,b,13)
 #define MULT16_32_Q14(a,b) MULT16_32_QX(a,b,14)
 #define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15)
+#define MULT16_32_P15(a,b) MULT16_32_PX(a,b,15)
 #define MAC16_32_Q15(c,a,b) ADD32((c),MULT16_32_Q15((a),(b)))
 
 static inline int SATURATE(int a, int b)
@@ -341,7 +375,9 @@ static inline short MULT16_16_Q15(int a, int b)
    res = ((long long)a)*b;
    res >>= 15;
    if (!VERIFY_SHORT(res))
+   {
       fprintf (stderr, "MULT16_16_Q15: output is not short: %d\n", (int)res);
+   }
    spx_mips+=3;
    return res;
 }
@@ -398,23 +434,24 @@ static inline short MULT16_16_P15(int a, int b)
    return res;
 }
 
+#define DIV32_16(a, b) _DIV32_16(a, b, __FILE__, __LINE__)
 
-static inline int DIV32_16(long long a, long long b) 
+static inline int _DIV32_16(long long a, long long b, char *file, int line) 
 {
    long long res;
    if (b==0)
    {
-      fprintf(stderr, "DIV32_16: divide by zero: %d/%d\n", (int)a, (int)b);
+      fprintf(stderr, "DIV32_16: divide by zero: %d/%d in %s: line %d\n", (int)a, (int)b, file, line);
       return 0;
    }
    if (!VERIFY_INT(a) || !VERIFY_SHORT(b))
    {
-      fprintf (stderr, "DIV32_16: inputs are not int/short: %d %d\n", (int)a, (int)b);
+      fprintf (stderr, "DIV32_16: inputs are not int/short: %d %d in %s: line %d\n", (int)a, (int)b, file, line);
    }
    res = a/b;
    if (!VERIFY_SHORT(res))
    {
-      fprintf (stderr, "DIV32_16: output is not short: %d / %d = %d\n", (int)a,(int)b,(int)res);
+      fprintf (stderr, "DIV32_16: output is not short: %d / %d = %d in %s: line %d\n", (int)a,(int)b,(int)res, file, line);
       if (res>32767)
          res = 32767;
       if (res<-32768)
@@ -423,22 +460,24 @@ static inline int DIV32_16(long long a, long long b)
    spx_mips+=20;
    return res;
 }
-static inline int DIV32(long long a, long long b) 
+
+#define DIV32(a, b) _DIV32(a, b, __FILE__, __LINE__) /* forwards the call site's file and line for the diagnostics below */
+static inline int _DIV32(long long a, long long b, char *file, int line) /* checked a/b; reports bad operands on stderr and returns 0 when b is 0 */
 {
    long long res;
    if (b==0)
    {
-      fprintf(stderr, "DIV32: divide by zero: %d/%d\n", (int)a, (int)b);
+      fprintf(stderr, "DIV32: divide by zero: %d/%d in %s: line %d\n", (int)a, (int)b, file, line);
       return 0;
    }
 
    if (!VERIFY_INT(a) || !VERIFY_INT(b))
    {
-      fprintf (stderr, "DIV32: inputs are not int/short: %d %d\n", (int)a, (int)b);
+      fprintf (stderr, "DIV32: inputs are not int: %d %d in %s: line %d\n", (int)a, (int)b, file, line); /* both operands are checked with VERIFY_INT, so the message says "int" rather than "int/short" */
    }
    res = a/b;
    if (!VERIFY_INT(res))
-      fprintf (stderr, "DIV32: output is not int: %d\n", (int)res);
+      fprintf (stderr, "DIV32: output is not int: %d in %s: line %d\n", (int)res, file, line);
    spx_mips+=36;
    return res;
 }
diff --git a/libspeex/fixed_generic.h b/libspeex/fixed_generic.h
index 375050c..2948177 100644
--- a/libspeex/fixed_generic.h
+++ b/libspeex/fixed_generic.h
@@ -46,14 +46,15 @@
 #define SHL16(a,shift) ((a) << (shift))
 #define SHR32(a,shift) ((a) >> (shift))
 #define SHL32(a,shift) ((a) << (shift))
-#define PSHR16(a,shift) (SHR16((a)+(1<<((shift)-1)),shift))
-#define PSHR32(a,shift) (SHR32((a)+(1<<((shift)-1)),shift))
+#define PSHR16(a,shift) (SHR16((a)+((1<<((shift))>>1)),shift))
+#define PSHR32(a,shift) (SHR32((a)+((1<<((shift))>>1)),shift))
+#define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift)))
 #define SATURATE16(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 #define SATURATE32(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 
 #define SHR(a,shift) ((a) >> (shift))
 #define SHL(a,shift) ((spx_word32_t)(a) << (shift))
-#define PSHR(a,shift) (SHR((a)+(1<<((shift)-1)),shift))
+#define PSHR(a,shift) (SHR((a)+((1<<((shift))>>1)),shift))
 #define SATURATE(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 
 
@@ -77,6 +78,7 @@
 #define MULT16_32_Q11(a,b) ADD32(MULT16_16((a),SHR((b),11)), SHR(MULT16_16((a),((b)&0x000007ff)),11))
 #define MAC16_32_Q11(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),11)), SHR(MULT16_16((a),((b)&0x000007ff)),11)))
 
+#define MULT16_32_P15(a,b) ADD32(MULT16_16((a),SHR((b),15)), PSHR(MULT16_16((a),((b)&0x00007fff)),15))
 #define MULT16_32_Q15(a,b) ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15))
 #define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
 
diff --git a/libspeex/jitter.c b/libspeex/jitter.c
index 6d5f2ad..2b64453 100644
--- a/libspeex/jitter.c
+++ b/libspeex/jitter.c
@@ -41,9 +41,12 @@
 #include <speex/speex.h>
 #include <speex/speex_bits.h>
 #include <speex/speex_jitter.h>
-#include <stdio.h>
 
-#define LATE_BINS 10
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define LATE_BINS 15
 #define MAX_MARGIN 30                     /**< Number of bins in margin histogram */
 
 #define SPEEX_JITTER_MAX_BUFFER_SIZE 200   /**< Maximum number of packets in jitter buffer */
@@ -68,7 +71,9 @@ struct JitterBuffer_ {
    int tick_size;                                                         /**< Output granularity                  */
    int reset_state;                                                       /**< True if state was just reset        */
    int buffer_margin;                                                     /**< How many frames we want to keep in the buffer (lower bound) */
-   
+   int late_cutoff;                                                       /**< How late must a packet be for it not to be considered at all */
+   int interp_requested;                                                  /**< An interpolation is requested by jitter_buffer_update_delay() */
+
    int lost_count;                                                        /**< Number of consecutive lost packets  */
    float shortterm_margin[MAX_MARGIN];                                    /**< Short term margin histogram         */
    float longterm_margin[MAX_MARGIN];                                     /**< Long term margin histogram          */
@@ -86,6 +91,7 @@ JitterBuffer *jitter_buffer_init(int tick)
          jitter->buf[i]=NULL;
       jitter->tick_size = tick;
       jitter->buffer_margin = 1;
+      jitter->late_cutoff = 50;
       jitter_buffer_reset(jitter);
    }
    return jitter;
@@ -141,6 +147,7 @@ void jitter_buffer_put(JitterBuffer *jitter, const JitterBufferPacket *packet)
    /* Cleanup buffer (remove old packets that weren't played) */
    for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
    {
+      /* Make sure we don't discard a "just-late" packet in case we want to play it next (if we interpolate). */
       if (jitter->buf[i] && LE32(jitter->timestamp[i] + jitter->span[i], jitter->pointer_timestamp))
       {
          /*fprintf (stderr, "cleaned (not played)\n");*/
@@ -187,27 +194,33 @@ void jitter_buffer_put(JitterBuffer *jitter, const JitterBufferPacket *packet)
    jitter->span[i]=packet->span;
    jitter->len[i]=packet->len;
    
-   /* Adjust the buffer size depending on network conditions */
-   arrival_margin = (packet->timestamp - jitter->current_timestamp) - jitter->buffer_margin*jitter->tick_size;
+   /* Adjust the buffer size depending on network conditions.
+      The arrival margin is how much in advance (or late) the packet is */
+   arrival_margin = (((spx_int32_t)packet->timestamp) - ((spx_int32_t)jitter->current_timestamp))/jitter->tick_size - jitter->buffer_margin;
    
-   if (arrival_margin >= -LATE_BINS*jitter->tick_size)
+   if (arrival_margin >= -jitter->late_cutoff)
    {
+      /* Here we compute the histogram based on the time of arrival of the packet.
+         This is based on a (first-order) recursive average. We keep both a short-term
+         histogram and a long-term histogram */
       spx_int32_t int_margin;
+      /* First, apply the "damping" of the recursive average to all bins */
       for (i=0;i<MAX_MARGIN;i++)
       {
          jitter->shortterm_margin[i] *= .98;
          jitter->longterm_margin[i] *= .995;
       }
-      int_margin = LATE_BINS + arrival_margin/jitter->tick_size;
+      /* What histogram bin the packet should be counted in */
+      int_margin = LATE_BINS + arrival_margin;
       if (int_margin>MAX_MARGIN-1)
          int_margin = MAX_MARGIN-1;
-      if (int_margin>=0)
-      {
-         jitter->shortterm_margin[int_margin] += .02;
-         jitter->longterm_margin[int_margin] += .005;
-      }
+      if (int_margin<0)
+         int_margin = 0;
+      /* Add the packet to the right bin */
+      jitter->shortterm_margin[int_margin] += .02;
+      jitter->longterm_margin[int_margin] += .005;
    } else {
-      
+      /* Packet has arrived *way* too late, we pretty much consider it lost and do not take it into account in the histogram */
       /*fprintf (stderr, "way too late = %d\n", arrival_margin);*/
       if (jitter->lost_count>20)
       {
@@ -229,9 +242,10 @@ void jitter_buffer_put(JitterBuffer *jitter, const JitterBufferPacket *packet)
 }
 
 /** Get one packet from the jitter buffer */
-int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint32_t *start_offset)
+int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_int32_t *start_offset)
 {
-   int i, j;
+   int i;
+   unsigned int j;
    float late_ratio_short;
    float late_ratio_long;
    float ontime_ratio_short;
@@ -241,6 +255,17 @@ int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint
    int chunk_size;
    int incomplete = 0;
    
+   if (jitter->interp_requested)
+   {
+      jitter->interp_requested = 0;
+      if (start_offset)
+         *start_offset = 0;
+      packet->timestamp = jitter->pointer_timestamp;
+      packet->span = jitter->tick_size;
+      jitter->pointer_timestamp += jitter->tick_size;
+      packet->len = 0;
+      return JITTER_BUFFER_MISSING;
+   }
    if (LT32(jitter->current_timestamp+jitter->tick_size, jitter->pointer_timestamp))
    {
       jitter->current_timestamp = jitter->pointer_timestamp;
@@ -255,14 +280,17 @@ int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint
    
    late_ratio_short = 0;
    late_ratio_long = 0;
+   /* Count the proportion of packets that are late */
    for (i=0;i<LATE_BINS;i++)
    {
       late_ratio_short += jitter->shortterm_margin[i];
       late_ratio_long += jitter->longterm_margin[i];
    }
+   /* Count the proportion of packets that are just on time */
    ontime_ratio_short = jitter->shortterm_margin[LATE_BINS];
    ontime_ratio_long = jitter->longterm_margin[LATE_BINS];
    early_ratio_short = early_ratio_long = 0;
+   /* Count the proportion of packets that are early */
    for (i=LATE_BINS+1;i<MAX_MARGIN;i++)
    {
       early_ratio_short += jitter->shortterm_margin[i];
@@ -274,42 +302,6 @@ int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint
       /*fprintf (stderr, "%f %f\n", early_ratio_short + ontime_ratio_short + late_ratio_short, early_ratio_long + ontime_ratio_long + late_ratio_long);*/
    }
    
-   /* Adjusting the buffering */
-   
-   if (late_ratio_short > .1 || late_ratio_long > .03)
-   {
-      /* If too many packets are arriving late */
-      jitter->shortterm_margin[MAX_MARGIN-1] += jitter->shortterm_margin[MAX_MARGIN-2];
-      jitter->longterm_margin[MAX_MARGIN-1] += jitter->longterm_margin[MAX_MARGIN-2];
-      for (i=MAX_MARGIN-3;i>=0;i--)
-      {
-         jitter->shortterm_margin[i+1] = jitter->shortterm_margin[i];
-         jitter->longterm_margin[i+1] = jitter->longterm_margin[i];         
-      }
-      jitter->shortterm_margin[0] = 0;
-      jitter->longterm_margin[0] = 0;            
-      jitter->pointer_timestamp -= jitter->tick_size;
-      jitter->current_timestamp -= jitter->tick_size;
-      /*fprintf (stderr, "i");*/
-      /*fprintf (stderr, "interpolate (getting some slack)\n");*/
-   } else if (late_ratio_short + ontime_ratio_short < .005 && late_ratio_long + ontime_ratio_long < .01 && early_ratio_short > .8)
-   {
-      /* Many frames arriving early */
-      jitter->shortterm_margin[0] += jitter->shortterm_margin[1];
-      jitter->longterm_margin[0] += jitter->longterm_margin[1];
-      for (i=1;i<MAX_MARGIN-1;i++)
-      {
-         jitter->shortterm_margin[i] = jitter->shortterm_margin[i+1];
-         jitter->longterm_margin[i] = jitter->longterm_margin[i+1];         
-      }
-      jitter->shortterm_margin[MAX_MARGIN-1] = 0;
-      jitter->longterm_margin[MAX_MARGIN-1] = 0;      
-      /*fprintf (stderr, "drop frame\n");*/
-      /*fprintf (stderr, "d");*/
-      jitter->pointer_timestamp += jitter->tick_size;
-      jitter->current_timestamp += jitter->tick_size;
-      /*fprintf (stderr, "dropping packet (getting more aggressive)\n");*/
-   }
    
    /* Searching for the packet that fits best */
    
@@ -325,7 +317,7 @@ int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint
    {
       for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
       {
-         if (jitter->buf[i] && jitter->timestamp[i]<=jitter->pointer_timestamp && GE32(jitter->timestamp[i]+jitter->span[i],jitter->pointer_timestamp+chunk_size))
+         if (jitter->buf[i] && LE32(jitter->timestamp[i], jitter->pointer_timestamp) && GE32(jitter->timestamp[i]+jitter->span[i],jitter->pointer_timestamp+chunk_size))
             break;
       }
    }
@@ -335,7 +327,7 @@ int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint
    {
       for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
       {
-         if (jitter->buf[i] && jitter->timestamp[i]<=jitter->pointer_timestamp && GT32(jitter->timestamp[i]+jitter->span[i],jitter->pointer_timestamp))
+         if (jitter->buf[i] && LE32(jitter->timestamp[i], jitter->pointer_timestamp) && GT32(jitter->timestamp[i]+jitter->span[i],jitter->pointer_timestamp))
             break;
       }
    }
@@ -385,7 +377,7 @@ int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint
       jitter->buf[i] = NULL;
       /* Set timestamp and span (if requested) */
       if (start_offset)
-         *start_offset = jitter->timestamp[i]-jitter->pointer_timestamp;
+         *start_offset = (spx_int32_t)jitter->timestamp[i]-(spx_int32_t)jitter->pointer_timestamp;
       packet->timestamp = jitter->timestamp[i];
       packet->span = jitter->span[i];
       /* Point at the end of the current packet */
@@ -409,6 +401,26 @@ int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint
    packet->span = jitter->tick_size;
    jitter->pointer_timestamp += chunk_size;
    packet->len = 0;
+   
+   /* Adjusting the buffering based on the amount of packets that are early/on time/late */   
+   if (late_ratio_short > .1 || late_ratio_long > .03)
+   {
+      /* If too many packets are arriving late */
+      jitter->shortterm_margin[MAX_MARGIN-1] += jitter->shortterm_margin[MAX_MARGIN-2];
+      jitter->longterm_margin[MAX_MARGIN-1] += jitter->longterm_margin[MAX_MARGIN-2];
+      for (i=MAX_MARGIN-3;i>=0;i--)
+      {
+         jitter->shortterm_margin[i+1] = jitter->shortterm_margin[i];
+         jitter->longterm_margin[i+1] = jitter->longterm_margin[i];         
+      }
+      jitter->shortterm_margin[0] = 0;
+      jitter->longterm_margin[0] = 0;            
+      jitter->pointer_timestamp -= jitter->tick_size;
+      jitter->current_timestamp -= jitter->tick_size;
+      /*fprintf (stderr, "i");*/
+      /*fprintf (stderr, "interpolate (getting some slack)\n");*/
+   }
+
    return JITTER_BUFFER_MISSING;
 
 }
@@ -424,7 +436,113 @@ void jitter_buffer_tick(JitterBuffer *jitter)
    jitter->current_timestamp += jitter->tick_size;
 }
 
+/* Let the jitter buffer know it's the right time to adjust the buffering delay to the network conditions. Sums the short-term and long-term margin bins into late/on-time/early ratios, then returns JITTER_BUFFER_ADJUST_INTERPOLATE (too many late packets), JITTER_BUFFER_ADJUST_DROP (nearly everything early) or JITTER_BUFFER_ADJUST_OK (nothing changed). The packet and start_offset arguments are not used in this body. */
+int jitter_buffer_update_delay(JitterBuffer *jitter, JitterBufferPacket *packet, spx_int32_t *start_offset)
+{
+   int i;
+   float late_ratio_short;
+   float late_ratio_long;
+   float ontime_ratio_short;
+   float ontime_ratio_long;
+   float early_ratio_short;
+   float early_ratio_long;
+   
+   if (LT32(jitter->current_timestamp+jitter->tick_size, jitter->pointer_timestamp)) /* when this test holds, current_timestamp is snapped to pointer_timestamp and a warning is issued */
+   {
+      jitter->current_timestamp = jitter->pointer_timestamp;
+      speex_warning("did you forget to call jitter_buffer_tick() by any chance?");
+   }
+   /*fprintf (stderr, "get packet %d %d\n", jitter->pointer_timestamp, jitter->current_timestamp);*/
+
+   /* FIXME: This should be only what remains of the current tick */
+   late_ratio_short = 0;
+   late_ratio_long = 0;
+   /* Count the proportion of packets that are late (bins 0 .. LATE_BINS-1) */
+   for (i=0;i<LATE_BINS;i++)
+   {
+      late_ratio_short += jitter->shortterm_margin[i];
+      late_ratio_long += jitter->longterm_margin[i];
+   }
+   /* Count the proportion of packets that are just on time (the single bin at index LATE_BINS) */
+   ontime_ratio_short = jitter->shortterm_margin[LATE_BINS];
+   ontime_ratio_long = jitter->longterm_margin[LATE_BINS];
+   early_ratio_short = early_ratio_long = 0;
+   /* Count the proportion of packets that are early (bins LATE_BINS+1 .. MAX_MARGIN-1) */
+   for (i=LATE_BINS+1;i<MAX_MARGIN;i++)
+   {
+      early_ratio_short += jitter->shortterm_margin[i];
+      early_ratio_long += jitter->longterm_margin[i];
+   }
+   
+   /* Adjusting the buffering based on the amount of packets that are early/on time/late */   
+   if (late_ratio_short > .1 || late_ratio_long > .03)
+   {
+      /* If too many packets are arriving late: merge the top two margin bins, shift every other bin up by one index, clear bin 0, and move both timestamps back by one tick */
+      jitter->shortterm_margin[MAX_MARGIN-1] += jitter->shortterm_margin[MAX_MARGIN-2];
+      jitter->longterm_margin[MAX_MARGIN-1] += jitter->longterm_margin[MAX_MARGIN-2];
+      for (i=MAX_MARGIN-3;i>=0;i--)
+      {
+         jitter->shortterm_margin[i+1] = jitter->shortterm_margin[i];
+         jitter->longterm_margin[i+1] = jitter->longterm_margin[i];         
+      }
+      jitter->shortterm_margin[0] = 0;
+      jitter->longterm_margin[0] = 0;            
+      jitter->pointer_timestamp -= jitter->tick_size;
+      jitter->current_timestamp -= jitter->tick_size;
+      jitter->interp_requested = 1;
+      return JITTER_BUFFER_ADJUST_INTERPOLATE;
+   
+   } else if (late_ratio_short + ontime_ratio_short < .005 && late_ratio_long + ontime_ratio_long < .01 && early_ratio_short > .8)
+   {
+      /* Many frames arriving early: merge the bottom two margin bins, shift every other bin down by one index, clear the top bin, and advance both timestamps by one tick */
+      jitter->shortterm_margin[0] += jitter->shortterm_margin[1];
+      jitter->longterm_margin[0] += jitter->longterm_margin[1];
+      for (i=1;i<MAX_MARGIN-1;i++)
+      {
+         jitter->shortterm_margin[i] = jitter->shortterm_margin[i+1];
+         jitter->longterm_margin[i] = jitter->longterm_margin[i+1];         
+      }
+      jitter->shortterm_margin[MAX_MARGIN-1] = 0;
+      jitter->longterm_margin[MAX_MARGIN-1] = 0;      
+      /*fprintf (stderr, "drop frame\n");*/
+      /*fprintf (stderr, "d");*/
+      jitter->pointer_timestamp += jitter->tick_size;
+      jitter->current_timestamp += jitter->tick_size;
+      return JITTER_BUFFER_ADJUST_DROP;
+   }
+   
+   return JITTER_BUFFER_ADJUST_OK;
+}
 
+/* Used like the ioctl function to control the jitter buffer parameters. For every request handled here ptr is read or written as a spx_int32_t. Returns 0 when the request was handled, or -1 (after emitting a warning) when the request code is not recognised. */
+int jitter_buffer_ctl(JitterBuffer *jitter, int request, void *ptr)
+{
+   int count, i;
+   switch(request)
+   {
+      case JITTER_BUFFER_SET_MARGIN: /* store *ptr as the new buffer_margin */
+         jitter->buffer_margin = *(spx_int32_t*)ptr;
+         break;
+      case JITTER_BUFFER_GET_MARGIN: /* report the current buffer_margin through ptr */
+         *(spx_int32_t*)ptr = jitter->buffer_margin;
+         break;
+      case JITTER_BUFFER_GET_AVALIABLE_COUNT: /* count occupied slots for which LE32(pointer_timestamp, timestamp) holds -- presumably packets not yet behind the playback pointer */
+         count = 0;
+         for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
+         {
+            if (jitter->buf[i] && LE32(jitter->pointer_timestamp, jitter->timestamp[i]))
+            {
+               count++;
+            }
+         }
+         *(spx_int32_t*)ptr = count;
+         break;
+      default: /* unknown request: warn and signal failure to the caller */
+         speex_warning_int("Unknown jitter_buffer_ctl request: ", request);
+         return -1;
+   }
+   return 0;
+}
 
 
 
@@ -499,6 +617,7 @@ void speex_jitter_get(SpeexJitter *jitter, short *out, int *current_timestamp)
             out[i]=0;
       }
    }
+   jitter_buffer_update_delay(jitter->packets, &packet, NULL);
    jitter_buffer_tick(jitter->packets);
 }
 
diff --git a/libspeex/kiss_fft.c b/libspeex/kiss_fft.c
index a0b3724..775a257 100644
--- a/libspeex/kiss_fft.c
+++ b/libspeex/kiss_fft.c
@@ -1,5 +1,6 @@
 /*
 Copyright (c) 2003-2004, Mark Borgerding
+Copyright (c) 2005-2007, Jean-Marc Valin
 
 All rights reserved.
 
@@ -24,121 +25,142 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  fixed or floating point complex numbers.  It also delares the kf_ internal functions.
  */
 
-static kiss_fft_cpx *scratchbuf=NULL;
-static size_t nscratchbuf=0;
-static kiss_fft_cpx *tmpbuf=NULL;
-static size_t ntmpbuf=0;
-
-#define CHECKBUF(buf,nbuf,n) \
-    do { \
-        if ( nbuf < (size_t)(n) ) {\
-            speex_free(buf); \
-            buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(sizeof(kiss_fft_cpx)*(n)); \
-            nbuf = (size_t)(n); \
-        } \
-   }while(0)
-        
 static void kf_bfly2(
         kiss_fft_cpx * Fout,
         const size_t fstride,
         const kiss_fft_cfg st,
-        int m
+        int m,
+        int N,
+        int mm
         )
 {
     kiss_fft_cpx * Fout2;
-    kiss_fft_cpx * tw1 = st->twiddles;
+    kiss_fft_cpx * tw1;
     kiss_fft_cpx t;
-    Fout2 = Fout + m;
     if (!st->inverse) {
-       int i;
-       kiss_fft_cpx *x=Fout;
-       for (i=0;i<2*m;i++)
+       int i,j;
+       kiss_fft_cpx * Fout_beg = Fout;
+       for (i=0;i<N;i++)
+       {
+          Fout = Fout_beg + i*mm;
+          Fout2 = Fout + m;
+          tw1 = st->twiddles;
+          for(j=0;j<m;j++)
+          {
+             /* Almost the same as the code path below, except that we divide the input by two
+              (while keeping the best accuracy possible) */
+             spx_word32_t tr, ti;
+             tr = SHR32(SUB32(MULT16_16(Fout2->r , tw1->r),MULT16_16(Fout2->i , tw1->i)), 1);
+             ti = SHR32(ADD32(MULT16_16(Fout2->i , tw1->r),MULT16_16(Fout2->r , tw1->i)), 1);
+             tw1 += fstride;
+             Fout2->r = PSHR32(SUB32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
+             Fout2->i = PSHR32(SUB32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
+             Fout->r = PSHR32(ADD32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
+             Fout->i = PSHR32(ADD32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
+             ++Fout2;
+             ++Fout;
+          }
+       }
+    } else {
+       int i,j;
+       kiss_fft_cpx * Fout_beg = Fout;
+       for (i=0;i<N;i++)
        {
-          x[i].r = SHR(x[i].r,1);
-          x[i].i = SHR(x[i].i,1);
+          Fout = Fout_beg + i*mm;
+          Fout2 = Fout + m;
+          tw1 = st->twiddles;
+          for(j=0;j<m;j++)
+          {
+             C_MUL (t,  *Fout2 , *tw1);
+             tw1 += fstride;
+             C_SUB( *Fout2 ,  *Fout , t );
+             C_ADDTO( *Fout ,  t );
+             ++Fout2;
+             ++Fout;
+          }
        }
     }
-
-    do{
-        C_MUL (t,  *Fout2 , *tw1);
-        tw1 += fstride;
-        C_SUB( *Fout2 ,  *Fout , t );
-        C_ADDTO( *Fout ,  t );
-        ++Fout2;
-        ++Fout;
-    }while (--m);
 }
 
 static void kf_bfly4(
         kiss_fft_cpx * Fout,
         const size_t fstride,
         const kiss_fft_cfg st,
-        const size_t m
+        const size_t m,
+        int N,
+        int mm
         )
 {
     kiss_fft_cpx *tw1,*tw2,*tw3;
     kiss_fft_cpx scratch[6];
-    size_t k=m;
     const size_t m2=2*m;
     const size_t m3=3*m;
+    int i, j;
 
-    tw3 = tw2 = tw1 = st->twiddles;
-
-    if (!st->inverse) {
-       int i;
-       kiss_fft_cpx *x=Fout;
-       for (i=0;i<4*m;i++)
-       {
-          x[i].r = PSHR16(x[i].r,2);
-          x[i].i = PSHR16(x[i].i,2);
-       }
-    }
     if (st->inverse)
     {
-       do {
-          C_MUL(scratch[0],Fout[m] , *tw1 );
-          C_MUL(scratch[1],Fout[m2] , *tw2 );
-          C_MUL(scratch[2],Fout[m3] , *tw3 );
-          
-          C_SUB( scratch[5] , *Fout, scratch[1] );
-          C_ADDTO(*Fout, scratch[1]);
-          C_ADD( scratch[3] , scratch[0] , scratch[2] );
-          C_SUB( scratch[4] , scratch[0] , scratch[2] );
-          C_SUB( Fout[m2], *Fout, scratch[3] );
-          tw1 += fstride;
-          tw2 += fstride*2;
-          tw3 += fstride*3;
-          C_ADDTO( *Fout , scratch[3] );
-
-          Fout[m].r = scratch[5].r - scratch[4].i;
-          Fout[m].i = scratch[5].i + scratch[4].r;
-          Fout[m3].r = scratch[5].r + scratch[4].i;
-          Fout[m3].i = scratch[5].i - scratch[4].r;
-          ++Fout;
-       } while(--k);
+       kiss_fft_cpx * Fout_beg = Fout;
+       for (i=0;i<N;i++)
+       {
+          Fout = Fout_beg + i*mm;
+          tw3 = tw2 = tw1 = st->twiddles;
+          for (j=0;j<m;j++)
+          {
+             C_MUL(scratch[0],Fout[m] , *tw1 );
+             C_MUL(scratch[1],Fout[m2] , *tw2 );
+             C_MUL(scratch[2],Fout[m3] , *tw3 );
+             
+             C_SUB( scratch[5] , *Fout, scratch[1] );
+             C_ADDTO(*Fout, scratch[1]);
+             C_ADD( scratch[3] , scratch[0] , scratch[2] );
+             C_SUB( scratch[4] , scratch[0] , scratch[2] );
+             C_SUB( Fout[m2], *Fout, scratch[3] );
+             tw1 += fstride;
+             tw2 += fstride*2;
+             tw3 += fstride*3;
+             C_ADDTO( *Fout , scratch[3] );
+             
+             Fout[m].r = scratch[5].r - scratch[4].i;
+             Fout[m].i = scratch[5].i + scratch[4].r;
+             Fout[m3].r = scratch[5].r + scratch[4].i;
+             Fout[m3].i = scratch[5].i - scratch[4].r;
+             ++Fout;
+          }
+       }
     } else
     {
-       do {
-          C_MUL(scratch[0],Fout[m] , *tw1 );
-          C_MUL(scratch[1],Fout[m2] , *tw2 );
-          C_MUL(scratch[2],Fout[m3] , *tw3 );
-          
-          C_SUB( scratch[5] , *Fout, scratch[1] );
-          C_ADDTO(*Fout, scratch[1]);
-          C_ADD( scratch[3] , scratch[0] , scratch[2] );
-          C_SUB( scratch[4] , scratch[0] , scratch[2] );
-          C_SUB( Fout[m2], *Fout, scratch[3] );
-          tw1 += fstride;
-          tw2 += fstride*2;
-          tw3 += fstride*3;
-          C_ADDTO( *Fout , scratch[3] );
-          
-          Fout[m].r = scratch[5].r + scratch[4].i;
-          Fout[m].i = scratch[5].i - scratch[4].r;
-          Fout[m3].r = scratch[5].r - scratch[4].i;
-          Fout[m3].i = scratch[5].i + scratch[4].r;
-          ++Fout;
-       }while(--k);
+       kiss_fft_cpx * Fout_beg = Fout;
+       for (i=0;i<N;i++)
+       {
+          Fout = Fout_beg + i*mm;
+          tw3 = tw2 = tw1 = st->twiddles;
+          for (j=0;j<m;j++)
+          {
+             C_MUL4(scratch[0],Fout[m] , *tw1 );
+             C_MUL4(scratch[1],Fout[m2] , *tw2 );
+             C_MUL4(scratch[2],Fout[m3] , *tw3 );
+             
+             Fout->r = PSHR16(Fout->r, 2);
+             Fout->i = PSHR16(Fout->i, 2);
+             C_SUB( scratch[5] , *Fout, scratch[1] );
+             C_ADDTO(*Fout, scratch[1]);
+             C_ADD( scratch[3] , scratch[0] , scratch[2] );
+             C_SUB( scratch[4] , scratch[0] , scratch[2] );
+             Fout[m2].r = PSHR16(Fout[m2].r, 2);
+             Fout[m2].i = PSHR16(Fout[m2].i, 2);
+             C_SUB( Fout[m2], *Fout, scratch[3] );
+             tw1 += fstride;
+             tw2 += fstride*2;
+             tw3 += fstride*3;
+             C_ADDTO( *Fout , scratch[3] );
+             
+             Fout[m].r = scratch[5].r + scratch[4].i;
+             Fout[m].i = scratch[5].i - scratch[4].r;
+             Fout[m3].r = scratch[5].r - scratch[4].i;
+             Fout[m3].i = scratch[5].i + scratch[4].r;
+             ++Fout;
+          }
+       }
     }
 }
 
@@ -263,10 +285,13 @@ static void kf_bfly_generic(
     int u,k,q1,q;
     kiss_fft_cpx * twiddles = st->twiddles;
     kiss_fft_cpx t;
+    kiss_fft_cpx scratchbuf[17];
     int Norig = st->nfft;
 
-    CHECKBUF(scratchbuf,nscratchbuf,p);
-
+    /*CHECKBUF(scratchbuf,nscratchbuf,p);*/
+    if (p>17)
+       speex_error("KissFFT: max radix supported is 17");
+    
     for ( u=0; u<m; ++u ) {
         k=u;
         for ( q1=0 ; q1<p ; ++q1 ) {
@@ -291,6 +316,39 @@ static void kf_bfly_generic(
         }
     }
 }
+               
+static /* kf_shuffle: reorder-only pass that copies the input f into Fout by recursing over the factor list; no butterflies are computed here (kiss_fft_stride calls it just before kf_work) */
+void kf_shuffle(
+         kiss_fft_cpx * Fout, /* destination buffer, filled in the order produced by the recursion */
+         const kiss_fft_cpx * f, /* source samples, stepped through fstride*in_stride elements at a time */
+         const size_t fstride, /* current stride multiplier; multiplied by the radix at each recursion level */
+         int in_stride, /* caller-supplied input stride, passed unchanged to every level */
+         int * factors, /* (radix, remaining length) pairs; one pair is consumed per recursion level */
+         const kiss_fft_cfg st /* only forwarded to the recursive call; not otherwise read here */
+            )
+{
+   const int p=*factors++; /* the radix  */
+   const int m=*factors++; /* stage's fft length/p */
+   
+    /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
+   if (m==1) /* innermost stage: copy p input samples, fstride*in_stride apart, into consecutive outputs */
+   {
+      int j;
+      for (j=0;j<p;j++)
+      {
+         Fout[j] = *f;
+         f += fstride*in_stride;
+      }
+   } else { /* otherwise recurse once per radix branch; each call fills the next m outputs, starting one input step further on */
+      int j;
+      for (j=0;j<p;j++)
+      {
+         kf_shuffle( Fout , f, fstride*p, in_stride, factors,st);
+         f += fstride*in_stride;
+         Fout += m;
+      }
+   }
+}
 
 static
 void kf_work(
@@ -299,24 +357,34 @@ void kf_work(
         const size_t fstride,
         int in_stride,
         int * factors,
-        const kiss_fft_cfg st
+        const kiss_fft_cfg st,
+        int N,
+        int s2,
+        int m2
         )
 {
+   int i;
     kiss_fft_cpx * Fout_beg=Fout;
     const int p=*factors++; /* the radix  */
     const int m=*factors++; /* stage's fft length/p */
-    const kiss_fft_cpx * Fout_end = Fout + p*m;
-
-    if (m==1) {
-        do{
-            *Fout = *f;
-            f += fstride*in_stride;
-        }while(++Fout != Fout_end );
-    }else{
-        do{
-            kf_work( Fout , f, fstride*p, in_stride, factors,st);
-            f += fstride*in_stride;
-        }while( (Fout += m) != Fout_end );
+#if 0
+    /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
+    if (m==1)
+    {
+    /*   int j;
+       for (j=0;j<p;j++)
+       {
+          Fout[j] = *f;
+          f += fstride*in_stride;
+       }*/
+    } else {
+       int j;
+       for (j=0;j<p;j++)
+       {
+          kf_work( Fout , f, fstride*p, in_stride, factors,st, N*p, fstride*in_stride, m);
+          f += fstride*in_stride;
+          Fout += m;
+       }
     }
 
     Fout=Fout_beg;
@@ -328,6 +396,36 @@ void kf_work(
         case 5: kf_bfly5(Fout,fstride,st,m); break; 
         default: kf_bfly_generic(Fout,fstride,st,m,p); break;
     }
+#else
+    /*printf ("fft %d %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N, m2);*/
+    if (m==1) 
+    {
+       /*for (i=0;i<N;i++)
+       {
+          int j;
+          Fout = Fout_beg+i*m2;
+          const kiss_fft_cpx * f2 = f+i*s2;
+          for (j=0;j<p;j++)
+          {
+             *Fout++ = *f2;
+             f2 += fstride*in_stride;
+          }
+       }*/
+    }else{
+       kf_work( Fout , f, fstride*p, in_stride, factors,st, N*p, fstride*in_stride, m);
+    }
+
+    
+       
+       
+       switch (p) {
+          case 2: kf_bfly2(Fout,fstride,st,m, N, m2); break;
+          case 3: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; kf_bfly3(Fout,fstride,st,m);} break; 
+          case 4: kf_bfly4(Fout,fstride,st,m, N, m2); break;
+          case 5: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; kf_bfly5(Fout,fstride,st,m);} break; 
+          default: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; kf_bfly_generic(Fout,fstride,st,m,p);} break;
+    }    
+#endif
 }
 
 /*  facbuf is populated by p1,m1,p2,m2, ...
@@ -338,8 +436,6 @@ static
 void kf_factor(int n,int * facbuf)
 {
     int p=4;
-    double floor_sqrt;
-    floor_sqrt = floor( sqrt((double)n) );
 
     /*factor out powers of 4, powers of 2, then any remaining primes */
     do {
@@ -349,7 +445,7 @@ void kf_factor(int n,int * facbuf)
                 case 2: p = 3; break;
                 default: p += 2; break;
             }
-            if (p > floor_sqrt)
+            if (p>32000 || (spx_int32_t)p*(spx_int32_t)p > n)
                 p = n;          /* no more factors, skip to end */
         }
         n /= p;
@@ -357,7 +453,6 @@ void kf_factor(int n,int * facbuf)
         *facbuf++ = n;
     } while (n > 1);
 }
-
 /*
  *
  * User-callable function to allocate all necessary storage space for the fft.
@@ -382,15 +477,22 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem
         int i;
         st->nfft=nfft;
         st->inverse = inverse_fft;
-
+#ifdef FIXED_POINT
         for (i=0;i<nfft;++i) {
-            const double pi=3.14159265358979323846264338327;
-            double phase = ( -2*pi /nfft ) * i;
-            if (st->inverse)
-                phase *= -1;
-            kf_cexp(st->twiddles+i, phase );
+            spx_word32_t phase = i;
+            if (!st->inverse)
+                phase = -phase;
+            kf_cexp2(st->twiddles+i, DIV32(SHL32(phase,17),nfft));
         }
-
+#else
+        for (i=0;i<nfft;++i) {
+           const double pi=3.14159265358979323846264338327;
+           double phase = ( -2*pi /nfft ) * i;
+           if (st->inverse)
+              phase *= -1;
+           kf_cexp(st->twiddles+i, phase );
+        }
+#endif
         kf_factor(nfft,st->factors);
     }
     return st;
@@ -401,12 +503,15 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem
     
 void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
 {
-    if (fin == fout) {
-        CHECKBUF(tmpbuf,ntmpbuf,st->nfft);
-        kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
-        speex_move(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
-    }else{
-        kf_work( fout, fin, 1,in_stride, st->factors,st );
+    if (fin == fout) 
+    {
+       speex_error("In-place FFT not supported");
+       /*CHECKBUF(tmpbuf,ntmpbuf,st->nfft);
+       kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
+       speex_move(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);*/
+    } else {
+       kf_shuffle( fout, fin, 1,in_stride, st->factors,st);
+       kf_work( fout, fin, 1,in_stride, st->factors,st, 1, in_stride, 1);
     }
 }
 
@@ -415,16 +520,3 @@ void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
     kiss_fft_stride(cfg,fin,fout,1);
 }
 
-
-/* not really necessary to call, but if someone is doing in-place ffts, they may want to free the 
-   buffers from CHECKBUF
- */ 
-void kiss_fft_cleanup(void)
-{
-    speex_free(scratchbuf);
-    scratchbuf = NULL;
-    nscratchbuf=0;
-    speex_free(tmpbuf);
-    tmpbuf=NULL;
-    ntmpbuf=0;
-}
diff --git a/libspeex/kiss_fftr.c b/libspeex/kiss_fftr.c
index b90b725..392945c 100644
--- a/libspeex/kiss_fftr.c
+++ b/libspeex/kiss_fftr.c
@@ -58,13 +58,22 @@ kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenme
     st->super_twiddles = st->tmpbuf + nfft;
     kiss_fft_alloc(nfft, inverse_fft, st->substate, &subsize);
 
-    for (i = 0; i < nfft; ++i) {
-        double phase =
-            -3.14159265358979323846264338327 * ((double) i / nfft + .5);
-        if (inverse_fft)
-            phase *= -1;
-        kf_cexp (st->super_twiddles+i,phase);
+#ifdef FIXED_POINT
+    for (i=0;i<nfft;++i) {
+       spx_word32_t phase = i+(nfft>>1);
+       if (!inverse_fft)
+          phase = -phase;
+       kf_cexp2(st->super_twiddles+i, DIV32(SHL32(phase,16),nfft));
     }
+#else
+    for (i=0;i<nfft;++i) {
+       const double pi=3.14159265358979323846264338327;
+       double phase = pi*(((double)i) /nfft + .5);
+       if (!inverse_fft)
+          phase = -phase;
+       kf_cexp(st->super_twiddles+i, phase );
+    }
+#endif
     return st;
 }
 
@@ -75,8 +84,7 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
     kiss_fft_cpx fpnk,fpk,f1k,f2k,tw,tdc;
 
     if ( st->substate->inverse) {
-        speex_warning("kiss fft usage error: improper alloc\n");
-        exit(1);
+        speex_error("kiss fft usage error: improper alloc\n");
     }
 
     ncfft = st->substate->nfft;
@@ -124,14 +132,13 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
     }
 }
 
-void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata)
+void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata, kiss_fft_scalar *timedata)
 {
     /* input buffer timedata is stored row-wise */
     int k, ncfft;
 
     if (st->substate->inverse == 0) {
-        speex_warning ("kiss fft usage error: improper alloc\n");
-        exit (1);
+        speex_error ("kiss fft usage error: improper alloc\n");
     }
 
     ncfft = st->substate->nfft;
@@ -161,3 +168,129 @@ void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *t
     }
     kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
 }
+
+void kiss_fftr2(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_scalar *freqdata)
+{
+   /* input buffer timedata is stored row-wise; output freqdata is a packed real array: [0] is the DC term, [2*ncfft-1] the Nyquist term, and [2*k-1]/[2*k] are built from the real/imaginary parts for bin k */
+   int k,ncfft;
+   kiss_fft_cpx f2k,tdc;
+   spx_word32_t f1kr, f1ki, twr, twi;
+
+   if ( st->substate->inverse) {
+      speex_error("kiss fft usage error: improper alloc\n");
+   }
+
+   ncfft = st->substate->nfft;
+
+   /*perform the parallel fft of two real signals packed in real,imag*/
+   kiss_fft( st->substate , (const kiss_fft_cpx*)timedata, st->tmpbuf );
+    /* The real part of the DC element of the frequency spectrum in st->tmpbuf
+   * contains the sum of the even-numbered elements of the input time sequence
+   * The imag part is the sum of the odd-numbered elements
+   *
+   * The sum of tdc.r and tdc.i is the sum of the input time sequence. 
+   *      yielding DC of input time sequence
+   * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1... 
+   *      yielding Nyquist bin of input time sequence
+    */
+ 
+   tdc.r = st->tmpbuf[0].r;
+   tdc.i = st->tmpbuf[0].i;
+   C_FIXDIV(tdc,2);
+   CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i);
+   CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
+   freqdata[0] = tdc.r + tdc.i;
+   freqdata[2*ncfft-1] = tdc.r - tdc.i;
+
+   for ( k=1;k <= ncfft/2 ; ++k )
+   {
+      /*fpk    = st->tmpbuf[k]; 
+      fpnk.r =   st->tmpbuf[ncfft-k].r;
+      fpnk.i = - st->tmpbuf[ncfft-k].i;
+      C_FIXDIV(fpk,2);
+      C_FIXDIV(fpnk,2);
+
+      C_ADD( f1k, fpk , fpnk );
+      C_SUB( f2k, fpk , fpnk );
+      
+      C_MUL( tw , f2k , st->super_twiddles[k]);
+
+      freqdata[2*k-1] = HALF_OF(f1k.r + tw.r);
+      freqdata[2*k] = HALF_OF(f1k.i + tw.i);
+      freqdata[2*(ncfft-k)-1] = HALF_OF(f1k.r - tw.r);
+      freqdata[2*(ncfft-k)] = HALF_OF(tw.i - f1k.i);
+      */
+
+      /*f1k.r = PSHR32(ADD32(EXTEND32(st->tmpbuf[k].r), EXTEND32(st->tmpbuf[ncfft-k].r)),1);
+      f1k.i = PSHR32(SUB32(EXTEND32(st->tmpbuf[k].i), EXTEND32(st->tmpbuf[ncfft-k].i)),1);
+      f2k.r = PSHR32(SUB32(EXTEND32(st->tmpbuf[k].r), EXTEND32(st->tmpbuf[ncfft-k].r)),1);
+      f2k.i = SHR32(ADD32(EXTEND32(st->tmpbuf[k].i), EXTEND32(st->tmpbuf[ncfft-k].i)),1);
+      
+      C_MUL( tw , f2k , st->super_twiddles[k]);
+
+      freqdata[2*k-1] = HALF_OF(f1k.r + tw.r);
+      freqdata[2*k] = HALF_OF(f1k.i + tw.i);
+      freqdata[2*(ncfft-k)-1] = HALF_OF(f1k.r - tw.r);
+      freqdata[2*(ncfft-k)] = HALF_OF(tw.i - f1k.i);
+   */
+      f2k.r = SHR32(SUB32(EXTEND32(st->tmpbuf[k].r), EXTEND32(st->tmpbuf[ncfft-k].r)),1); /* f2k = tmpbuf[k] - conj(tmpbuf[ncfft-k]), passed through a shift-right-by-one macro */
+      f2k.i = PSHR32(ADD32(EXTEND32(st->tmpbuf[k].i), EXTEND32(st->tmpbuf[ncfft-k].i)),1);
+      
+      f1kr = SHL32(ADD32(EXTEND32(st->tmpbuf[k].r), EXTEND32(st->tmpbuf[ncfft-k].r)),13); /* f1k = tmpbuf[k] + conj(tmpbuf[ncfft-k]), held in spx_word32_t and passed through a shift-left-by-13 macro */
+      f1ki = SHL32(SUB32(EXTEND32(st->tmpbuf[k].i), EXTEND32(st->tmpbuf[ncfft-k].i)),13);
+      
+      twr = SHR32(SUB32(MULT16_16(f2k.r,st->super_twiddles[k].r),MULT16_16(f2k.i,st->super_twiddles[k].i)), 1); /* tw = f2k * super_twiddles[k] (complex product), passed through a shift-right-by-one macro */
+      twi = SHR32(ADD32(MULT16_16(f2k.i,st->super_twiddles[k].r),MULT16_16(f2k.r,st->super_twiddles[k].i)), 1);
+      
+#ifdef FIXED_POINT /* fixed-point build: bring the 32-bit sums back down with a 15-bit PSHR32 */
+      freqdata[2*k-1] = PSHR32(f1kr + twr, 15);
+      freqdata[2*k] = PSHR32(f1ki + twi, 15);
+      freqdata[2*(ncfft-k)-1] = PSHR32(f1kr - twr, 15);
+      freqdata[2*(ncfft-k)] = PSHR32(twi - f1ki, 15);
+#else /* other builds: halve the sums instead (presumably the shift macros are no-ops for float -- TODO confirm) */
+      freqdata[2*k-1] = .5f*(f1kr + twr);
+      freqdata[2*k] = .5f*(f1ki + twi);
+      freqdata[2*(ncfft-k)-1] = .5f*(f1kr - twr);
+      freqdata[2*(ncfft-k)] = .5f*(twi - f1ki);
+      
+#endif
+   }
+}
+
+void kiss_fftri2(kiss_fftr_cfg st,const kiss_fft_scalar *freqdata,kiss_fft_scalar *timedata)
+{
+   /* Inverse counterpart of kiss_fftr2. Input freqdata is a packed real array: [0] and [2*ncfft-1] are the two stand-alone real entries (written as DC and Nyquist by kiss_fftr2), and [2*k-1]/[2*k] hold the real/imaginary parts of bin k. Output timedata is stored row-wise. */
+   int k, ncfft;
+
+   if (st->substate->inverse == 0) {
+      speex_error ("kiss fft usage error: improper alloc\n");
+   }
+
+   ncfft = st->substate->nfft;
+
+   st->tmpbuf[0].r = freqdata[0] + freqdata[2*ncfft-1];
+   st->tmpbuf[0].i = freqdata[0] - freqdata[2*ncfft-1];
+   /*C_FIXDIV(st->tmpbuf[0],2);*/
+
+   for (k = 1; k <= ncfft / 2; ++k) {
+      kiss_fft_cpx fk, fnkc, fek, fok, tmp;
+      fk.r = freqdata[2*k-1];
+      fk.i = freqdata[2*k];
+      fnkc.r = freqdata[2*(ncfft - k)-1];
+      fnkc.i = -freqdata[2*(ncfft - k)]; /* fnkc is the complex conjugate of bin ncfft-k */
+        /*C_FIXDIV( fk , 2 );
+      C_FIXDIV( fnkc , 2 );*/
+
+      C_ADD (fek, fk, fnkc);
+      C_SUB (tmp, fk, fnkc);
+      C_MUL (fok, tmp, st->super_twiddles[k]);
+      C_ADD (st->tmpbuf[k],     fek, fok);
+      C_SUB (st->tmpbuf[ncfft - k], fek, fok);
+#ifdef USE_SIMD        
+      st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0);
+#else
+      st->tmpbuf[ncfft - k].i *= -1; /* negate the imaginary part, i.e. store the conjugate of (fek - fok) */
+#endif
+   }
+   kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata); /* complex inverse FFT of the rebuilt half-length spectrum, written straight into timedata */
+}
diff --git a/libspeex/kiss_fftr.h b/libspeex/kiss_fftr.h
index 2e8351a..7bfb423 100644
--- a/libspeex/kiss_fftr.h
+++ b/libspeex/kiss_fftr.h
@@ -32,7 +32,12 @@ void kiss_fftr(kiss_fftr_cfg cfg,const kiss_fft_scalar *timedata,kiss_fft_cpx *f
  output freqdata has nfft/2+1 complex points
 */
 
+void kiss_fftr2(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_scalar *freqdata);
+
 void kiss_fftri(kiss_fftr_cfg cfg,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata);
+
+void kiss_fftri2(kiss_fftr_cfg st,const kiss_fft_scalar *freqdata, kiss_fft_scalar *timedata);
+
 /*
  input freqdata has  nfft/2+1 complex points
  output timedata has nfft scalar points
diff --git a/libspeex/lbr_48k_tables.c b/libspeex/lbr_48k_tables.c
index 2e6db3f..d4d80dc 100644
--- a/libspeex/lbr_48k_tables.c
+++ b/libspeex/lbr_48k_tables.c
@@ -34,74 +34,74 @@
 #endif
 
 
-int dummy_epic_48k_variable=0;
+const int dummy_epic_48k_variable=0;
 #ifdef EPIC_48K
 
-const signed char gain_cdbk_ulbr[192] = {
--31, -48, -30, 
--19, -10, -18, 
--33, -22, -45, 
--5, -56, -43, 
--30, -56, -3, 
--59, -17, -52, 
--41, -60, -58, 
--64, -47, -22, 
--30, -31, -31, 
--29, -14, -31, 
--22, -37, -58, 
--31, -44, 13, 
--37, 0, 1, 
--46, -55, -35, 
--56, -14, -53, 
--8, 1, -36, 
--29, -15, -27, 
--29, -39, -28, 
--43, -5, 3, 
--51, -27, -54, 
-10, -46, -36, 
-3, -3, -42, 
--27, 16, -22, 
--34, -52, 13, 
--31, -21, -28, 
--34, -45, -40, 
--20, -48, 4, 
--40, -27, 16, 
--6, 11, -44, 
--35, 12, -5, 
-19, -33, -37, 
--29, 18, -32, 
--29, -23, -19, 
-16, -47, -28, 
--34, -30, 17, 
--20, 2, -26, 
--38, -40, -36, 
-15, -14, -40, 
--39, 14, -9, 
--15, 25, -39, 
--26, 19, -32, 
--39, 17, -14, 
-10, -36, -26, 
-14, -13, -40, 
--29, -21, -12, 
--8, 19, -39, 
--36, -18, 15, 
--32, -38, -38, 
--19, 4, -23, 
--38, -7, 11, 
-9, -10, -39, 
--37, 24, -19, 
--34, -5, -8, 
--20, 23, -41, 
--4, 17, -31, 
--17, -26, -26, 
--24, 28, -36, 
--7, 15, -39, 
--42, 16, -11, 
--29, 14, -6, 
--36, 28, -27, 
--21, 5, -26, 
-11, -9, -39, 
--38, -7, 13, 
+const signed char gain_cdbk_ulbr[256] = {
+-31, -48, -30, 10,
+-19, -10, -18, 25,
+-33, -22, -45, 12,
+-5, -56, -43, 31,
+-30, -56, -3, 28,
+-59, -17, -52, 31,
+-41, -60, -58, 32,
+-64, -47, -22, 29,
+-30, -31, -31, 2,
+-29, -14, -31, 11,
+-22, -37, -58, 21,
+-31, -44, 13, 29,
+-37, 0, 1, 35,
+-46, -55, -35, 20, 
+-56, -14, -53, 32,
+-8, 1, -36, 31,
+-29, -15, -27, 13,
+-29, -39, -28, 7,
+-43, -5, 3, 37,
+-51, -27, -54, 23,
+10, -46, -36, 30,
+3, -3, -42, 37,
+-27, 16, -22, 32,
+-34, -52, 13, 34,
+-31, -21, -28, 8,
+-34, -45, -40, 12,
+-20, -48, 4, 32,
+-40, -27, 16, 31,
+-6, 11, -44, 41,
+-35, 12, -5, 37,
+19, -33, -37, 29,
+-29, 18, -32, 27,
+-29, -23, -19, 13,
+16, -47, -28, 34,
+-34, -30, 17, 27,
+-20, 2, -26, 26,
+-38, -40, -36, 9,
+15, -14, -40, 37,
+-39, 14, -9, 38,
+-15, 25, -39, 41,
+-26, 19, -32, 29,
+-39, 17, -14, 37,
+10, -36, -26, 26,
+14, -13, -40, 37,
+-29, -21, -12, 17,
+-8, 19, -39, 41,
+-36, -18, 15, 33,
+-32, -38, -38, 6,
+-19, 4, -23, 29,
+-38, -7, 11, 37,
+9, -10, -39, 35,
+-37, 24, -19, 37,
+-34, -5, -8, 27,
+-20, 23, -41, 38,
+-4, 17, -31, 39,
+-17, -26, -26, 14,
+-24, 28, -36, 36,
+-7, 15, -39, 40,
+-42, 16, -11, 40,
+-29, 14, -6, 38,
+-36, 28, -27, 35,
+-21, 5, -26, 27,
+11, -9, -39, 37,
+-38, -7, 13, 38
 };
 
 
diff --git a/libspeex/lsp.c b/libspeex/lsp.c
index 3fdc08a..a73d883 100644
--- a/libspeex/lsp.c
+++ b/libspeex/lsp.c
@@ -509,7 +509,7 @@ void lsp_to_lpc(spx_lsp_t *freq,spx_coef_t *ak,int lpcrdr, char *stack)
       
       /* hard limit ak's to +/- 32767 */
 
-      if (a < -32767) a = 32767;
+      if (a < -32767) a = -32767;
       if (a > 32767) a = 32767;
       ak[j-1] = (short)a;
      
diff --git a/libspeex/ltp.c b/libspeex/ltp.c
index 27e4f4d..fa77da2 100644
--- a/libspeex/ltp.c
+++ b/libspeex/ltp.c
@@ -176,20 +176,56 @@ void open_loop_nbest_pitch(spx_word16_t *sw, int start, int end, int len, int *p
    VARDECL(spx_word32_t *best_ener);
    spx_word32_t e0;
    VARDECL(spx_word32_t *corr);
+#ifdef FIXED_POINT
+   /* In fixed-point, we need only one (temporary) array of 32-bit values and two (corr16, ener16) 
+      arrays for (normalized) 16-bit values */
+   VARDECL(spx_word16_t *corr16);
+   VARDECL(spx_word16_t *ener16);
+   spx_word32_t *energy;
+   int cshift=0, eshift=0;
+   int scaledown = 0;
+   ALLOC(corr16, end-start+1, spx_word16_t);
+   ALLOC(ener16, end-start+1, spx_word16_t);
+   ALLOC(corr, end-start+1, spx_word32_t);
+   energy = corr;
+#else
+   /* In floating-point, we need two float arrays and no normalized copies */
    VARDECL(spx_word32_t *energy);
-
+   spx_word16_t *corr16;
+   spx_word16_t *ener16;
+   ALLOC(energy, end-start+2, spx_word32_t);
+   ALLOC(corr, end-start+1, spx_word32_t);
+   corr16 = corr;
+   ener16 = energy;
+#endif
+   
    ALLOC(best_score, N, spx_word32_t);
    ALLOC(best_ener, N, spx_word32_t);
-   ALLOC(corr, end-start+1, spx_word32_t);
-   ALLOC(energy, end-start+2, spx_word32_t);
-
    for (i=0;i<N;i++)
    {
         best_score[i]=-1;
         best_ener[i]=0;
         pitch[i]=start;
    }
-
+   
+#ifdef FIXED_POINT
+   for (i=-end;i<len;i++)
+   {
+      if (ABS16(sw[i])>16383)
+      {
+         scaledown=1;
+         break;
+      }
+   }
+   /* If the weighted input is close to saturation, then we scale it down */
+   if (scaledown)
+   {
+      for (i=-end;i<len;i++)
+      {
+         sw[i]=SHR16(sw[i],1);
+      }
+   }      
+#endif
    energy[0]=inner_prod(sw-start, sw-start, len);
    e0=inner_prod(sw, sw, len);
    for (i=start;i<end;i++)
@@ -199,59 +235,42 @@ void open_loop_nbest_pitch(spx_word16_t *sw, int start, int end, int len, int *p
       if (energy[i-start+1] < 0)
          energy[i-start+1] = 0;
    }
-
+   
+#ifdef FIXED_POINT
+   eshift = normalize16(energy, ener16, 32766, end-start+1);
+#endif
+   
+   /* In fixed-point, this actually overwrites the energy array (aliased to corr) */
    pitch_xcorr(sw, sw-end, corr, len, end-start+1, stack);
-
-   /* FIXME: Fixed-point and floating-point code should be merged */
+   
 #ifdef FIXED_POINT
+   /* Normalize to 180 so we can square it and it still fits in 16 bits */
+   cshift = normalize16(corr, corr16, 180, end-start+1);
+   /* If we scaled weighted input down, we need to scale it up again (OK, so we've just lost the LSB, who cares?) */
+   if (scaledown)
    {
-      VARDECL(spx_word16_t *corr16);
-      VARDECL(spx_word16_t *ener16);
-      ALLOC(corr16, end-start+1, spx_word16_t);
-      ALLOC(ener16, end-start+1, spx_word16_t);
-      /* Normalize to 180 so we can square it and it still fits in 16 bits */
-      normalize16(corr, corr16, 180, end-start+1);
-      normalize16(energy, ener16, 180, end-start+1);
-
-      for (i=start;i<=end;i++)
+      for (i=-end;i<len;i++)
       {
-         spx_word16_t tmp = MULT16_16_16(corr16[i-start],corr16[i-start]);
-         /* Instead of dividing the tmp by the energy, we multiply on the other side */
-         if (MULT16_16(tmp,best_ener[N-1])>MULT16_16(best_score[N-1],ADD16(1,ener16[i-start])))
-         {
-            /* We can safely put it last and then check */
-            best_score[N-1]=tmp;
-            best_ener[N-1]=ener16[i-start]+1;
-            pitch[N-1]=i;
-            /* Check if it comes in front of others */
-            for (j=0;j<N-1;j++)
-            {
-               if (MULT16_16(tmp,best_ener[j])>MULT16_16(best_score[j],ADD16(1,ener16[i-start])))
-               {
-                  for (k=N-1;k>j;k--)
-                  {
-                     best_score[k]=best_score[k-1];
-                     best_ener[k]=best_ener[k-1];
-                     pitch[k]=pitch[k-1];
-                  }
-                  best_score[j]=tmp;
-                  best_ener[j]=ener16[i-start]+1;
-                  pitch[j]=i;
-                  break;
-               }
-            }
-         }
+         sw[i]=SHL16(sw[i],1);
       }
-   }
-#else
+   }      
+#endif
+
+   /* Search for the best pitch prediction gain */
    for (i=start;i<=end;i++)
    {
-      float tmp = corr[i-start]*corr[i-start];
-      if (tmp*best_ener[N-1]>best_score[N-1]*(1+energy[i-start]))
+      spx_word16_t tmp = MULT16_16_16(corr16[i-start],corr16[i-start]);
+      /* Instead of dividing the tmp by the energy, we multiply on the other side */
+      if (MULT16_16(tmp,best_ener[N-1])>MULT16_16(best_score[N-1],ADD16(1,ener16[i-start])))
       {
-         for (j=0;j<N;j++)
+         /* We can safely put it last and then check */
+         best_score[N-1]=tmp;
+         best_ener[N-1]=ener16[i-start]+1;
+         pitch[N-1]=i;
+         /* Check if it comes in front of others */
+         for (j=0;j<N-1;j++)
          {
-            if (tmp*best_ener[j]>best_score[j]*(1+energy[i-start]))
+            if (MULT16_16(tmp,best_ener[j])>MULT16_16(best_score[j],ADD16(1,ener16[i-start])))
             {
                for (k=N-1;k>j;k--)
                {
@@ -260,29 +279,30 @@ void open_loop_nbest_pitch(spx_word16_t *sw, int start, int end, int len, int *p
                   pitch[k]=pitch[k-1];
                }
                best_score[j]=tmp;
-               best_ener[j]=energy[i-start]+1;
+               best_ener[j]=ener16[i-start]+1;
                pitch[j]=i;
                break;
             }
          }
       }
    }
-#endif
-
-   /* Compute open-loop gain */
+   
+   /* Compute open-loop gain if necessary */
    if (gain)
    {
-       for (j=0;j<N;j++)
-       {
-          spx_word16_t g;
-          i=pitch[j];
-          g = DIV32(corr[i-start], 10+SHR32(MULT16_16(spx_sqrt(e0),spx_sqrt(energy[i-start])),6));
-          /* FIXME: g = max(g,corr/energy) */
-                   if (g<0)
-                   g = 0;
-             gain[j]=g;
-       }
+      for (j=0;j<N;j++)
+      {
+         spx_word16_t g;
+         i=pitch[j];
+         g = DIV32(SHL32(EXTEND32(corr16[i-start]),cshift), 10+SHR32(MULT16_16(spx_sqrt(e0),spx_sqrt(SHL32(EXTEND32(ener16[i-start]),eshift))),6));
+         /* FIXME: g = max(g,corr/energy) */
+         if (g<0)
+            g = 0;
+         gain[j]=g;
+      }
    }
+
+
 }
 #endif
 
@@ -342,7 +362,8 @@ const spx_word16_t *r,
 spx_word16_t *new_target,
 int  *cdbk_index,
 int plc_tuning,
-spx_word32_t cumul_gain
+spx_word32_t cumul_gain,
+int scaledown
 )
 {
    int i,j;
@@ -366,6 +387,9 @@ spx_word32_t cumul_gain
    x[1]=tmp1+nsf;
    x[2]=tmp1+2*nsf;
    
+   for (j=0;j<nsf;j++)
+      new_target[j] = target[j];
+
    {
       VARDECL(spx_mem_t *mm);
       int pp=pitch-1;
@@ -379,6 +403,16 @@ spx_word32_t cumul_gain
          else
             e[j]=0;
       }
+#ifdef FIXED_POINT
+      /* Scale target and excitation down if needed (avoiding overflow) */
+      if (scaledown)
+      {
+         for (j=0;j<nsf;j++)
+            e[j] = SHR16(e[j],1);
+         for (j=0;j<nsf;j++)
+            new_target[j] = SHR16(new_target[j],1);
+      }
+#endif
       for (j=0;j<p;j++)
          mm[j] = 0;
       iir_mem16(e, ak, e, nsf, p, mm, stack);
@@ -391,13 +425,18 @@ spx_word32_t cumul_gain
    for (i=1;i>=0;i--)
    {
       spx_word16_t e0=exc2[-pitch-1+i];
+#ifdef FIXED_POINT
+      /* Scale excitation down if needed (avoiding overflow) */
+      if (scaledown)
+         e0 = SHR16(e0,1);
+#endif
       x[i][0]=MULT16_16_Q14(r[0], e0);
       for (j=0;j<nsf-1;j++)
          x[i][j+1]=ADD32(x[i+1][j],MULT16_16_P14(r[j+1], e0));
    }
 
    for (i=0;i<3;i++)
-      corr[i]=inner_prod(x[i],target,nsf);
+      corr[i]=inner_prod(x[i],new_target,nsf);
    for (i=0;i<3;i++)
       for (j=0;j<=i;j++)
          A[i][j]=A[j][i]=inner_prod(x[i],x[j],nsf);
@@ -478,7 +517,7 @@ spx_word32_t cumul_gain
    {
       spx_word32_t tmp = ADD32(ADD32(MULT16_16(gain[0],x[2][i]),MULT16_16(gain[1],x[1][i])),
                             MULT16_16(gain[2],x[0][i]));
-      new_target[i] = SUB16(target[i], EXTRACT16(PSHR32(tmp,6)));
+      new_target[i] = SUB16(new_target[i], EXTRACT16(PSHR32(tmp,6)));
    }
    err = inner_prod(new_target, new_target, nsf);
 
@@ -520,7 +559,8 @@ spx_word32_t *cumul_gain
    const ltp_params *params;
    const signed char *gain_cdbk;
    int   gain_cdbk_size;
-   
+   int scaledown=0;
+         
    VARDECL(int *nbest);
    
    params = (const ltp_params*) par;
@@ -545,6 +585,25 @@ spx_word32_t *cumul_gain
       return start;
    }
    
+#ifdef FIXED_POINT
+   /* Check if we need to scale everything down in the pitch search to avoid overflows */
+   for (i=0;i<nsf;i++)
+   {
+      if (ABS16(target[i])>16383)
+      {
+         scaledown=1;
+         break;
+      }
+   }
+   for (i=-end;i<nsf;i++)
+   {
+      if (ABS16(exc2[i])>16383)
+      {
+         scaledown=1;
+         break;
+      }
+   }
+#endif
    if (N>end-start+1)
       N=end-start+1;
    if (end != start)
@@ -562,7 +621,7 @@ spx_word32_t *cumul_gain
       for (j=0;j<nsf;j++)
          exc[j]=0;
       err=pitch_gain_search_3tap(target, ak, awk1, awk2, exc, gain_cdbk, gain_cdbk_size, pitch, p, nsf,
-                                 bits, stack, exc2, r, new_target, &cdbk_index, plc_tuning, *cumul_gain);
+                                 bits, stack, exc2, r, new_target, &cdbk_index, plc_tuning, *cumul_gain, scaledown);
       if (err<best_err || best_err<0)
       {
          for (j=0;j<nsf;j++)
@@ -588,7 +647,14 @@ spx_word32_t *cumul_gain
       exc[i]=best_exc[i];
    for (i=0;i<nsf;i++)
       target[i]=best_target[i];
-
+#ifdef FIXED_POINT
+   /* Scale target back up if needed */
+   if (scaledown)
+   {
+      for (i=0;i<nsf;i++)
+         target[i]=SHL16(target[i],1);
+   }
+#endif
    return pitch;
 }
 
@@ -717,8 +783,8 @@ spx_word32_t *cumul_gain
 )
 {
    int i;
-   VARDECL(spx_sig_t *res);
-   ALLOC(res, nsf, spx_sig_t);
+   VARDECL(spx_word16_t *res);
+   ALLOC(res, nsf, spx_word16_t);
 #ifdef FIXED_POINT
    if (pitch_coef>63)
       pitch_coef=63;
@@ -734,9 +800,11 @@ spx_word32_t *cumul_gain
    {
       exc[i]=MULT16_32_Q15(SHL16(pitch_coef, 9),exc[i-start]);
    }
-   syn_percep_zero(exc, ak, awk1, awk2, res, nsf, p, stack);
    for (i=0;i<nsf;i++)
-      target[i]=EXTRACT16(SATURATE(SUB32(EXTEND32(target[i]),PSHR32(res[i],SIG_SHIFT-1)),32700));
+      res[i] = EXTRACT16(PSHR32(exc[i], SIG_SHIFT-1));
+   syn_percep_zero16(res, ak, awk1, awk2, res, nsf, p, stack);
+   for (i=0;i<nsf;i++)
+      target[i]=EXTRACT16(SATURATE(SUB32(EXTEND32(target[i]),EXTEND32(res[i])),32700));
    return start;
 }
 
@@ -770,7 +838,7 @@ int cdbk_offset
    for (i=0;i<nsf;i++)
    {
       exc_out[i]=MULT16_16(exc[i-start],SHL16(pitch_coef,7));
-      exc[i] = PSHR(exc_out[i],13);
+      exc[i] = EXTRACT16(PSHR32(exc_out[i],13));
    }
    *pitch_val = start;
    gain_val[0]=gain_val[2]=0;
diff --git a/libspeex/ltp_arm4.h b/libspeex/ltp_arm4.h
index 7479e8b..cdb94e6 100644
--- a/libspeex/ltp_arm4.h
+++ b/libspeex/ltp_arm4.h
@@ -75,9 +75,10 @@ spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
          "\tadd %2, %2, %7, asr #5\n"
          "\tadd %3, %3, %10, asr #5\n"
          "\tbne .inner_prod_loop%=\n"
-   : "=r" (deadx), "=r" (deady), "=r" (sum1),  "=r" (sum2), "=r" (deadlen),
-   "=r" (dead1), "=r" (dead2), "=r" (dead3), "=r" (dead4), "=r" (dead5), "=r" (dead6)
-   : "0" (x), "1" (y), "2" (sum1), "3" (sum2), "4" (len>>3)
+   : "=r" (deadx), "=r" (deady), "+r" (sum1),  "+r" (sum2),
+     "=r" (deadlen), "=r" (dead1), "=r" (dead2), "=r" (dead3),
+     "=r" (dead4), "=r" (dead5), "=r" (dead6)
+   : "0" (x), "1" (y), "4" (len>>3)
    : "cc"
                         );
    return (sum1+sum2)>>1;
@@ -169,13 +170,11 @@ void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *c
                "\tstr %6, %13 \n"
                "\tstr %7, %14 \n"
 
-            : "=r" (y0), "=r" (y1), "=r" (y2), "=r" (y3),
+            : "+r" (y0), "+r" (y1), "+r" (y2), "+r" (y3),
          "=r" (part1),  "=r" (part2),  "=r" (part3),  "=r" (part4),
-         "=r" (x), "=r" (y), "=r" (x0),
-         "=m" (sum1), "=m" (sum2), "=m" (sum3), "=m" (sum4), "=r" (dead1)
-            : "0" (y0), "1" (y1), "2" (y2), "3" (y3),
-            "8" (x), "9" (y),
-            "11" (sum1), "12" (sum2), "13" (sum3), "14" (sum4)
+              "+r" (x), "+r" (y), "=r" (x0), "+m" (sum1),
+              "+m" (sum2), "+m" (sum3), "+m" (sum4), "=r" (dead1)
+            :
             : "cc", "memory"
                               );
       }
diff --git a/libspeex/ltp_bfin.h b/libspeex/ltp_bfin.h
index c466902..b530f85 100644
--- a/libspeex/ltp_bfin.h
+++ b/libspeex/ltp_bfin.h
@@ -330,7 +330,6 @@ static int pitch_gain_search_3tap_vq(
 "        %0 = 0;\n\t"                      /* %0: best_sum         */
 "        %1 = 0;\n\t"                      /* %1: best_cbdk        */
 "        P1 = 0;\n\t"                      /* P1: loop counter     */
-"        R5 = 64;\n\t"                     /* R5: pitch_control    */
 
 "        LSETUP (pgs1, pgs2) LC1 = %4;\n\t"
 "pgs1:     R2  = B [P0++] (X);\n\t"        /* R2: g[0]             */
@@ -339,6 +338,7 @@ static int pitch_gain_search_3tap_vq(
 "          R2 += 32;\n\t"
 "          R3 += 32;\n\t"
 "          R4 += 32;\n\t"
+"          R4.H = 64;\n\t"                 /* R4.H: pitch_control    */
 
 "          R0  = B [P0++] (X);\n\t"              
 "          B0  = R0;\n\t"                  /* BO: gain_sum         */
@@ -349,13 +349,13 @@ static int pitch_gain_search_3tap_vq(
 "          A0 = 0;\n\t"
          
 "          R0.L = W[I1++];\n\t"
-"          R1.L = R2.L*R5.L (IS);\n\t"
+"          R1.L = R2.L*R4.H (IS);\n\t"
 "          A0 += R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
          
-"          R1.L = R3.L*R5.L (IS);\n\t"
+"          R1.L = R3.L*R4.H (IS);\n\t"
 "          A0 += R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
          
-"          R1.L = R4.L*R5.L (IS);\n\t"
+"          R1.L = R4.L*R4.H (IS);\n\t"
 "          A0 += R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
          
 "          R1.L = R2.L*R3.L (IS);\n\t"
@@ -406,7 +406,7 @@ static int pitch_gain_search_3tap_vq(
        : "=&d" (best_sum), "=&d" (best_cdbk) 
        : "a" (gain_cdbk), "a" (C16), "a" (gain_cdbk_size), "a" (max_gain),
          "b" (-VERY_LARGE32)
-       : "R0", "R1", "R2", "R3", "R4", "R5", "P0", 
+       : "R0", "R1", "R2", "R3", "R4", "P0", 
          "P1", "I1", "L1", "A0", "B0"
 #if (__GNUC__ == 4)
          , "LC1"
diff --git a/libspeex/math_approx.c b/libspeex/math_approx.c
index d98e05b..21af766 100644
--- a/libspeex/math_approx.c
+++ b/libspeex/math_approx.c
@@ -37,67 +37,83 @@
 #include "math_approx.h"
 #include "misc.h"
 
-#ifdef FIXED_POINT
-
-/* sqrt(x) ~= 0.22178 + 1.29227*x - 0.77070*x^2 + 0.25723*x^3 (for .25 < x < 1) */
-#define C0 3634
-#define C1 21173
-#define C2 -12627
-#define C3 4215
-
-spx_word16_t spx_sqrt(spx_word32_t x)
+spx_int16_t spx_ilog2(spx_uint32_t x)
 {
-   int k=0;
-   spx_word32_t rt;
-
-   if (x<=0)
-      return 0;
-#if 1
-   if (x>=16777216)
+   int r=0;
+   if (x>=(spx_int32_t)65536)
    {
-      x>>=10;
-      k+=5;
+      x >>= 16;
+      r += 16;
    }
-   if (x>=1048576)
+   if (x>=256)
    {
-      x>>=6;
-      k+=3;
+      x >>= 8;
+      r += 8;
    }
-   if (x>=262144)
+   if (x>=16)
    {
-      x>>=4;
-      k+=2;
+      x >>= 4;
+      r += 4;
    }
-   if (x>=32768)
+   if (x>=4)
    {
-      x>>=2;
-      k+=1;
+      x >>= 2;
+      r += 2;
    }
-   if (x>=16384)
+   if (x>=2)
    {
-      x>>=2;
-      k+=1;
+      r += 1;
    }
-#else
-   while (x>=16384)
+   return r;
+}
+
+spx_int16_t spx_ilog4(spx_uint32_t x)
+{
+   int r=0;
+   if (x>=(spx_int32_t)65536)
    {
-      x>>=2;
-      k++;
-      }
-#endif
-   while (x<4096)
+      x >>= 16;
+      r += 8;
+   }
+   if (x>=256)
    {
-      x<<=2;
-      k--;
+      x >>= 8;
+      r += 4;
    }
+   if (x>=16)
+   {
+      x >>= 4;
+      r += 2;
+   }
+   if (x>=4)
+   {
+      r += 1;
+   }
+   return r;
+}
+
+#ifdef FIXED_POINT
+
+/* sqrt(x) ~= 0.22178 + 1.29227*x - 0.77070*x^2 + 0.25723*x^3 (for .25 < x < 1) */
+/*#define C0 3634
+#define C1 21173
+#define C2 -12627
+#define C3 4215*/
+
+/* sqrt(x) ~= 0.22178 + 1.29227*x - 0.77070*x^2 + 0.25659*x^3 (for .25 < x < 1) */
+#define C0 3634
+#define C1 21173
+#define C2 -12627
+#define C3 4204
+
+spx_word16_t spx_sqrt(spx_word32_t x)
+{
+   int k;
+   spx_word32_t rt;
+   k = spx_ilog4(x)-6;
+   x = VSHR32(x, (k<<1));
    rt = ADD16(C0, MULT16_16_Q14(x, ADD16(C1, MULT16_16_Q14(x, ADD16(C2, MULT16_16_Q14(x, (C3)))))));
-   if (rt > 16383)
-      rt = 16383;
-   if (k>0)
-      rt <<= k;
-   else
-      rt >>= -k;
-   rt >>=7;
+   rt = VSHR32(rt,7-k);
    return rt;
 }
 
@@ -151,6 +167,101 @@ spx_word16_t spx_cos(spx_word16_t x)
    }
 }
 
+#define L1 32767
+#define L2 -7651
+#define L3 8277
+#define L4 -626
+
+static inline spx_word16_t _spx_cos_pi_2(spx_word16_t x)
+{
+   spx_word16_t x2;
+   
+   x2 = MULT16_16_P15(x,x);
+   return ADD16(1,MIN16(32766,ADD32(SUB16(L1,x2), MULT16_16_P15(x2, ADD32(L2, MULT16_16_P15(x2, ADD32(L3, MULT16_16_P15(L4, x2))))))));
+}
+
+spx_word16_t spx_cos_norm(spx_word32_t x)
+{
+   x = x&0x0001ffff;
+   if (x>SHL32(EXTEND32(1), 16))
+      x = SUB32(SHL32(EXTEND32(1), 17),x);
+   if (x&0x00007fff)
+   {
+      if (x<SHL32(EXTEND32(1), 15))
+      {
+         return _spx_cos_pi_2(EXTRACT16(x));
+      } else {
+         return NEG32(_spx_cos_pi_2(EXTRACT16(65536-x)));
+      }
+   } else {
+      if (x&0x0000ffff)
+         return 0;
+      else if (x&0x0001ffff)
+         return -32767;
+      else
+         return 32767;
+   }
+}
+
+/*
+ K0 = 1
+ K1 = log(2)
+ K2 = 3-4*log(2)
+ K3 = 3*log(2) - 2
+*/
+#define D0 16384
+#define D1 11356
+#define D2 3726
+#define D3 1301
+/* Input in Q11 format, output in Q16 */
+static spx_word32_t spx_exp2(spx_word16_t x)
+{
+   int integer;
+   spx_word16_t frac;
+   integer = SHR16(x,11);
+   if (integer>14)
+      return 0x7fffffff;
+   else if (integer < -15)
+      return 0;
+   frac = SHL16(x-SHL16(integer,11),3);
+   frac = ADD16(D0, MULT16_16_Q14(frac, ADD16(D1, MULT16_16_Q14(frac, ADD16(D2 , MULT16_16_Q14(D3,frac))))));
+   return VSHR32(EXTEND32(frac), -integer-2);
+}
+
+/* Input in Q11 format, output in Q16 */
+spx_word32_t spx_exp(spx_word16_t x)
+{
+   if (x>21290)
+      return 0x7fffffff;
+   else if (x<-21290)
+      return 0;
+   else
+      return spx_exp2(MULT16_16_P14(23637,x));
+}
+#define M1 32767
+#define M2 -21
+#define M3 -11943
+#define M4 4936
+
+static inline spx_word16_t spx_atan01(spx_word16_t x)
+{
+   return MULT16_16_P15(x, ADD32(M1, MULT16_16_P15(x, ADD32(M2, MULT16_16_P15(x, ADD32(M3, MULT16_16_P15(M4, x)))))));
+}
+
+/* Input in Q15, output in Q14 */
+spx_word16_t spx_atan(spx_word32_t x)
+{
+   if (x <= 32767)
+   {
+      return SHR16(spx_atan01(x),1);
+   } else {
+      int e = spx_ilog2(x);
+      if (e>=29)
+         return 25736;
+      x = DIV32_16(SHL32(EXTEND32(32767),29-e), EXTRACT16(SHR32(x, e-14)));
+      return SUB16(25736, SHR16(spx_atan01(x),1));
+   }
+}
 #else
 
 #ifndef M_PI
@@ -177,5 +288,4 @@ spx_word16_t spx_cos(spx_word16_t x)
    }
 }
 
-
 #endif
diff --git a/libspeex/math_approx.h b/libspeex/math_approx.h
index 377bf1a..49cfda6 100644
--- a/libspeex/math_approx.h
+++ b/libspeex/math_approx.h
@@ -38,13 +38,25 @@
 #include "misc.h"
 
 spx_word16_t spx_cos(spx_word16_t x);
-
+spx_int16_t spx_ilog2(spx_uint32_t x);
+spx_int16_t spx_ilog4(spx_uint32_t x);
 #ifdef FIXED_POINT
 spx_word16_t spx_sqrt(spx_word32_t x);
 spx_word16_t spx_acos(spx_word16_t x);
+spx_word32_t spx_exp(spx_word16_t x);
+spx_word16_t spx_cos_norm(spx_word32_t x);
+
+/* Input in Q15, output in Q14 */
+spx_word16_t spx_atan(spx_word32_t x);
+
 #else
+
 #define spx_sqrt sqrt
 #define spx_acos acos
+#define spx_exp exp
+#define spx_cos_norm(x) (cos((.5f*M_PI)*(x)))
+#define spx_atan atan
+
 #endif
 
 #endif
diff --git a/libspeex/mdf.c b/libspeex/mdf.c
index 3d79383..014ea25 100644
--- a/libspeex/mdf.c
+++ b/libspeex/mdf.c
@@ -41,8 +41,8 @@
    double-talk is achieved using a variable learning rate as described in:
    
    Valin, J.-M., On Adjusting the Learning Rate in Frequency Domain Echo 
-   Cancellation With Double-Talk. To appear in IEEE Transactions on Audio,
-   Speech and Language Processing, 2006.
+   Cancellation With Double-Talk. IEEE Transactions on Audio,
+   Speech and Language Processing, Vol. 15, No. 3, pp. 1030-1034, 2007.
    http://people.xiph.org/~jm/papers/valin_taslp2006.pdf
    
    There is no explicit double-talk detection, but a continuous variation
@@ -79,9 +79,6 @@
 #define M_PI 3.14159265358979323846
 #endif
 
-#define min(a,b) ((a)<(b) ? (a) : (b))
-#define max(a,b) ((a)>(b) ? (a) : (b))
-
 #ifdef FIXED_POINT
 #define WEIGHT_SHIFT 11
 #define NORMALIZE_SCALEDOWN 5
@@ -93,16 +90,40 @@
 /* If enabled, the transition between blocks is smooth, so there isn't any blocking
 aftifact when adapting. The cost is an extra FFT and a matrix-vector multiply */
 #define SMOOTH_BLOCKS
+/* If enabled, the AEC will use a foreground filter and a background filter to be more robust to double-talk
+   and difficult signals in general. The cost is an extra FFT and a matrix-vector multiply */
+#define TWO_PATH
 
 #ifdef FIXED_POINT
-static const spx_float_t MIN_LEAK = {16777, -19};
+static const spx_float_t MIN_LEAK = {20972, -22};
+
+/* Constants for the two-path filter */
+static const spx_float_t VAR1_SMOOTH = {23593, -16};
+static const spx_float_t VAR2_SMOOTH = {23675, -15};
+static const spx_float_t VAR1_UPDATE = {16384, -15};
+static const spx_float_t VAR2_UPDATE = {16384, -16};
+static const spx_float_t VAR_BACKTRACK = {16384, -12};
 #define TOP16(x) ((x)>>16)
+
 #else
-static const spx_float_t MIN_LEAK = .032f;
+
+static const spx_float_t MIN_LEAK = .005f; /* matches the fixed-point {20972, -22} (~0.005) */
+
+/* Constants for the two-path filter */
+static const spx_float_t VAR1_SMOOTH = .36f;
+static const spx_float_t VAR2_SMOOTH = .7225f;
+static const spx_float_t VAR1_UPDATE = .5f;
+static const spx_float_t VAR2_UPDATE = .25f;
+static const spx_float_t VAR_BACKTRACK = 4.f;
 #define TOP16(x) (x)
 #endif
 
 
+#define PLAYBACK_DELAY 2
+
+void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
+
+
 /** Speex echo cancellation state. */
 struct SpeexEchoState_ {
    int frame_size;           /**< Number of samples processed each time */
@@ -111,6 +132,7 @@ struct SpeexEchoState_ {
    int cancel_count;
    int adapted;
    int saturated;
+   int screwed_up;
    int C;                    /** Number of input channels (microphones) */
    int K;                    /** Number of output channels (loudspeakers) */
    spx_int32_t sampling_rate;
@@ -118,30 +140,38 @@ struct SpeexEchoState_ {
    spx_word16_t beta0;
    spx_word16_t beta_max;
    spx_word32_t sum_adapt;
-   spx_word16_t *e;
-   spx_word16_t *x;
-   spx_word16_t *X;
-   spx_word16_t *d;
-   spx_word16_t *y;
+   spx_word16_t leak_estimate;
+   
+   spx_word16_t *e;      /* scratch */
+   spx_word16_t *x;      /* Far-end input buffer (2N) */
+   spx_word16_t *X;      /* Far-end buffer (M+1 frames) in frequency domain */
+   spx_word16_t *input;  /* scratch */
+   spx_word16_t *y;      /* scratch */
    spx_word16_t *last_y;
-   spx_word32_t *Yps;
-   spx_word16_t *Y;
+   spx_word16_t *Y;      /* scratch */
    spx_word16_t *E;
-   spx_word32_t *PHI;
-   spx_word32_t *W;
-   spx_word32_t *power;
-   spx_float_t *power_1;
-   spx_word16_t *wtmp;
+   spx_word32_t *PHI;    /* scratch */
+   spx_word32_t *W;      /* (Background) filter weights */
+#ifdef TWO_PATH
+   spx_word16_t *foreground; /* Foreground filter weights */
+   spx_word32_t  Davg1;  /* 1st recursive average of the residual power difference */
+   spx_word32_t  Davg2;  /* 2nd recursive average of the residual power difference */
+   spx_float_t   Dvar1;  /* Estimated variance of 1st estimator */
+   spx_float_t   Dvar2;  /* Estimated variance of 2nd estimator */
+#endif
+   spx_word32_t *power;  /* Power of the far-end signal */
+   spx_float_t  *power_1;/* Inverse power of far-end */
+   spx_word16_t *wtmp;   /* scratch */
 #ifdef FIXED_POINT
-   spx_word16_t *wtmp2;
+   spx_word16_t *wtmp2;  /* scratch */
 #endif
-   spx_word32_t *Rf;
-   spx_word32_t *Yf;
-   spx_word32_t *Xf;
+   spx_word32_t *Rf;     /* scratch */
+   spx_word32_t *Yf;     /* scratch */
+   spx_word32_t *Xf;     /* scratch */
    spx_word32_t *Eh;
    spx_word32_t *Yh;
-   spx_float_t Pey;
-   spx_float_t Pyy;
+   spx_float_t   Pey;
+   spx_float_t   Pyy;
    spx_word16_t *window;
    spx_word16_t *prop;
    void *fft_table;
@@ -153,6 +183,7 @@ struct SpeexEchoState_ {
    /* NOTE: If you only use speex_echo_cancel() and want to save some memory, remove this */
    spx_int16_t *play_buf;
    int play_buf_pos;
+   int play_buf_started;
 };
 
 static inline void filter_dc_notch16(const spx_int16_t *in, spx_word16_t radius, spx_word16_t *out, int len, spx_mem_t *mem, int stride)
@@ -179,6 +210,7 @@ static inline void filter_dc_notch16(const spx_int16_t *in, spx_word16_t radius,
    }
 }
 
+/* This inner product is slightly different from the codec version because of fixed-point */
 static inline spx_word32_t mdf_inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
 {
    spx_word32_t sum=0;
@@ -247,6 +279,34 @@ static inline void spectral_mul_accum(const spx_word16_t *X, const spx_word32_t
    }
    acc[N-1] = PSHR32(tmp1,WEIGHT_SHIFT);
 }
+static inline void spectral_mul_accum16(const spx_word16_t *X, const spx_word16_t *Y, spx_word16_t *acc, int N, int M)
+{
+   int i,j;
+   spx_word32_t tmp1=0,tmp2=0;
+   for (j=0;j<M;j++)
+   {
+      tmp1 = MAC16_16(tmp1, X[j*N],Y[j*N]);
+   }
+   acc[0] = PSHR32(tmp1,WEIGHT_SHIFT);
+   for (i=1;i<N-1;i+=2)
+   {
+      tmp1 = tmp2 = 0;
+      for (j=0;j<M;j++)
+      {
+         tmp1 = SUB32(MAC16_16(tmp1, X[j*N+i],Y[j*N+i]), MULT16_16(X[j*N+i+1],Y[j*N+i+1]));
+         tmp2 = MAC16_16(MAC16_16(tmp2, X[j*N+i+1],Y[j*N+i]), X[j*N+i], Y[j*N+i+1]);
+      }
+      acc[i] = PSHR32(tmp1,WEIGHT_SHIFT);
+      acc[i+1] = PSHR32(tmp2,WEIGHT_SHIFT);
+   }
+   tmp1 = tmp2 = 0;
+   for (j=0;j<M;j++)
+   {
+      tmp1 = MAC16_16(tmp1, X[(j+1)*N-1],Y[(j+1)*N-1]);
+   }
+   acc[N-1] = PSHR32(tmp1,WEIGHT_SHIFT);
+}
+
 #else
 static inline void spectral_mul_accum(const spx_word16_t *X, const spx_word32_t *Y, spx_word16_t *acc, int N, int M)
 {
@@ -266,21 +326,73 @@ static inline void spectral_mul_accum(const spx_word16_t *X, const spx_word32_t
       Y += N;
    }
 }
+#define spectral_mul_accum16 spectral_mul_accum
 #endif
 
 /** Compute weighted cross-power spectrum of a half-complex (packed) vector with conjugate */
-static inline void weighted_spectral_mul_conj(const spx_float_t *w, const spx_word16_t *X, const spx_word16_t *Y, spx_word32_t *prod, int N)
+static inline void weighted_spectral_mul_conj(const spx_float_t *w, const spx_float_t p, const spx_word16_t *X, const spx_word16_t *Y, spx_word32_t *prod, int N)
 {
    int i, j;
-   prod[0] = FLOAT_MUL32(w[0],MULT16_16(X[0],Y[0]));
+   spx_float_t W;
+   W = FLOAT_AMULT(p, w[0]);
+   prod[0] = FLOAT_MUL32(W,MULT16_16(X[0],Y[0]));
    for (i=1,j=1;i<N-1;i+=2,j++)
    {
-      prod[i] = FLOAT_MUL32(w[j],MAC16_16(MULT16_16(X[i],Y[i]), X[i+1],Y[i+1]));
-      prod[i+1] = FLOAT_MUL32(w[j],MAC16_16(MULT16_16(-X[i+1],Y[i]), X[i],Y[i+1]));
+      W = FLOAT_AMULT(p, w[j]);
+      prod[i] = FLOAT_MUL32(W,MAC16_16(MULT16_16(X[i],Y[i]), X[i+1],Y[i+1]));
+      prod[i+1] = FLOAT_MUL32(W,MAC16_16(MULT16_16(-X[i+1],Y[i]), X[i],Y[i+1]));
+   }
+   W = FLOAT_AMULT(p, w[j]);
+   prod[i] = FLOAT_MUL32(W,MULT16_16(X[i],Y[i]));
+}
+
+static inline void mdf_adjust_prop(const spx_word32_t *W, int N, int M, int P, spx_word16_t *prop)
+{
+   int i, j, p;
+   spx_word16_t max_sum = 1;
+   spx_word32_t prop_sum = 1;
+   for (i=0;i<M;i++)
+   {
+      spx_word32_t tmp = 1;
+      for (p=0;p<P;p++)
+         for (j=0;j<N;j++)
+            tmp += MULT16_16(EXTRACT16(SHR32(W[p*N*M + i*N+j],18)), EXTRACT16(SHR32(W[p*N*M + i*N+j],18)));
+#ifdef FIXED_POINT
+      /* Just a security in case an overflow were to occur */
+      tmp = MIN32(ABS32(tmp), 536870912);
+#endif
+      prop[i] = spx_sqrt(tmp);
+      if (prop[i] > max_sum)
+         max_sum = prop[i];
+   }
+   for (i=0;i<M;i++)
+   {
+      prop[i] += MULT16_16_Q15(QCONST16(.03f,15),max_sum);
+      prop_sum += EXTEND32(prop[i]);
+   }
+   for (i=0;i<M;i++)
+   {
+      prop[i] = DIV32(MULT16_16(QCONST16(.99f,15), prop[i]),prop_sum);
+      /*printf ("%f ", prop[i]);*/
    }
-   prod[i] = FLOAT_MUL32(w[j],MULT16_16(X[i],Y[i]));
+   /*printf ("\n");*/
 }
 
+#ifdef DUMP_ECHO_CANCEL_DATA
+#include <stdio.h>
+static FILE *rFile=NULL, *pFile=NULL, *oFile=NULL;
+
+static void dump_audio(const spx_int16_t *rec, const spx_int16_t *play, const spx_int16_t *out, int len)
+{
+   if (!(rFile && pFile && oFile))
+   {
+      speex_error("Dump files not open");
+   }
+   fwrite(rec, sizeof(spx_int16_t), len, rFile);
+   fwrite(play, sizeof(spx_int16_t), len, pFile);
+   fwrite(out, sizeof(spx_int16_t), len, oFile);
+}
+#endif
 
 /** Creates a new echo canceller state */
 SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic, int nb_speakers)
@@ -292,6 +404,14 @@ SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic
    st->C = nb_mic;
    C=st->C;
    K=st->K;
+#ifdef DUMP_ECHO_CANCEL_DATA
+   if (rFile || pFile || oFile)
+      speex_error("Opening dump files twice");
+   rFile = fopen("aec_rec.sw", "wb");   /* binary mode: raw 16-bit samples are written with fwrite() */
+   pFile = fopen("aec_play.sw", "wb");
+   oFile = fopen("aec_out.sw", "wb");
+#endif
+   
    st->frame_size = frame_size;
    st->window_size = 2*frame_size;
    N = st->window_size;
@@ -299,7 +419,8 @@ SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic
    st->cancel_count=0;
    st->sum_adapt = 0;
    st->saturated = 0;
-   /* FIXME: Make that an init option (new API call?) */
+   st->screwed_up = 0;
+   /* This is the default sampling rate */
    st->sampling_rate = 8000;
    st->spec_average = DIV32_16(SHL32(EXTEND32(st->frame_size), 15), st->sampling_rate);
 #ifdef FIXED_POINT
@@ -309,14 +430,14 @@ SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic
    st->beta0 = (2.0f*st->frame_size)/st->sampling_rate;
    st->beta_max = (.5f*st->frame_size)/st->sampling_rate;
 #endif
+   st->leak_estimate = 0;
 
    st->fft_table = spx_fft_init(N);
    
    st->e = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
    st->x = (spx_word16_t*)speex_alloc(K*N*sizeof(spx_word16_t));
-   st->d = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
+   st->input = (spx_word16_t*)speex_alloc(C*st->frame_size*sizeof(spx_word16_t));
    st->y = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
-   st->Yps = (spx_word32_t*)speex_alloc(C*N*sizeof(spx_word32_t));
    st->last_y = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
    st->Yf = (spx_word32_t*)speex_alloc((st->frame_size+1)*sizeof(spx_word32_t));
    st->Rf = (spx_word32_t*)speex_alloc((st->frame_size+1)*sizeof(spx_word32_t));
@@ -328,6 +449,9 @@ SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic
    st->Y = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
    st->E = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
    st->W = (spx_word32_t*)speex_alloc(C*K*M*N*sizeof(spx_word32_t));
+#ifdef TWO_PATH
+   st->foreground = (spx_word16_t*)speex_alloc(M*N*C*K*sizeof(spx_word16_t));
+#endif
    st->PHI = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
    st->power = (spx_word32_t*)speex_alloc((frame_size+1)*sizeof(spx_word32_t));
    st->power_1 = (spx_float_t*)speex_alloc((frame_size+1)*sizeof(spx_float_t));
@@ -349,12 +473,10 @@ SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic
       st->power_1[i] = FLOAT_ONE;
    for (i=0;i<N*M*K*C;i++)
       st->W[i] = 0;
-   for (i=0;i<N;i++)
-      st->PHI[i] = 0;
    {
       spx_word32_t sum = 0;
       /* Ratio of ~10 between adaptation rate of first and last block */
-      spx_word16_t decay = QCONST16(exp(-2.4/M),15);
+      spx_word16_t decay = SHR32(spx_exp(NEG16(DIV32_16(QCONST16(2.4,11),M))),1);
       st->prop[0] = QCONST16(.7, 15);
       sum = EXTEND32(st->prop[0]);
       for (i=1;i<M;i++)
@@ -364,7 +486,7 @@ SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic
       }
       for (i=M-1;i>=0;i--)
       {
-         st->prop[i] = DIV32(SHL32(EXTEND32(st->prop[i]),15),sum);
+         st->prop[i] = DIV32(MULT16_16(QCONST16(.99f,15), st->prop[i]),sum);
       }
    }
    
@@ -383,9 +505,15 @@ SpeexEchoState *mc_echo_state_init(int frame_size, int filter_length, int nb_mic
    st->adapted = 0;
    st->Pey = st->Pyy = FLOAT_ONE;
    
-   st->play_buf = (spx_int16_t*)speex_alloc(K*2*st->frame_size*sizeof(spx_int16_t));
-   st->play_buf_pos = 0;
-
+#ifdef TWO_PATH
+   st->Davg1 = st->Davg2 = 0;
+   st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
+#endif
+   
+   st->play_buf = (spx_int16_t*)speex_alloc(K*(PLAYBACK_DELAY+1)*st->frame_size*sizeof(spx_int16_t));
+   st->play_buf_pos = PLAYBACK_DELAY*st->frame_size;
+   st->play_buf_started = 0;
+   
    return st;
 }
 
@@ -394,26 +522,57 @@ void speex_echo_state_reset(SpeexEchoState *st)
 {
    int i, M, N, C, K;
    st->cancel_count=0;
+   st->screwed_up = 0;
    N = st->window_size;
    M = st->M;
    C=st->C;
    K=st->K;
    for (i=0;i<N*M;i++)
       st->W[i] = 0;
+#ifdef TWO_PATH
+   for (i=0;i<N*M*C*K;i++) /* clear the whole foreground filter (allocated as M*N*C*K entries) */
+      st->foreground[i] = 0;
+#endif
    for (i=0;i<N*(M+1);i++)
       st->X[i] = 0;
    for (i=0;i<=st->frame_size;i++)
+   {
       st->power[i] = 0;
-   for (i=0;i<N;i++)
+      st->power_1[i] = FLOAT_ONE;
+      st->Eh[i] = 0;
+      st->Yh[i] = 0;
+   }
+   for (i=0;i<st->frame_size;i++)
+   {
+      st->last_y[i] = 0;
+   }
+   for (i=0;i<N*C;i++)
+   {
       st->E[i] = 0;
+   }
+   for (i=0;i<N*K;i++)
+   {
+      st->x[i] = 0;
+   }
    for (i=0;i<2*C;i++)
       st->notch_mem[i] = 0;
-  
+   for (i=0;i<C;i++)
+      st->memD[i]=st->memE[i]=0;
+   for (i=0;i<K;i++)
+      st->memX[i]=0;
+
    st->saturated = 0;
    st->adapted = 0;
    st->sum_adapt = 0;
    st->Pey = st->Pyy = FLOAT_ONE;
-   st->play_buf_pos = 0;
+#ifdef TWO_PATH
+   st->Davg1 = st->Davg2 = 0;
+   st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
+#endif
+   for (i=0;i<K*(PLAYBACK_DELAY+1)*st->frame_size;i++) /* same extent as the play_buf allocation */
+      st->play_buf[i] = 0;
+   st->play_buf_pos = PLAYBACK_DELAY*st->frame_size;
+   st->play_buf_started = 0;
 
 }
 
@@ -424,10 +583,9 @@ void mc_echo_state_destroy(SpeexEchoState *st)
 
    speex_free(st->e);
    speex_free(st->x);
-   speex_free(st->d);
+   speex_free(st->input);
    speex_free(st->y);
    speex_free(st->last_y);
-   speex_free(st->Yps);
    speex_free(st->Yf);
    speex_free(st->Rf);
    speex_free(st->Xf);
@@ -438,6 +596,9 @@ void mc_echo_state_destroy(SpeexEchoState *st)
    speex_free(st->Y);
    speex_free(st->E);
    speex_free(st->W);
+#ifdef TWO_PATH
+   speex_free(st->foreground);
+#endif
    speex_free(st->PHI);
    speex_free(st->power);
    speex_free(st->power_1);
@@ -449,19 +610,29 @@ void mc_echo_state_destroy(SpeexEchoState *st)
 #endif
    speex_free(st->play_buf);
    speex_free(st);
+   
+#ifdef DUMP_ECHO_CANCEL_DATA
+   fclose(rFile);
+   fclose(pFile);
+   fclose(oFile);
+   rFile = pFile = oFile = NULL;
+#endif
 }
 
-void mc_echo_capture(SpeexEchoState *st, const spx_int16_t *rec, spx_int16_t *out, spx_int32_t *Yout)
+
+void mc_echo_capture2(SpeexEchoState *st, const spx_int16_t *rec, spx_int16_t *out)
 {
    int i;
+   /*speex_warning_int("capture with fill level ", st->play_buf_pos/st->frame_size);*/
+   st->play_buf_started = 1;
    if (st->play_buf_pos>=st->frame_size)
    {
-      mc_echo_cancel(st, rec, st->play_buf, out, Yout);
+      mc_echo_cancellation(st, rec, st->play_buf, out);
       st->play_buf_pos -= st->frame_size;
-      for (i=0;i<st->frame_size;i++)
+      for (i=0;i<st->play_buf_pos;i++)
          st->play_buf[i] = st->play_buf[i+st->frame_size];
    } else {
-      speex_warning("no playback frame available");
+      speex_warning("No playback frame available (your application is buggy and/or got xruns)");
       if (st->play_buf_pos!=0)
       {
          speex_warning("internal playback buffer corruption?");
@@ -474,23 +645,47 @@ void mc_echo_capture(SpeexEchoState *st, const spx_int16_t *rec, spx_int16_t *ou
 
 void mc_echo_playback(SpeexEchoState *st, const spx_int16_t *play)
 {
-   if (st->play_buf_pos<=st->frame_size)
+   /*speex_warning_int("playback with fill level ", st->play_buf_pos/st->frame_size);*/
+   if (!st->play_buf_started)
+   {
+      speex_warning("discarded first playback frame");
+      return;
+   }
+   if (st->play_buf_pos<=PLAYBACK_DELAY*st->frame_size)
    {
       int i;
       for (i=0;i<st->frame_size;i++)
          st->play_buf[st->play_buf_pos+i] = play[i];
       st->play_buf_pos += st->frame_size;
+      if (st->play_buf_pos <= (PLAYBACK_DELAY-1)*st->frame_size)
+      {
+         speex_warning("Auto-filling the buffer (your application is buggy and/or got xruns)");
+         for (i=0;i<st->frame_size;i++)
+            st->play_buf[st->play_buf_pos+i] = play[i];
+         st->play_buf_pos += st->frame_size;
+      }
    } else {
-      speex_warning("had to discard a playback frame");
+      speex_warning("Had to discard a playback frame (your application is buggy and/or got xruns)");
    }
 }
 
 /** Performs echo cancellation on a frame */
-void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_t *echo, spx_int16_t *out, spx_int32_t *Yout)
+void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *in, const spx_int16_t *far_end, spx_int16_t *out, spx_int32_t *Yout)
+{
+   mc_echo_cancellation(st, in, far_end, out);
+}
+
+/** Performs echo cancellation on a frame (replaces the deprecated mc_echo_cancel(), whose last arg is now ignored) */
+void mc_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, const spx_int16_t *far_end, spx_int16_t *out)
 {
    int i,j, chan, speak;
    int N,M, C, K;
-   spx_word16_t leak_estimate;
+   spx_word32_t Syy,See,Sxx,Sdd, Sff;
+#ifdef TWO_PATH
+   spx_word32_t Dbf;
+   int update_foreground;
+#endif
+   spx_word32_t Sey;
    spx_word16_t ss, ss_1;
    spx_float_t Pey = FLOAT_ONE, Pyy=FLOAT_ONE;
    spx_float_t alpha, alpha_1;
@@ -501,7 +696,6 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
    M = st->M;
    C = st->C;
    K = st->K;
-   spx_word32_t Syy=0,See=0,Sxx=0;
 
    st->cancel_count++;
 #ifdef FIXED_POINT
@@ -514,29 +708,31 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
 
    for (chan = 0; chan < C; chan++)
    {
-      filter_dc_notch16(ref+chan, st->notch_radius, st->d+chan*N, st->frame_size, st->notch_mem+2*chan, C);
+      /* Apply a notch filter to make sure DC doesn't end up causing problems */
+      filter_dc_notch16(in+chan, st->notch_radius, st->input+chan*st->frame_size, st->frame_size, st->notch_mem+2*chan, C);
+      /* Copy input data to buffer and apply pre-emphasis */
       /* Copy input data to buffer */
       for (i=0;i<st->frame_size;i++)
       {
-         spx_word16_t tmp;
          spx_word32_t tmp32;
-         tmp = st->d[chan*N+i];
-         st->d[chan*N+i] = st->d[chan*N+i+st->frame_size];
-         tmp32 = SUB32(EXTEND32(tmp), EXTEND32(MULT16_16_P15(st->preemph, st->memD[chan])));
+         /* FIXME: This core has changed a bit, need to merge properly */
+         tmp32 = SUB32(EXTEND32(st->input[chan*st->frame_size+i]), EXTEND32(MULT16_16_P15(st->preemph, st->memD[chan])));
 #ifdef FIXED_POINT
          if (tmp32 > 32767)
          {
             tmp32 = 32767;
-            st->saturated = 1;
+            if (st->saturated == 0)
+               st->saturated = 1;
          }      
          if (tmp32 < -32767)
          {
             tmp32 = -32767;
-            st->saturated = 1;
+            if (st->saturated == 0)
+               st->saturated = 1;
          }
 #endif
-         st->d[chan*N+i+st->frame_size] = tmp32;
-         st->memD[chan] = tmp;
+         st->memD[chan] = st->input[chan*st->frame_size+i];
+         st->input[chan*st->frame_size+i] = EXTRACT16(tmp32);
       }
    }
 
@@ -544,25 +740,24 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
    {
       for (i=0;i<st->frame_size;i++)
       {
-         spx_word16_t tmp;
          spx_word32_t tmp32;
          st->x[speak*N+i] = st->x[speak*N+i+st->frame_size];
-         tmp32 = SUB32(EXTEND32(echo[i*K+speak]), EXTEND32(MULT16_16_P15(st->preemph, st->memX[speak])));
+         tmp32 = SUB32(EXTEND32(far_end[i*K+speak]), EXTEND32(MULT16_16_P15(st->preemph, st->memX[speak])));
 #ifdef FIXED_POINT
          /*FIXME: If saturation occurs here, we need to freeze adaptation for M frames (not just one) */
          if (tmp32 > 32767)
          {
             tmp32 = 32767;
-            st->saturated = 1;
+            st->saturated = M+1;
          }      
          if (tmp32 < -32767)
          {
             tmp32 = -32767;
-            st->saturated = 1;
+            st->saturated = M+1;
          }      
 #endif
          st->x[speak*N+i+st->frame_size] = EXTRACT16(tmp32);
-         st->memX[speak] = echo[i*K+speak];
+         st->memX[speak] = far_end[i*K+speak];
       }
    }   
    
@@ -578,16 +773,32 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
       spx_fft(st->fft_table, st->x+speak*N, &st->X[speak*N]);
    }
    
+   Sxx = 0;
+   for (speak = 0; speak < K; speak++)
+   {
+      Sxx += mdf_inner_prod(st->x+speak*N+st->frame_size, st->x+speak*N+st->frame_size, st->frame_size);
+      power_spectrum_accum(st->X+speak*N, st->Xf, N);
+   }
+   
+   Sff = 0;  
    for (chan = 0; chan < C; chan++)
    {
-#ifdef SMOOTH_BLOCKS
-      spectral_mul_accum(st->X, st->W+chan*N*K*M, st->Y+chan*N, N, M*K);
+#ifdef TWO_PATH
+      /* Compute foreground filter */
+      spectral_mul_accum16(st->X, st->foreground+chan*N*K*M, st->Y+chan*N, N, M*K);
       spx_ifft(st->fft_table, st->Y+chan*N, st->e+chan*N);
+      for (i=0;i<st->frame_size;i++)
+         st->e[chan*N+i] = SUB16(st->input[chan*st->frame_size+i], st->e[chan*N+i+st->frame_size]);
+      Sff += mdf_inner_prod(st->e+chan*N, st->e+chan*N, st->frame_size);
 #endif
    }
    
+   /* Adjust proportional adaption rate */
+   /* FIXME: Adjust that for C, K*/
+   if (st->adapted)
+      mdf_adjust_prop (st->W, N, M, C*K, st->prop);
    /* Compute weight gradient */
-   if (!st->saturated)
+   if (st->saturated == 0)
    {
       for (chan = 0; chan < C; chan++)
       {
@@ -595,16 +806,16 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
          {
             for (j=M-1;j>=0;j--)
             {
-               weighted_spectral_mul_conj(st->power_1, &st->X[(j+1)*N*K+speak*N], st->E+chan*N, st->PHI, N);
+               weighted_spectral_mul_conj(st->power_1, FLOAT_SHL(PSEUDOFLOAT(st->prop[j]),-15), &st->X[(j+1)*N*K+speak*N], st->E+chan*N, st->PHI, N);
                for (i=0;i<N;i++)
-                  st->W[chan*N*K*M + j*N*K + speak*N + i] += MULT16_32_Q15(st->prop[j], st->PHI[i]);
+                  st->W[chan*N*K*M + j*N*K + speak*N + i] += st->PHI[i];
             }
          }
-      }   
+      }
+   } else {
+      st->saturated--;
    }
    
-   st->saturated = 0;
-   
    /* FIXME: MC conversion required */ 
    /* Update weight to prevent circular convolution (MDF / AUMDF) */
    for (chan = 0; chan < C; chan++)
@@ -649,52 +860,141 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
    /* So we can use power_spectrum_accum */ 
    for (i=0;i<=st->frame_size;i++)
       st->Rf[i] = st->Yf[i] = st->Xf[i] = 0;
-   
+      
+   See = 0;
+#ifdef TWO_PATH
+   Dbf = 0; /* Dbf is only declared when TWO_PATH is defined */
+   /* Difference in response, this is used to estimate the variance of our residual power estimate */
    for (chan = 0; chan < C; chan++)
    {
-      /* Compute filter response Y */
       spectral_mul_accum(st->X, st->W+chan*N*K*M, st->Y+chan*N, N, M*K);
       spx_ifft(st->fft_table, st->Y+chan*N, st->y+chan*N);
+      for (i=0;i<st->frame_size;i++)
+         st->e[chan*N+i] = SUB16(st->e[chan*N+i+st->frame_size], st->y[chan*N+i+st->frame_size]);
+      Dbf += 10+mdf_inner_prod(st->e+chan*N, st->e+chan*N, st->frame_size);
+      for (i=0;i<st->frame_size;i++)
+         st->e[chan*N+i] = SUB16(st->input[chan*st->frame_size+i], st->y[chan*N+i+st->frame_size]);
+      See += mdf_inner_prod(st->e+chan*N, st->e+chan*N, st->frame_size);
+   }
+#endif
+
+#ifndef TWO_PATH
+   Sff = See;
+#endif
 
+#ifdef TWO_PATH
+   /* Logic for updating the foreground filter */
+   
+   /* For two time windows, compute the mean of the energy difference, as well as the variance */
+   st->Davg1 = ADD32(MULT16_32_Q15(QCONST16(.6f,15),st->Davg1), MULT16_32_Q15(QCONST16(.4f,15),SUB32(Sff,See)));
+   st->Davg2 = ADD32(MULT16_32_Q15(QCONST16(.85f,15),st->Davg2), MULT16_32_Q15(QCONST16(.15f,15),SUB32(Sff,See)));
+   st->Dvar1 = FLOAT_ADD(FLOAT_MULT(VAR1_SMOOTH, st->Dvar1), FLOAT_MUL32U(MULT16_32_Q15(QCONST16(.4f,15),Sff), MULT16_32_Q15(QCONST16(.4f,15),Dbf)));
+   st->Dvar2 = FLOAT_ADD(FLOAT_MULT(VAR2_SMOOTH, st->Dvar2), FLOAT_MUL32U(MULT16_32_Q15(QCONST16(.15f,15),Sff), MULT16_32_Q15(QCONST16(.15f,15),Dbf)));
+   
+   /* Equivalent float code:
+   st->Davg1 = .6*st->Davg1 + .4*(Sff-See);
+   st->Davg2 = .85*st->Davg2 + .15*(Sff-See);
+   st->Dvar1 = .36*st->Dvar1 + .16*Sff*Dbf;
+   st->Dvar2 = .7225*st->Dvar2 + .0225*Sff*Dbf;
+   */
+   
+   update_foreground = 0;
+   /* Check if we have a statistically significant reduction in the residual echo */
+   /* Note that this is *not* Gaussian, so we need to be careful about the longer tail */
+   if (FLOAT_GT(FLOAT_MUL32U(SUB32(Sff,See),ABS32(SUB32(Sff,See))), FLOAT_MUL32U(Sff,Dbf)))
+      update_foreground = 1;
+   else if (FLOAT_GT(FLOAT_MUL32U(st->Davg1, ABS32(st->Davg1)), FLOAT_MULT(VAR1_UPDATE,(st->Dvar1))))
+      update_foreground = 1;
+   else if (FLOAT_GT(FLOAT_MUL32U(st->Davg2, ABS32(st->Davg2)), FLOAT_MULT(VAR2_UPDATE,(st->Dvar2))))
+      update_foreground = 1;
    
+   /* Do we update? */
+   if (update_foreground)
+   {
+      st->Davg1 = st->Davg2 = 0;
+      st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
+      /* Copy background filter to foreground filter */
+      for (i=0;i<N*M*C*K;i++)
+         st->foreground[i] = EXTRACT16(PSHR32(st->W[i],16));
+      /* Apply a smooth transition so as to not introduce blocking artifacts */
+      for (chan = 0; chan < C; chan++)
+         for (i=0;i<st->frame_size;i++)
+            st->e[chan*N+i+st->frame_size] = MULT16_16_Q15(st->window[i+st->frame_size],st->e[chan*N+i+st->frame_size]) + MULT16_16_Q15(st->window[i],st->y[chan*N+i+st->frame_size]);
+   } else {
+      int reset_background=0;
+      /* Otherwise, check if the background filter is significantly worse */
+      if (FLOAT_GT(FLOAT_MUL32U(NEG32(SUB32(Sff,See)),ABS32(SUB32(Sff,See))), FLOAT_MULT(VAR_BACKTRACK,FLOAT_MUL32U(Sff,Dbf))))
+         reset_background = 1;
+      if (FLOAT_GT(FLOAT_MUL32U(NEG32(st->Davg1), ABS32(st->Davg1)), FLOAT_MULT(VAR_BACKTRACK,st->Dvar1)))
+         reset_background = 1;
+      if (FLOAT_GT(FLOAT_MUL32U(NEG32(st->Davg2), ABS32(st->Davg2)), FLOAT_MULT(VAR_BACKTRACK,st->Dvar2)))
+         reset_background = 1;
+      if (reset_background)
+      {
+         /* Copy foreground filter to background filter */
+         for (i=0;i<N*M*C*K;i++)
+            st->W[i] = SHL32(EXTEND32(st->foreground[i]),16);
+         /* We also need to copy the output so as to get correct adaptation */
+         for (chan = 0; chan < C; chan++)
+         {        
+            for (i=0;i<st->frame_size;i++)
+               st->y[chan*N+i+st->frame_size] = st->e[chan*N+i+st->frame_size];
+            for (i=0;i<st->frame_size;i++)
+               st->e[chan*N+i] = SUB16(st->input[chan*st->frame_size+i], st->y[chan*N+i+st->frame_size]);
+         }        
+         See = Sff;
+         st->Davg1 = st->Davg2 = 0;
+         st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
+      }
+   }
+#endif
+
+   Sey = Syy = Sdd = 0;  
+   for (chan = 0; chan < C; chan++)
+   {    
       /* Compute error signal (for the output with de-emphasis) */ 
       for (i=0;i<st->frame_size;i++)
       {
          spx_word32_t tmp_out;
-#ifdef SMOOTH_BLOCKS
-         spx_word16_t y = MULT16_16_Q15(st->window[i+st->frame_size],st->e[chan*N+i+st->frame_size]) + MULT16_16_Q15(st->window[i],st->y[chan*N+i+st->frame_size]);
-         tmp_out = SUB32(EXTEND32(st->d[chan*N+i+st->frame_size]), EXTEND32(y));
+#ifdef TWO_PATH
+         tmp_out = SUB32(EXTEND32(st->input[chan*st->frame_size+i]), EXTEND32(st->e[chan*N+i+st->frame_size]));
 #else
-         tmp_out = SUB32(EXTEND32(st->d[chan*N+i+st->frame_size]), EXTEND32(st->y[chan*N+i+st->frame_size]));
+         tmp_out = SUB32(EXTEND32(st->input[chan*st->frame_size+i]), EXTEND32(st->y[chan*N+i+st->frame_size]));
 #endif
-
          /* Saturation */
          if (tmp_out>32767)
             tmp_out = 32767;
          else if (tmp_out<-32768)
             tmp_out = -32768;
          tmp_out = ADD32(tmp_out, EXTEND32(MULT16_16_P15(st->preemph, st->memE[chan])));
-         /* This is an arbitrary test for saturation */
-         if (ref[i*C+chan] <= -32000 || ref[i*C+chan] >= 32000)
+      /* This is an arbitrary test for saturation in the microphone signal */
+         if (in[i*C+chan] <= -32000 || in[i*C+chan] >= 32000)
          {
             tmp_out = 0;
+         if (st->saturated == 0)
             st->saturated = 1;
          }
          out[i*C+chan] = (spx_int16_t)tmp_out;
          st->memE[chan] = tmp_out;
       }
-      
+
+#ifdef DUMP_ECHO_CANCEL_DATA
+      dump_audio(in, far_end, out, st->frame_size);
+#endif
+   
       /* Compute error signal (filter update version) */ 
       for (i=0;i<st->frame_size;i++)
       {
+         st->e[chan*N+i+st->frame_size] = st->e[chan*N+i];
          st->e[chan*N+i] = 0;
-         st->e[chan*N+i+st->frame_size] = st->d[chan*N+i+st->frame_size] - st->y[chan*N+i+st->frame_size];
       }
       
       /* Compute a bunch of correlations */
-      See += mdf_inner_prod(st->e+chan*N+st->frame_size, st->e+chan*N+st->frame_size, st->frame_size);
+      /* FIXME: bad merge */
+      Sey += mdf_inner_prod(st->e+chan*N+st->frame_size, st->y+chan*N+st->frame_size, st->frame_size);
       Syy += mdf_inner_prod(st->y+chan*N+st->frame_size, st->y+chan*N+st->frame_size, st->frame_size);
-   
+      Sdd += mdf_inner_prod(st->input+chan*st->frame_size, st->input+chan*st->frame_size, st->frame_size);
+      
       /* Convert error to frequency domain */
       spx_fft(st->fft_table, st->e+chan*N, st->E+chan*N);
       for (i=0;i<st->frame_size;i++)
@@ -704,16 +1004,48 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
       /* Compute power spectrum of echo (X), error (E) and filter response (Y) */
       power_spectrum_accum(st->E+chan*N, st->Rf, N);
       power_spectrum_accum(st->Y+chan*N, st->Yf, N);
+    
    }
-   See = ADD32(See, SHR32(EXTEND32(10000),6));
    
+   /*printf ("%f %f %f %f\n", Sff, See, Syy, Sdd, st->update_cond);*/
+   
+   /* Do some sanity check */
+   if (!(Syy>=0 && Sxx>=0 && See >= 0)
+#ifndef FIXED_POINT
+       || !(Sff < N*1e9 && Syy < N*1e9 && Sxx < N*1e9)
+#endif
+      )
+   {
+      /* Things have gone really bad */
+      st->screwed_up += 50;
+      for (i=0;i<st->frame_size*C;i++)
+         out[i] = 0;
+   } else if (SHR32(Sff, 2) > ADD32(Sdd, SHR32(MULT16_16(N, 10000),6)))
+   {
+      /* AEC seems to add lots of echo instead of removing it, let's see if it will improve */
+      st->screwed_up++;
+   } else {
+      /* Everything's fine */
+      st->screwed_up=0;
+   }
+   if (st->screwed_up>=50)
+   {
+      speex_warning("The echo canceller started acting funny and got slapped (reset). It swears it will behave now.");
+      speex_echo_state_reset(st);
+      return;
+   }
+
+   /* Add a small noise floor to make sure not to have problems when dividing */
+   See = MAX32(See, SHR32(MULT16_16(N, 100),6));
+     
    for (speak = 0; speak < K; speak++)
    {
       Sxx += mdf_inner_prod(st->x+speak*N+st->frame_size, st->x+speak*N+st->frame_size, st->frame_size);
       power_spectrum_accum(st->X+speak*N, st->Xf, N);
    }
+
    
-   /* Smooth echo energy estimate over time */
+   /* Smooth far end energy estimate over time */
    for (j=0;j<=st->frame_size;j++)
       st->power[j] = MULT16_32_Q15(ss_1,st->power[j]) + 1 + MULT16_32_Q15(ss,st->Xf[j]);
 
@@ -754,40 +1086,53 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
    if (FLOAT_GT(st->Pey, st->Pyy))
       st->Pey = st->Pyy;
    /* leak_estimate is the linear regression result */
-   leak_estimate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIVU(st->Pey, st->Pyy),14));
+   st->leak_estimate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIVU(st->Pey, st->Pyy),14));
    /* This looks like a stupid bug, but it's right (because we convert from Q14 to Q15) */
-   if (leak_estimate > 16383)
-      leak_estimate = 32767;
+   if (st->leak_estimate > 16383)
+      st->leak_estimate = 32767;
    else
-      leak_estimate = SHL16(leak_estimate,1);
-   /*printf ("%f\n", leak_estimate);*/
+      st->leak_estimate = SHL16(st->leak_estimate,1);
+   /*printf ("%f\n", st->leak_estimate);*/
    
    /* Compute Residual to Error Ratio */
 #ifdef FIXED_POINT
-   tmp32 = MULT16_32_Q15(leak_estimate,Syy);
-   tmp32 = ADD32(tmp32, SHL32(tmp32,1));
+   tmp32 = MULT16_32_Q15(st->leak_estimate,Syy);
+   tmp32 = ADD32(SHR32(Sxx,13), ADD32(tmp32, SHL32(tmp32,1)));
+   /* Check for y in e (lower bound on RER) */
+   {
+      spx_float_t bound = PSEUDOFLOAT(Sey);
+      bound = FLOAT_DIVU(FLOAT_MULT(bound, bound), PSEUDOFLOAT(ADD32(1,Syy)));
+      if (FLOAT_GT(bound, PSEUDOFLOAT(See)))
+         tmp32 = See;
+      else if (tmp32 < FLOAT_EXTRACT32(bound))
+         tmp32 = FLOAT_EXTRACT32(bound);
+   }
    if (tmp32 > SHR32(See,1))
       tmp32 = SHR32(See,1);
    RER = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32,See),15));
-#else   
-   RER = (.0001*Sxx + 3.*MULT16_32_Q15(leak_estimate,Syy)) / See;
+#else
+   RER = (.0001*Sxx + 3.*MULT16_32_Q15(st->leak_estimate,Syy)) / See;
+   /* Check for y in e (lower bound on RER) */
+   if (RER < Sey*Sey/(1+See*Syy))
+      RER = Sey*Sey/(1+See*Syy);
    if (RER > .5)
       RER = .5;
 #endif
 
    /* We consider that the filter has had minimal adaptation if the following is true*/
-   if (!st->adapted && st->sum_adapt > QCONST32(1,15))
+   if (!st->adapted && st->sum_adapt > QCONST32(M,15) && MULT16_32_Q15(st->leak_estimate,Syy) > MULT16_32_Q15(QCONST16(.03f,15),Syy))
    {
       st->adapted = 1;
    }
 
    if (st->adapted)
    {
+      /* Normal learning rate calculation once we're past the minimal adaptation phase */
       for (i=0;i<=st->frame_size;i++)
       {
          spx_word32_t r, e;
          /* Compute frequency-domain adaptation mask */
-         r = MULT16_32_Q15(leak_estimate,SHL32(st->Yf[i],3));
+         r = MULT16_32_Q15(st->leak_estimate,SHL32(st->Yf[i],3));
          e = SHL32(st->Rf[i],3)+1;
 #ifdef FIXED_POINT
          if (r>SHR32(e,1))
@@ -804,16 +1149,18 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
       /* Temporary adaption rate if filter is not yet adapted enough */
       spx_word16_t adapt_rate=0;
 
-      tmp32 = MULT16_32_Q15(QCONST16(.15f, 15), Sxx);
+      if (Sxx > SHR32(MULT16_16(N, 1000),6)) 
+      {
+         tmp32 = MULT16_32_Q15(QCONST16(.25f, 15), Sxx);
 #ifdef FIXED_POINT
-      if (Sxx > SHR32(See,2))
-         Sxx = SHR32(See,2);
+         if (tmp32 > SHR32(See,2))
+            tmp32 = SHR32(See,2);
 #else
-      if (Sxx > .25*See)
-         Sxx = .25*See;
+         if (tmp32 > .25*See)
+            tmp32 = .25*See;
 #endif
-      adapt_rate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(Sxx, See),15));
-      
+         adapt_rate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32, See),15));
+      }
       for (i=0;i<=st->frame_size;i++)
          st->power_1[i] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate),ADD32(st->power[i],10)),WEIGHT_SHIFT+1);
 
@@ -823,49 +1170,55 @@ void mc_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_
    }
 
    /* FIXME: MC conversion required */ 
-   /* Compute spectrum of estimated echo for use in an echo post-filter (if necessary)*/
-   if (Yout)
-   {
-      spx_word16_t leak2;
       for (i=0;i<st->frame_size;i++)
          st->last_y[i] = st->last_y[st->frame_size+i];
-      if (st->adapted)
-      {
-         /* If the filter is adapted, take the filtered echo */
-         for (i=0;i<st->frame_size;i++)
-            st->last_y[st->frame_size+i] = ref[i]-out[i];
-      } else {
-         /* If filter isn't adapted yet, all we can do is take the echo signal directly */
-         for (i=0;i<st->frame_size;i++)
-            st->last_y[st->frame_size+i] = echo[i];
-      }
-      
-      /* Apply hanning window (should pre-compute it)*/
-      for (i=0;i<N;i++)
-         st->y[i] = MULT16_16_Q15(st->window[i],st->last_y[i]);
+   if (st->adapted)
+   {
+      /* If the filter is adapted, take the filtered echo */
+      for (i=0;i<st->frame_size;i++)
+         st->last_y[st->frame_size+i] = in[i]-out[i];
+   } else {
+      /* If filter isn't adapted yet, all we can do is take the far end signal directly */
+      /* moved earlier: for (i=0;i<N;i++)
+      st->last_y[i] = st->x[i];*/
+   }
+
+}
+
+/* Compute spectrum of estimated echo for use in an echo post-filter */
+void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *residual_echo, int len)
+{
+   int i;
+   spx_word16_t leak2;
+   int N;
+   
+   N = st->window_size;
+
+   /* Apply hanning window (should pre-compute it)*/
+   for (i=0;i<N;i++)
+      st->y[i] = MULT16_16_Q15(st->window[i],st->last_y[i]);
       
-      /* Compute power spectrum of the echo */
-      spx_fft(st->fft_table, st->y, st->Y);
-      power_spectrum(st->Y, st->Yps, N);
+   /* Compute power spectrum of the echo */
+   spx_fft(st->fft_table, st->y, st->Y);
+   power_spectrum(st->Y, residual_echo, N);
       
 #ifdef FIXED_POINT
-      if (leak_estimate > 16383)
-         leak2 = 32767;
-      else
-         leak2 = SHL16(leak_estimate, 1);
+   if (st->leak_estimate > 16383)
+      leak2 = 32767;
+   else
+      leak2 = SHL16(st->leak_estimate, 1);
 #else
-      if (leak_estimate>.5)
-         leak2 = 1;
-      else
-         leak2 = 2*leak_estimate;
+   if (st->leak_estimate>.5)
+      leak2 = 1;
+   else
+      leak2 = 2*st->leak_estimate;
 #endif
-      /* Estimate residual echo */
-      for (i=0;i<=st->frame_size;i++)
-         Yout[i] = (spx_int32_t)MULT16_32_Q15(leak2,st->Yps[i]);
-   }
+   /* Estimate residual echo */
+   for (i=0;i<=st->frame_size;i++)
+      residual_echo[i] = (spx_int32_t)MULT16_32_Q15(leak2,residual_echo[i]);
+   
 }
 
-
 int mc_echo_ctl(SpeexEchoState *st, int request, void *ptr)
 {
    switch(request)
diff --git a/libspeex/misc.c b/libspeex/misc.c
index 53bdd0b..df44d86 100644
--- a/libspeex/misc.c
+++ b/libspeex/misc.c
@@ -63,74 +63,6 @@ long long spx_mips=0;
 #endif
 
 
-spx_uint32_t be_int(spx_uint32_t i)
-{
-   spx_uint32_t ret=i;
-#ifndef WORDS_BIGENDIAN
-   ret =  i>>24;
-   ret += (i>>8)&0x0000ff00;
-   ret += (i<<8)&0x00ff0000;
-   ret += (i<<24);
-#endif
-   return ret;
-}
-
-spx_uint32_t le_int(spx_uint32_t i)
-{
-   spx_uint32_t ret=i;
-#ifdef WORDS_BIGENDIAN
-   ret =  i>>24;
-   ret += (i>>8)&0x0000ff00;
-   ret += (i<<8)&0x00ff0000;
-   ret += (i<<24);
-#endif
-   return ret;
-}
-
-#if BYTES_PER_CHAR == 2
-void speex_memcpy_bytes(char *dst, char *src, int nbytes)
-{
-  int i;
-  int nchars = nbytes/BYTES_PER_CHAR;
-  for (i=0;i<nchars;i++)
-    dst[i]=src[i];
-  if (nbytes & 1) {
-    /* copy in the last byte */
-    int last_i = nchars;
-    char last_dst_char = dst[last_i];
-    char last_src_char = src[last_i];
-    last_dst_char &= 0xff00;
-    last_dst_char |= (last_src_char & 0x00ff);
-    dst[last_i] = last_dst_char;
-  }
-}
-void speex_memset_bytes(char *dst, char c, int nbytes)
-{
-  int i;
-  spx_int16_t cc = ((c << 8) | c);
-  int nchars = nbytes/BYTES_PER_CHAR;
-  for (i=0;i<nchars;i++)
-    dst[i]=cc;
-  if (nbytes & 1) {
-    /* copy in the last byte */
-    int last_i = nchars;
-    char last_dst_char = dst[last_i];
-    last_dst_char &= 0xff00;
-    last_dst_char |= (c & 0x00ff);
-    dst[last_i] = last_dst_char;
-  }
-}
-#else
-void speex_memcpy_bytes(char *dst, char *src, int nbytes)
-{
-  memcpy(dst, src, nbytes);
-}
-void speex_memset_bytes(char *dst, char src, int nbytes)
-{
-  memset(dst, src, nbytes);
-}
-#endif
-
 #ifndef OVERRIDE_SPEEX_ALLOC
 void *speex_alloc (int size)
 {
@@ -176,7 +108,7 @@ void *speex_move (void *dest, void *src, int n)
 #ifndef OVERRIDE_SPEEX_ERROR
 void speex_error(const char *str)
 {
-   fprintf (stderr, "Fatal error: %s\n", str);
+   fprintf (stderr, "Fatal (internal) error: %s\n", str);
    exit(1);
 }
 #endif
@@ -184,14 +116,27 @@ void speex_error(const char *str)
 #ifndef OVERRIDE_SPEEX_WARNING
 void speex_warning(const char *str)
 {
+#ifndef DISABLE_WARNINGS
    fprintf (stderr, "warning: %s\n", str);
+#endif
 }
 #endif
 
 #ifndef OVERRIDE_SPEEX_WARNING_INT
 void speex_warning_int(const char *str, int val)
 {
+#ifndef DISABLE_WARNINGS
    fprintf (stderr, "warning: %s %d\n", str, val);
+#endif
+}
+#endif
+
+#ifndef OVERRIDE_SPEEX_NOTIFY
+void speex_notify(const char *str)
+{
+#ifndef DISABLE_NOTIFICATIONS
+   fprintf (stderr, "notification: %s\n", str);
+#endif
 }
 #endif
 
@@ -201,7 +146,7 @@ spx_word16_t speex_rand(spx_word16_t std, spx_int32_t *seed)
    spx_word32_t res;
    *seed = 1664525 * *seed + 1013904223;
    res = MULT16_16(EXTRACT16(SHR32(*seed,16)),std);
-   return PSHR32(SUB32(res, SHR(res, 3)),14);
+   return EXTRACT16(PSHR32(SUB32(res, SHR32(res, 3)),14));
 }
 #else
 spx_word16_t speex_rand(spx_word16_t std, spx_int32_t *seed)
diff --git a/libspeex/misc.h b/libspeex/misc.h
index 4c70980..c6ea9e7 100644
--- a/libspeex/misc.h
+++ b/libspeex/misc.h
@@ -40,7 +40,7 @@
 #define SPEEX_MINOR_VERSION 1         /**< Minor Speex version. */
 #define SPEEX_MICRO_VERSION 13        /**< Micro Speex version. */
 #define SPEEX_EXTRA_VERSION ""        /**< Extra Speex version. */
-#define SPEEX_VERSION "speex-1.2-beta1"  /**< Speex version string. */
+#define SPEEX_VERSION "speex-1.2beta1"  /**< Speex version string. */
 #endif
 
 /* A couple test to catch stupid option combinations */
@@ -75,10 +75,21 @@
 void print_vec(float *vec, int len, char *name);
 #endif
 
-/** Convert big endian */
-spx_uint32_t be_int(spx_uint32_t i);
 /** Convert little endian */
-spx_uint32_t le_int(spx_uint32_t i);
+static inline spx_int32_t le_int(spx_int32_t i)
+{
+#ifdef WORDS_BIGENDIAN
+   spx_uint32_t ui, ret;
+   ui = i;
+   ret =  ui>>24;
+   ret |= (ui>>8)&0x0000ff00;
+   ret |= (ui<<8)&0x00ff0000;
+   ret |= (ui<<24);
+   return ret;
+#else
+   return i;
+#endif
+}
 
 /** Speex wrapper for calloc. To do your own dynamic allocation, all you need to do is replace this function, speex_realloc and speex_free */
 void *speex_alloc (int size);
@@ -98,21 +109,18 @@ void speex_free_scratch (void *ptr);
 /** Speex wrapper for mem_move */
 void *speex_move (void *dest, void *src, int n);
 
-/** Speex wrapper for memcpy */
-void speex_memcpy_bytes(char *dst, char *src, int nbytes);
-
-/** Speex wrapper for memset */
-void speex_memset_bytes(char *dst, char src, int nbytes);
-
-/** Print error message to stderr */
+/** Abort with an error message to stderr (internal Speex error) */
 void speex_error(const char *str);
 
-/** Print warning message to stderr */
+/** Print warning message to stderr (programming error) */
 void speex_warning(const char *str);
 
 /** Print warning message with integer argument to stderr */
 void speex_warning_int(const char *str, int val);
 
+/** Print notification message to stderr */
+void speex_notify(const char *str);
+
 /** Generate a random number */
 spx_word16_t speex_rand(spx_word16_t std, spx_int32_t *seed);
 
diff --git a/libspeex/modes.c b/libspeex/modes.c
index 97e7d1e..9a1fe9c 100644
--- a/libspeex/modes.c
+++ b/libspeex/modes.c
@@ -495,7 +495,7 @@ static const SpeexSBMode sb_wb_mode = {
 #endif
    .012,   /*lag_factor*/
    QCONST16(.0002,15), /*lpc_floor*/
-   0.9,
+   QCONST16(0.9f,15),
    {NULL, &wb_submode1, &wb_submode2, &wb_submode3, &wb_submode4, NULL, NULL, NULL},
    3,
    {1, 8, 2, 3, 4, 5, 5, 6, 6, 7, 7},
@@ -541,7 +541,7 @@ static const SpeexSBMode sb_uwb_mode = {
 #endif
    .012,   /*lag_factor*/
    QCONST16(.0002,15), /*lpc_floor*/
-   0.7,
+   QCONST16(0.7f,15),
    {NULL, &wb_submode1, NULL, NULL, NULL, NULL, NULL, NULL},
    1,
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
@@ -608,11 +608,7 @@ static const SpeexSubmode nb_48k_submode = {
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_nb_48k,
-#ifdef FIXED_POINT
-   22938, 16384, 11796, 18022,
-#else
-   0.7, 0.5, .36, .55,
-#endif
+   QCONST16(.7,15),
    144
 };
 
@@ -622,7 +618,6 @@ static const SpeexNBMode nb_48k_mode = {
    240,    /*frameSize*/
    48,     /*subframeSize*/
    10,     /*lpcSize*/
-   640,    /*bufSize*/
    17,     /*pitchStart*/
    144,    /*pitchEnd*/
    0.9,    /*gamma1*/
@@ -667,7 +662,7 @@ const SpeexMode * speex_lib_get_mode (int mode)
   if (mode == SPEEX_MODEID_NB_48K) return &speex_nb_48k_mode;
 #endif
 
-  if (mode < 0 || mode > SPEEX_NB_MODES) return NULL;
+  if (mode < 0 || mode >= SPEEX_NB_MODES) return NULL;
 
   return speex_mode_list[mode];
 }
diff --git a/libspeex/modes.h b/libspeex/modes.h
index 6a63240..5bf1971 100644
--- a/libspeex/modes.h
+++ b/libspeex/modes.h
@@ -46,6 +46,23 @@
 #define SB_SUBMODES 8
 #define SB_SUBMODE_BITS 3
 
+/* Used internally, NOT TO BE USED in applications */
+/** Used internally*/
+#define SPEEX_GET_PI_GAIN 100
+/** Used internally*/
+#define SPEEX_GET_EXC     101
+/** Used internally*/
+#define SPEEX_GET_INNOV   102
+/** Used internally*/
+#define SPEEX_GET_DTX_STATUS   103
+/** Used internally*/
+#define SPEEX_SET_INNOVATION_SAVE   104
+/** Used internally*/
+#define SPEEX_SET_WIDEBAND   105
+
+/** Used internally*/
+#define SPEEX_GET_STACK   106
+
 
 /** Quantizes LSPs */
 typedef void (*lsp_quant_func)(spx_lsp_t *, spx_lsp_t *, int, SpeexBits *);
@@ -130,7 +147,7 @@ typedef struct SpeexSBMode {
    spx_word16_t gamma2;   /**< Perceptual filter parameter #1 */
    float   lag_factor;    /**< Lag-windowing parameter */
    spx_word16_t   lpc_floor;     /**< Noise floor for LPC analysis */
-   float   folding_gain;
+   spx_word16_t   folding_gain;
 
    const SpeexSubmode *submodes[SB_SUBMODES]; /**< Sub-mode data for the mode */
    int     defaultSubmode; /**< Default sub-mode to use when encoding */
diff --git a/libspeex/nb_celp.c b/libspeex/nb_celp.c
index feedf71..1828aed 100644
--- a/libspeex/nb_celp.c
+++ b/libspeex/nb_celp.c
@@ -87,14 +87,14 @@ const spx_word16_t exc_gain_quant_scal1[2]={11546, 17224};
 
 #else
 
-const float exc_gain_quant_scal3_bound[7]={0.112338, 0.236980, 0.369316, 0.492054, 0.637471, 0.828874, 1.132784};
-const float exc_gain_quant_scal3[8]={0.061130, 0.163546, 0.310413, 0.428220, 0.555887, 0.719055, 0.938694, 1.326874};
-const float exc_gain_quant_scal1_bound[1]={0.87798};
-const float exc_gain_quant_scal1[2]={0.70469, 1.05127};
+const float exc_gain_quant_scal3_bound[7]={0.112338f, 0.236980f, 0.369316f, 0.492054f, 0.637471f, 0.828874f, 1.132784f};
+const float exc_gain_quant_scal3[8]={0.061130f, 0.163546f, 0.310413f, 0.428220f, 0.555887f, 0.719055f, 0.938694f, 1.326874f};
+const float exc_gain_quant_scal1_bound[1]={0.87798f};
+const float exc_gain_quant_scal1[2]={0.70469f, 1.05127f};
 
-#define LSP_MARGIN .002
-#define LSP_DELTA1 .2
-#define LSP_DELTA2 .05
+#define LSP_MARGIN .002f
+#define LSP_DELTA1 .2f
+#define LSP_DELTA2 .05f
 
 #endif
 
@@ -187,7 +187,7 @@ void *nb_encoder_init(const SpeexMode *m)
    st->mem_exc2 = (spx_mem_t*)speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
 
    st->pi_gain = (spx_word32_t*)speex_alloc((st->nbSubframes)*sizeof(spx_word32_t));
-   st->innov_save = NULL;
+   st->innov_rms_save = NULL;
    
    st->pitch = (int*)speex_alloc((st->nbSubframes)*sizeof(int));
 
@@ -280,6 +280,8 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
    int pitch_half[2];
    int ol_pitch_id=0;
 #endif
+   spx_word32_t ener=0;
+   spx_word16_t fine_gain;
    spx_word16_t *in = (spx_word16_t*)vin;
 
    st=(EncState *)state;
@@ -432,7 +434,7 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
             ol_gain2=ol2;
          ol_gain2 = sqrt(2*ol_gain2*(ol1+ol2))*1.3*(1-.5*GAIN_SCALING_1*GAIN_SCALING_1*ol_pitch_coef*ol_pitch_coef);
       
-         ol_gain=SHR(sqrt(1+ol_gain2/st->frameSize),SIG_SHIFT);
+         ol_gain=SHR32(sqrt(1+ol_gain2/st->frameSize),SIG_SHIFT);
 
       } else
 #endif
@@ -490,7 +492,7 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
       /*  delta_qual*=.1*(3+st->vbr_quality);*/
       if (st->vbr_enabled) 
       {
-         int mode;
+         spx_int32_t mode;
          int choice=0;
          float min_diff=100;
          mode = 8;
@@ -540,7 +542,7 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
          
          if (st->abr_enabled)
          {
-            int bitrate;
+            spx_int32_t bitrate;
             speex_encoder_ctl(state, SPEEX_GET_BITRATE, &bitrate);
             st->abr_drift+=(bitrate-st->abr_enabled);
             st->abr_drift2 = .95*st->abr_drift2 + .05*(bitrate-st->abr_enabled);
@@ -720,7 +722,6 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
       int   offset;
       spx_word16_t *sw;
       spx_word16_t *exc;
-      spx_sig_t *innov_save = NULL;
       int pitch;
       int response_bound = st->subframeSize;
 #ifdef EPIC_48K
@@ -739,9 +740,6 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
       exc=st->exc+offset;
       /* Weighted signal */
       sw=st->sw+offset;
-      /* Pointer for saving innovation */
-      if (st->innov_save)
-         innov_save = st->innov_save+offset;
       
       /* LSP interpolation (quantized and unquantized) */
       lsp_interpolate(st->old_lsp, lsp, interp_lsp, st->lpcSize, sub, st->nbSubframes);
@@ -838,9 +836,9 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
          for (i=0;i<st->lpcSize;i++)
             st->mem_sw[i]=mem[i];
       
-      /* Compute target signal */
+      /* Compute target signal (saturation prevents overflows on clipped input speech) */
       for (i=0;i<st->subframeSize;i++)
-         target[i]=SUB16(sw[i],PSHR32(ringing[i],1));
+         target[i]=EXTRACT16(SATURATE(SUB32(sw[i],PSHR32(ringing[i],1)),32767));
 
       /* Reset excitation */
       for (i=0;i<st->subframeSize;i++)
@@ -901,75 +899,64 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
       }
 
       /* Quantization of innovation */
-      {
-         spx_word32_t ener=0;
-         spx_word16_t fine_gain;
-
-         for (i=0;i<st->subframeSize;i++)
-            innov[i]=0;
-         
-         for (i=0;i<st->subframeSize;i++)
-            real_exc[i] = SUB16(real_exc[i], PSHR32(exc32[i],SIG_SHIFT-1));
-
-         ener = SHL32(EXTEND32(compute_rms16(real_exc, st->subframeSize)),SIG_SHIFT);
-         
-         /*FIXME: Should use DIV32_16 and make sure result fits in 16 bits */
+      for (i=0;i<st->subframeSize;i++)
+         innov[i]=0;
+      
+      /* FIXME: Make sure this is safe from overflows (so far so good) */
+      for (i=0;i<st->subframeSize;i++)
+         real_exc[i] = EXTRACT16(SUB32(EXTEND32(real_exc[i]), PSHR32(exc32[i],SIG_SHIFT-1)));
+      
+      ener = SHL32(EXTEND32(compute_rms16(real_exc, st->subframeSize)),SIG_SHIFT);
+      
+      /*FIXME: Should use DIV32_16 and make sure result fits in 16 bits */
 #ifdef FIXED_POINT
-         {
-            spx_word32_t f = PDIV32(ener,PSHR32(ol_gain,SIG_SHIFT));
-            if (f<=32767)
-               fine_gain = f;
-            else
-               fine_gain = 32767;
-         }
+      {
+         spx_word32_t f = PDIV32(ener,PSHR32(ol_gain,SIG_SHIFT));
+         if (f<=32767)
+            fine_gain = f;
+         else
+            fine_gain = 32767;
+      }
 #else
-         fine_gain = PDIV32_16(ener,PSHR32(ol_gain,SIG_SHIFT));
+      fine_gain = PDIV32_16(ener,PSHR32(ol_gain,SIG_SHIFT));
 #endif
-         /* Calculate gain correction for the sub-frame (if any) */
-         if (SUBMODE(have_subframe_gain)) 
-         {
-            int qe;
-            if (SUBMODE(have_subframe_gain)==3)
-            {
-               qe = scal_quant(fine_gain, exc_gain_quant_scal3_bound, 8);
-               speex_bits_pack(bits, qe, 3);
-               ener=MULT16_32_Q14(exc_gain_quant_scal3[qe],ol_gain);
-            } else {
-               qe = scal_quant(fine_gain, exc_gain_quant_scal1_bound, 2);
-               speex_bits_pack(bits, qe, 1);
-               ener=MULT16_32_Q14(exc_gain_quant_scal1[qe],ol_gain);               
-            }
-         } else {
-            ener=ol_gain;
-         }
-
-         /*printf ("%f %f\n", ener, ol_gain);*/
-
-         /* Normalize innovation */
-         signal_div(target, target, ener, st->subframeSize);
-
-         /* Quantize innovation */
-         if (SUBMODE(innovation_quant))
+      /* Calculate gain correction for the sub-frame (if any) */
+      if (SUBMODE(have_subframe_gain)) 
+      {
+         int qe;
+         if (SUBMODE(have_subframe_gain)==3)
          {
-            /* Codebook search */
-            SUBMODE(innovation_quant)(target, interp_qlpc, bw_lpc1, bw_lpc2, 
-                                      SUBMODE(innovation_params), st->lpcSize, st->subframeSize, 
-                                      innov, syn_resp, bits, stack, st->complexity, SUBMODE(double_codebook));
-            
-            /* De-normalize innovation and update excitation */
-            signal_mul(innov, innov, ener, st->subframeSize);
-
-            for (i=0;i<st->subframeSize;i++)
-               exc[i] = EXTRACT16(PSHR32(ADD32(SHL32(exc32[i],1),innov[i]),SIG_SHIFT));
+            qe = scal_quant(fine_gain, exc_gain_quant_scal3_bound, 8);
+            speex_bits_pack(bits, qe, 3);
+            ener=MULT16_32_Q14(exc_gain_quant_scal3[qe],ol_gain);
          } else {
-            speex_error("No fixed codebook");
+            qe = scal_quant(fine_gain, exc_gain_quant_scal1_bound, 2);
+            speex_bits_pack(bits, qe, 1);
+            ener=MULT16_32_Q14(exc_gain_quant_scal1[qe],ol_gain);               
          }
+      } else {
+         ener=ol_gain;
+      }
+      
+      /*printf ("%f %f\n", ener, ol_gain);*/
+      
+      /* Normalize innovation */
+      signal_div(target, target, ener, st->subframeSize);
+      
+      /* Quantize innovation */
+      if (SUBMODE(innovation_quant))
+      {
+         /* Codebook search */
+         SUBMODE(innovation_quant)(target, interp_qlpc, bw_lpc1, bw_lpc2, 
+                  SUBMODE(innovation_params), st->lpcSize, st->subframeSize, 
+                  innov, syn_resp, bits, stack, st->complexity, SUBMODE(double_codebook));
+         
+         /* De-normalize innovation and update excitation */
+         signal_mul(innov, innov, ener, st->subframeSize);
+         
+         for (i=0;i<st->subframeSize;i++)
+            exc[i] = EXTRACT16(SATURATE32(PSHR32(ADD32(SHL32(exc32[i],1),innov[i]),SIG_SHIFT),32767));
 
-         if (innov_save)
-         {
-            for (i=0;i<st->subframeSize;i++)
-               innov_save[i] = innov[i];
-         }
          /* In some (rare) modes, we do a second search (more bits) to reduce noise even more */
          if (SUBMODE(double_codebook)) {
             char *tmp_stack=stack;
@@ -978,23 +965,26 @@ int nb_encode(void *state, void *vin, SpeexBits *bits)
             for (i=0;i<st->subframeSize;i++)
                innov2[i]=0;
             for (i=0;i<st->subframeSize;i++)
-               target[i]=MULT16_16_P13(QCONST16(2.2,13), target[i]);
+               target[i]=MULT16_16_P13(QCONST16(2.2f,13), target[i]);
             SUBMODE(innovation_quant)(target, interp_qlpc, bw_lpc1, bw_lpc2, 
                                       SUBMODE(innovation_params), st->lpcSize, st->subframeSize, 
                                       innov2, syn_resp, bits, stack, st->complexity, 0);
-            signal_mul(innov2, innov2, MULT16_32_Q15(QCONST16(0.454545,15),ener), st->subframeSize);
+            signal_mul(innov2, innov2, MULT16_32_Q15(QCONST16(0.454545f,15),ener), st->subframeSize);
             for (i=0;i<st->subframeSize;i++)
-               exc[i] = ADD32(exc[i],PSHR32(innov2[i],SIG_SHIFT));
-            if (innov_save)
-            {
-               for (i=0;i<st->subframeSize;i++)
-                  innov_save[i] = ADD32(innov_save[i],innov2[i]);
-            }
+               innov[i] = ADD32(innov[i],innov2[i]);
             stack = tmp_stack;
          }
-
+         for (i=0;i<st->subframeSize;i++)
+            exc[i] = EXTRACT16(SATURATE32(PSHR32(ADD32(SHL32(exc32[i],1),innov[i]),SIG_SHIFT),32767));
+         if (st->innov_rms_save)
+         {
+            st->innov_rms_save[sub] = compute_rms(innov, st->subframeSize);
+         }
+      } else {
+         speex_error("No fixed codebook");
       }
 
+
       for (i=0;i<st->subframeSize;i++)
          sw[i] = exc[i];
       /* Final signal synthesis from excitation */
@@ -1145,7 +1135,7 @@ const spx_word16_t attenuation[10] = {1., 0.961, 0.852, 0.698, 0.527, 0.368, 0.2
 
 static void nb_decode_lost(DecState *st, spx_word16_t *out, char *stack)
 {
-   int i, sub;
+   int i;
    int pitch_val;
    spx_word16_t pitch_gain;
    spx_word16_t fact;
@@ -1166,7 +1156,7 @@ static void nb_decode_lost(DecState *st, spx_word16_t *out, char *stack)
    pitch_gain = st->last_pitch_gain;
    if (pitch_gain>54)
       pitch_gain = 54;
-   pitch_gain = SHL(pitch_gain, 9);
+   pitch_gain = SHL16(pitch_gain, 9);
 #else   
    pitch_gain = GAIN_SCALING_1*st->last_pitch_gain;
    if (pitch_gain>.85)
@@ -1200,7 +1190,7 @@ static void nb_decode_lost(DecState *st, spx_word16_t *out, char *stack)
    
    st->first = 0;
    st->count_lost++;
-   st->pitch_gain_buf[st->pitch_gain_buf_idx++] = PSHR(pitch_gain,9);
+   st->pitch_gain_buf[st->pitch_gain_buf_idx++] = PSHR16(pitch_gain,9);
    if (st->pitch_gain_buf_idx > 2) /* rollover */
       st->pitch_gain_buf_idx = 0;
 }
@@ -1226,7 +1216,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
    VARDECL(spx_lsp_t *qlsp);
    spx_word16_t pitch_average=0;
 #ifdef EPIC_48K
-   int pitch_half[2];
+   int pitch_half[2] = {0, 0};
    int ol_pitch_id=0;
 #endif
    spx_word16_t *out = (spx_word16_t*)vout;
@@ -1267,7 +1257,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
             speex_mode_query(&speex_wb_mode, SPEEX_SUBMODE_BITS_PER_FRAME, &advance);
             if (advance < 0)
             {
-               speex_warning ("Invalid wideband mode encountered. Corrupted stream?");
+               speex_notify("Invalid mode encountered. The stream is corrupted.");
                return -2;
             } 
             advance -= (SB_SUBMODE_BITS+1);
@@ -1282,7 +1272,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
                speex_mode_query(&speex_wb_mode, SPEEX_SUBMODE_BITS_PER_FRAME, &advance);
                if (advance < 0)
                {
-                  speex_warning ("Invalid wideband mode encountered: corrupted stream?");
+                  speex_notify("Invalid mode encountered. The stream is corrupted.");
                   return -2;
                } 
                advance -= (SB_SUBMODE_BITS+1);
@@ -1290,7 +1280,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
                wideband = speex_bits_unpack_unsigned(bits, 1);
                if (wideband)
                {
-                  speex_warning ("More than two wideband layers found: corrupted stream?");
+                  speex_notify("More than two wideband layers found. The stream is corrupted.");
                   return -2;
                }
 
@@ -1315,7 +1305,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
                return ret;
          } else if (m>8) /* Invalid mode */
          {
-            speex_warning("Invalid mode encountered: corrupted stream?");
+            speex_notify("Invalid mode encountered. The stream is corrupted.");
             return -2;
          }
       
@@ -1338,7 +1328,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
    {
       VARDECL(spx_coef_t *lpc);
       ALLOC(lpc, st->lpcSize, spx_coef_t);
-      bw_lpc(GAMMA_SCALING*.93, st->interp_qlpc, lpc, st->lpcSize);
+      bw_lpc(QCONST16(0.93f,15), st->interp_qlpc, lpc, st->lpcSize);
       {
          float innov_gain=0;
          float pgain=GAIN_SCALING_1*st->last_pitch_gain;
@@ -1426,6 +1416,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
       int qe;
       qe = speex_bits_unpack_unsigned(bits, 5);
 #ifdef FIXED_POINT
+      /* FIXME: Perhaps we could slightly lower the gain here when the output is going to saturate? */
       ol_gain = MULT16_32_Q15(28406,ol_gain_table[qe]);
 #else
       ol_gain = SIG_SCALING*exp(qe/3.5);
@@ -1458,7 +1449,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
       int offset;
       spx_word16_t *exc;
       spx_word16_t *sp;
-      spx_sig_t *innov_save = NULL;
+      spx_word16_t *innov_save = NULL;
       spx_word16_t tmp;
 
 #ifdef EPIC_48K
@@ -1535,7 +1526,11 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
 #ifdef EPIC_48K
          }
 #endif
-
+         /* Ensuring that things aren't blowing up as would happen if e.g. an encoder is 
+         crafting packets to make us produce NaNs and slow down the decoder (vague DoS threat).
+         We can probably be even more aggressive and limit to 15000 or so. */
+         sanitize_values32(exc32, NEG32(QCONST32(32000,SIG_SHIFT-1)), QCONST32(32000,SIG_SHIFT-1), st->subframeSize);
+         
          tmp = gain_3tap_to_1tap(pitch_gain);
 
          pitch_average += tmp;
@@ -1576,16 +1571,38 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
          {
             /*Fixed codebook contribution*/
             SUBMODE(innovation_unquant)(innov, SUBMODE(innovation_params), st->subframeSize, bits, stack, &st->seed);
+            /* De-normalize innovation and update excitation */
+#ifdef FIXED_POINT
+            signal_mul(innov, innov, ener, st->subframeSize);
+#else
+            signal_mul(innov, innov, ener, st->subframeSize);
+#endif
+            /* Decode second codebook (only for some modes) */
+            if (SUBMODE(double_codebook))
+            {
+               char *tmp_stack=stack;
+               VARDECL(spx_sig_t *innov2);
+               ALLOC(innov2, st->subframeSize, spx_sig_t);
+               for (i=0;i<st->subframeSize;i++)
+                  innov2[i]=0;
+               SUBMODE(innovation_unquant)(innov2, SUBMODE(innovation_params), st->subframeSize, bits, stack, &st->seed);
+               signal_mul(innov2, innov2, MULT16_32_Q15(QCONST16(0.454545f,15),ener), st->subframeSize);
+               for (i=0;i<st->subframeSize;i++)
+                  innov[i] = ADD32(innov[i], innov2[i]);
+               stack = tmp_stack;
+            }
+            for (i=0;i<st->subframeSize;i++)
+               exc[i]=EXTRACT16(SATURATE32(PSHR32(ADD32(SHL32(exc32[i],1),innov[i]),SIG_SHIFT),32767));
+            /*print_vec(exc, 40, "innov");*/
+            if (innov_save)
+            {
+               for (i=0;i<st->subframeSize;i++)
+                  innov_save[i] = EXTRACT16(PSHR32(innov[i], SIG_SHIFT));
+            }
          } else {
             speex_error("No fixed codebook");
          }
 
-         /* De-normalize innovation and update excitation */
-#ifdef FIXED_POINT
-         signal_mul(innov, innov, ener, st->subframeSize);
-#else
-         signal_mul(innov, innov, ener, st->subframeSize);
-#endif
          /*Vocoder mode*/
          if (st->submodeID==1) 
          {
@@ -1617,35 +1634,8 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
                st->voc_mean = .95*st->voc_mean + .05*exc[i];
                exc[i]-=st->voc_mean;
             }
-         } else {
-            for (i=0;i<st->subframeSize;i++)
-               exc[i]=PSHR32(ADD32(SHL32(exc32[i],1),innov[i]),SIG_SHIFT);
-            /*print_vec(exc, 40, "innov");*/
-         }
-         if (innov_save)
-         {
-            for (i=0;i<st->subframeSize;i++)
-               innov_save[i] = innov[i];
-         }
-         /* Decode second codebook (only for some modes) */
-         if (SUBMODE(double_codebook))
-         {
-            char *tmp_stack=stack;
-            VARDECL(spx_sig_t *innov2);
-            ALLOC(innov2, st->subframeSize, spx_sig_t);
-            for (i=0;i<st->subframeSize;i++)
-               innov2[i]=0;
-            SUBMODE(innovation_unquant)(innov2, SUBMODE(innovation_params), st->subframeSize, bits, stack, &st->seed);
-            signal_mul(innov2, innov2, MULT16_32_Q15(QCONST16(0.454545,15),ener), st->subframeSize);
-            for (i=0;i<st->subframeSize;i++)
-               exc[i] = ADD16(exc[i],PSHR32(innov2[i],SIG_SHIFT));
-            if (innov_save)
-            {
-               for (i=0;i<st->subframeSize;i++)
-                  innov_save[i] = ADD32(innov_save[i],innov2[i]);
-            }
-            stack = tmp_stack;
          }
+
       }
    }
    
@@ -1712,7 +1702,7 @@ int nb_decode(void *state, SpeexBits *bits, void *vout)
          for (i=0;i<st->lpcSize;i+=2)
          {
             /*pi_g += -st->interp_qlpc[i] +  st->interp_qlpc[i+1];*/
-            pi_g = ADD32(pi_g, SUB32(EXTEND32(st->interp_qlpc[i+1]),EXTEND32(st->interp_qlpc[i])));
+            pi_g = ADD32(pi_g, SUB32(EXTEND32(ak[i+1]),EXTEND32(ak[i])));
          }
          st->pi_gain[sub] = pi_g;
       }
@@ -1759,40 +1749,40 @@ int nb_encoder_ctl(void *state, int request, void *ptr)
    switch(request)
    {
    case SPEEX_GET_FRAME_SIZE:
-      (*(int*)ptr) = st->frameSize;
+      (*(spx_int32_t*)ptr) = st->frameSize;
       break;
    case SPEEX_SET_LOW_MODE:
    case SPEEX_SET_MODE:
-      st->submodeSelect = st->submodeID = (*(int*)ptr);
+      st->submodeSelect = st->submodeID = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_LOW_MODE:
    case SPEEX_GET_MODE:
-      (*(int*)ptr) = st->submodeID;
+      (*(spx_int32_t*)ptr) = st->submodeID;
       break;
    case SPEEX_SET_VBR:
-      st->vbr_enabled = (*(int*)ptr);
+      st->vbr_enabled = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_VBR:
-      (*(int*)ptr) = st->vbr_enabled;
+      (*(spx_int32_t*)ptr) = st->vbr_enabled;
       break;
    case SPEEX_SET_VAD:
-      st->vad_enabled = (*(int*)ptr);
+      st->vad_enabled = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_VAD:
-      (*(int*)ptr) = st->vad_enabled;
+      (*(spx_int32_t*)ptr) = st->vad_enabled;
       break;
    case SPEEX_SET_DTX:
-      st->dtx_enabled = (*(int*)ptr);
+      st->dtx_enabled = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_DTX:
-      (*(int*)ptr) = st->dtx_enabled;
+      (*(spx_int32_t*)ptr) = st->dtx_enabled;
       break;
    case SPEEX_SET_ABR:
       st->abr_enabled = (*(spx_int32_t*)ptr);
       st->vbr_enabled = st->abr_enabled!=0;
       if (st->vbr_enabled) 
       {
-         int i=10;
+         spx_int32_t i=10;
          spx_int32_t rate, target;
          float vbr_qual;
          target = (*(spx_int32_t*)ptr);
@@ -1825,7 +1815,7 @@ int nb_encoder_ctl(void *state, int request, void *ptr)
       break;
    case SPEEX_SET_QUALITY:
       {
-         int quality = (*(int*)ptr);
+         int quality = (*(spx_int32_t*)ptr);
          if (quality < 0)
             quality = 0;
          if (quality > 10)
@@ -1834,7 +1824,7 @@ int nb_encoder_ctl(void *state, int request, void *ptr)
       }
       break;
    case SPEEX_SET_COMPLEXITY:
-      st->complexity = (*(int*)ptr);
+      st->complexity = (*(spx_int32_t*)ptr);
       if (st->complexity<0)
          st->complexity=0;
       break;
@@ -1843,7 +1833,7 @@ int nb_encoder_ctl(void *state, int request, void *ptr)
       break;
    case SPEEX_SET_BITRATE:
       {
-         int i=10;
+         spx_int32_t i=10;
          spx_int32_t rate, target;
          target = (*(spx_int32_t*)ptr);
          while (i>=0)
@@ -1884,21 +1874,21 @@ int nb_encoder_ctl(void *state, int request, void *ptr)
       }
       break;
    case SPEEX_SET_SUBMODE_ENCODING:
-      st->encode_submode = (*(int*)ptr);
+      st->encode_submode = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_SUBMODE_ENCODING:
-      (*(int*)ptr) = st->encode_submode;
+      (*(spx_int32_t*)ptr) = st->encode_submode;
       break;
    case SPEEX_GET_LOOKAHEAD:
-      (*(int*)ptr)=(st->windowSize-st->frameSize);
+      (*(spx_int32_t*)ptr)=(st->windowSize-st->frameSize);
       break;
    case SPEEX_SET_PLC_TUNING:
-      st->plc_tuning = (*(int*)ptr);
+      st->plc_tuning = (*(spx_int32_t*)ptr);
       if (st->plc_tuning>100)
          st->plc_tuning=100;
       break;
    case SPEEX_GET_PLC_TUNING:
-      (*(int*)ptr)=(st->plc_tuning);
+      (*(spx_int32_t*)ptr)=(st->plc_tuning);
       break;
    case SPEEX_SET_VBR_MAX_BITRATE:
       st->vbr_max = (*(spx_int32_t*)ptr);
@@ -1925,19 +1915,21 @@ int nb_encoder_ctl(void *state, int request, void *ptr)
    case SPEEX_GET_EXC:
       {
          int i;
-         spx_word16_t *e = (spx_word16_t*)ptr;
-         for (i=0;i<st->frameSize;i++)
-            e[i]=st->exc[i];
+         for (i=0;i<st->nbSubframes;i++)
+            ((spx_word16_t*)ptr)[i] = compute_rms16(st->exc+i*st->subframeSize, st->subframeSize);
       }
       break;
    case SPEEX_GET_RELATIVE_QUALITY:
       (*(float*)ptr)=st->relative_quality;
       break;
    case SPEEX_SET_INNOVATION_SAVE:
-      st->innov_save = (spx_sig_t*)ptr;
+      st->innov_rms_save = (spx_word16_t*)ptr;
       break;
    case SPEEX_SET_WIDEBAND:
-      st->isWideband = *((int*)ptr);
+      st->isWideband = *((spx_int32_t*)ptr);
+      break;
+   case SPEEX_GET_STACK:
+      *((char**)ptr) = st->stack;
       break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
@@ -1954,20 +1946,20 @@ int nb_decoder_ctl(void *state, int request, void *ptr)
    {
    case SPEEX_SET_LOW_MODE:
    case SPEEX_SET_MODE:
-      st->submodeID = (*(int*)ptr);
+      st->submodeID = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_LOW_MODE:
    case SPEEX_GET_MODE:
-      (*(int*)ptr) = st->submodeID;
+      (*(spx_int32_t*)ptr) = st->submodeID;
       break;
    case SPEEX_SET_ENH:
-      st->lpc_enh_enabled = *((int*)ptr);
+      st->lpc_enh_enabled = *((spx_int32_t*)ptr);
       break;
    case SPEEX_GET_ENH:
-      *((int*)ptr) = st->lpc_enh_enabled;
+      *((spx_int32_t*)ptr) = st->lpc_enh_enabled;
       break;
    case SPEEX_GET_FRAME_SIZE:
-      (*(int*)ptr) = st->frameSize;
+      (*(spx_int32_t*)ptr) = st->frameSize;
       break;
    case SPEEX_GET_BITRATE:
       if (st->submodes[st->submodeID])
@@ -2007,13 +1999,13 @@ int nb_decoder_ctl(void *state, int request, void *ptr)
       }
       break;
    case SPEEX_SET_SUBMODE_ENCODING:
-      st->encode_submode = (*(int*)ptr);
+      st->encode_submode = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_SUBMODE_ENCODING:
-      (*(int*)ptr) = st->encode_submode;
+      (*(spx_int32_t*)ptr) = st->encode_submode;
       break;
    case SPEEX_GET_LOOKAHEAD:
-      (*(int*)ptr)=st->subframeSize;
+      (*(spx_int32_t*)ptr)=st->subframeSize;
       break;
    case SPEEX_SET_HIGHPASS:
       st->highpass_enabled = (*(spx_int32_t*)ptr);
@@ -2033,19 +2025,21 @@ int nb_decoder_ctl(void *state, int request, void *ptr)
    case SPEEX_GET_EXC:
       {
          int i;
-         spx_word16_t *e = (spx_word16_t*)ptr;
-         for (i=0;i<st->frameSize;i++)
-            e[i]=st->exc[i];
+         for (i=0;i<st->nbSubframes;i++)
+            ((spx_word16_t*)ptr)[i] = compute_rms16(st->exc+i*st->subframeSize, st->subframeSize);
       }
       break;
    case SPEEX_GET_DTX_STATUS:
-      *((int*)ptr) = st->dtx_enabled;
+      *((spx_int32_t*)ptr) = st->dtx_enabled;
       break;
    case SPEEX_SET_INNOVATION_SAVE:
-      st->innov_save = (spx_sig_t*)ptr;
+      st->innov_save = (spx_word16_t*)ptr;
       break;
    case SPEEX_SET_WIDEBAND:
-      st->isWideband = *((int*)ptr);
+      st->isWideband = *((spx_int32_t*)ptr);
+      break;
+   case SPEEX_GET_STACK:
+      *((char**)ptr) = st->stack;
       break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
diff --git a/libspeex/nb_celp.h b/libspeex/nb_celp.h
index a72c2b1..1ebf717 100644
--- a/libspeex/nb_celp.h
+++ b/libspeex/nb_celp.h
@@ -96,12 +96,12 @@ typedef struct EncState {
    spx_mem_t *mem_exc2;          /**< Filter memory for excitation (whole frame) */
    spx_mem_t mem_hp[2];          /**< High-pass filter memory */
    spx_word32_t *pi_gain;        /**< Gain of LPC filter at theta=pi (fe/2) */
-   spx_sig_t *innov_save;        /**< If non-NULL, innovation is copied here */
+   spx_word16_t *innov_rms_save; /**< If non-NULL, innovation RMS is copied here */
          
    VBRState *vbr;                /**< State of the VBR data */
    float  vbr_quality;           /**< Quality setting for VBR encoding */
    float  relative_quality;      /**< Relative quality that will be needed by VBR */
-   int    vbr_enabled;           /**< 1 for enabling VBR, 0 otherwise */
+   spx_int32_t vbr_enabled;      /**< 1 for enabling VBR, 0 otherwise */
    spx_int32_t vbr_max;          /**< Max bit-rate allowed in VBR mode */
    int    vad_enabled;           /**< 1 for enabling VAD, 0 otherwise */
    int    dtx_enabled;           /**< 1 for enabling DTX, 0 otherwise */
@@ -148,7 +148,7 @@ typedef struct DecState {
    spx_mem_t *mem_sp;           /**< Filter memory for synthesis signal */
    spx_mem_t mem_hp[2];         /**< High-pass filter memory */
    spx_word32_t *pi_gain;       /**< Gain of LPC filter at theta=pi (fe/2) */
-   spx_sig_t *innov_save;       /** If non-NULL, innovation is copied here */
+   spx_word16_t *innov_save;       /**< If non-NULL, innovation is copied here */
    
    /* This is used in packet loss concealment */
    int    last_pitch;           /**< Pitch of last correctly decoded frame */
diff --git a/libspeex/preprocess.c b/libspeex/preprocess.c
index 02724f2..b5caf4b 100644
--- a/libspeex/preprocess.c
+++ b/libspeex/preprocess.c
@@ -1,6 +1,6 @@
-/* Copyright (C) 2003 Epic Games 
-   Written by Jean-Marc Valin
-
+/* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin)
+   Copyright (C) 2004-2006 Epic Games 
+   
    File: preprocess.c
    Preprocessor with denoising based on the algorithm by Ephraim and Malah
 
@@ -31,96 +31,369 @@
    POSSIBILITY OF SUCH DAMAGE.
 */
 
+
+/*
+   Recommended papers:
+   
+   Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
+   short-time spectral amplitude estimator". IEEE Transactions on Acoustics, 
+   Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984.
+   
+   Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
+   log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and 
+   Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985.
+   
+   I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments".
+   Signal Processing, vol. 81, no. 11, pp. 2403-2418, 2001.
+
+   Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic 
+   approach to combined acoustic echo cancellation and noise reduction". IEEE 
+   Transactions on Speech and Audio Processing, 2002.
+   
+   J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation
+   of simultaneous non-stationary sources". In Proceedings IEEE International 
+   Conference on Acoustics, Speech, and Signal Processing, 2004.
+*/
+
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 
 #include <math.h>
 #include "speex/speex_preprocess.h"
+#include "speex/speex_echo.h"
 #include "misc.h"
-#include "smallft.h"
-
-#define max(a,b) ((a) > (b) ? (a) : (b))
-#define min(a,b) ((a) < (b) ? (a) : (b))
+#include "fftwrap.h"
+#include "filterbank.h"
+#include "math_approx.h"
 
 #ifndef M_PI
 #define M_PI 3.14159263
 #endif
 
-#define SQRT_M_PI_2 0.88623
-#define LOUDNESS_EXP 2.5
+#define LOUDNESS_EXP 5.f
+#define AMP_SCALE .001f
+#define AMP_SCALE_1 1000.f
+      
+#define NB_BANDS 24
+
+#define SPEECH_PROB_START_DEFAULT       QCONST16(0.35f,15)
+#define SPEECH_PROB_CONTINUE_DEFAULT    QCONST16(0.20f,15)
+#define NOISE_SUPPRESS_DEFAULT       -15
+#define ECHO_SUPPRESS_DEFAULT        -40
+#define ECHO_SUPPRESS_ACTIVE_DEFAULT -15
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define SQR(x) ((x)*(x))
+#define SQR16(x) (MULT16_16((x),(x)))
+#define SQR16_Q15(x) (MULT16_16_Q15((x),(x)))
+
+#ifdef FIXED_POINT
+static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b) /* Fixed-point a/b returned with 8 fractional bits (Q8), saturated to 32767 */
+{
+   if (SHR32(a,7) >= b) /* a/b >= 128: the Q8 quotient would not fit in 16 bits, so saturate */
+   {
+      return 32767;
+   } else {
+      if (b>=QCONST32(1,23)) /* The three tests below shift a and b right together (ratio preserved, low bits dropped) */
+      {
+         a = SHR32(a,8);
+         b = SHR32(b,8);
+      }
+      if (b>=QCONST32(1,19)) /* Assuming QCONST32(1,n) is 1<<n, b ends up below 1<<15 after the last test */
+      {
+         a = SHR32(a,4);
+         b = SHR32(b,4);
+      }
+      if (b>=QCONST32(1,15)) /* Presumably needed so that b is a valid 16-bit divisor -- confirm against PDIV32_16 */
+      {
+         a = SHR32(a,4);
+         b = SHR32(b,4);
+      }
+      a = SHL32(a,8); /* Scale the numerator so the quotient carries 8 fractional bits */
+      return PDIV32_16(a,b); /* PDIV32_16 is defined elsewhere; presumably a rounding 32/16 division -- confirm */
+   }
+   
+}
+static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b) /* Fixed-point a/b returned in Q15 (scaled by 32767), saturated to 32767 */
+{
+   if (SHR32(a,15) >= b) /* Numerator too large for the quotient to fit in 16 bits: saturate */
+   {
+      return 32767;
+   } else {
+      if (b>=QCONST32(1,23)) /* The three tests below shift a and b right together (ratio preserved, low bits dropped) */
+      {
+         a = SHR32(a,8);
+         b = SHR32(b,8);
+      }
+      if (b>=QCONST32(1,19)) /* Assuming QCONST32(1,n) is 1<<n, b ends up below 1<<15 after the last test */
+      {
+         a = SHR32(a,4);
+         b = SHR32(b,4);
+      }
+      if (b>=QCONST32(1,15)) /* Presumably needed so that b is a valid 16-bit divisor -- confirm against DIV32_16 */
+      {
+         a = SHR32(a,4);
+         b = SHR32(b,4);
+      }
+      a = SHL32(a,15)-a; /* a*(2^15-1) == a*32767, so the quotient is scaled by 32767 rather than 32768 */
+      return DIV32_16(a,b); /* DIV32_16 is defined elsewhere; presumably a truncating 32/16 division -- confirm */
+   }
+}
+#define SNR_SCALING 256.f
+#define SNR_SCALING_1 0.0039062f
+#define SNR_SHIFT 8
+
+#define FRAC_SCALING 32767.f
+#define FRAC_SCALING_1 3.0518e-05
+#define FRAC_SHIFT 1
+
+#define EXPIN_SCALING 2048.f
+#define EXPIN_SCALING_1 0.00048828f
+#define EXPIN_SHIFT 11
+#define EXPOUT_SCALING_1 1.5259e-05
+
+#define NOISE_SHIFT 7
 
-#define NB_BANDS 8
+#else
+
+#define DIV32_16_Q8(a,b) ((a)/(b))
+#define DIV32_16_Q15(a,b) ((a)/(b))
+#define SNR_SCALING 1.f
+#define SNR_SCALING_1 1.f
+#define SNR_SHIFT 0
+#define FRAC_SCALING 1.f
+#define FRAC_SCALING_1 1.f
+#define FRAC_SHIFT 0
+#define NOISE_SHIFT 0
 
-#define SPEEX_PROB_START_DEFAULT    0.35f
-#define SPEEX_PROB_CONTINUE_DEFAULT 0.20f
+#define EXPIN_SCALING 1.f
+#define EXPIN_SCALING_1 1.f
+#define EXPOUT_SCALING_1 1.f
 
-#define ZMIN .1
-#define ZMAX .316
-#define ZMIN_1 10
-#define LOG_MIN_MAX_1 0.86859
+#endif
 
-static void conj_window(float *w, int len)
+/** Speex pre-processor state. */
+struct SpeexPreprocessState_ {
+   /* Basic info */
+   int    frame_size;        /**< Number of samples processed each time */
+   int    ps_size;           /**< Number of points in the power spectrum */
+   int    sampling_rate;     /**< Sampling rate of the input/output */
+   int    nbands;            /**< Number of filterbank bands (NB_BANDS at init) */
+   FilterBank *bank;         /**< Filterbank created with filterbank_new() at init */
+   
+   /* Parameters */
+   int    denoise_enabled;   /**< Denoising on/off flag; 1 at init */
+   int    vad_enabled;       /**< VAD on/off flag; 0 at init */
+   int    dereverb_enabled;  /**< Dereverb on/off flag; 0 at init */
+   spx_word16_t  reverb_decay; /**< Reverb decay parameter; 0 at init */
+   spx_word16_t  reverb_level; /**< Reverb level parameter; 0 at init */
+   spx_word16_t speech_prob_start; /**< Q15 speech probability threshold; default 0.35 (SPEECH_PROB_START_DEFAULT) */
+   spx_word16_t speech_prob_continue; /**< Q15 speech probability threshold; default 0.20 (SPEECH_PROB_CONTINUE_DEFAULT) */
+   int    noise_suppress;    /**< Noise gain floor in dB, converted by compute_gain_floor(); default -15 */
+   int    echo_suppress;     /**< Echo suppression setting; default -40 (presumably dB, like noise_suppress) */
+   int    echo_suppress_active; /**< Echo suppression setting; default -15 (presumably used when near end is active -- confirm) */
+   SpeexEchoState *echo_state; /**< NULL at init; presumably the associated echo canceller, if any */
+   
+   /* DSP-related arrays */
+   spx_word16_t *frame;      /**< Processing frame (2*ps_size) */
+   spx_word16_t *ft;         /**< Processing frame in freq domain (2*ps_size) */
+   spx_word32_t *ps;         /**< Current power spectrum */
+   spx_word16_t *gain2;      /**< Adjusted gains */
+   spx_word16_t *gain_floor; /**< Minimum gain allowed */
+   spx_word16_t *window;     /**< Analysis/Synthesis window */
+   spx_word32_t *noise;      /**< Noise estimate */
+   spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */
+   spx_word32_t *old_ps;     /**< Power spectrum for last frame */
+   spx_word16_t *gain;       /**< Ephraim Malah gain */
+   spx_word16_t *prior;      /**< A-priori SNR */
+   spx_word16_t *post;       /**< A-posteriori SNR */
+
+   spx_word32_t *S;          /**< Smoothed power spectrum */
+   spx_word32_t *Smin;       /**< See Cohen paper */
+   spx_word32_t *Stmp;       /**< See Cohen paper */
+   int *update_prob;       /**< Probability of speech presence for noise update */
+
+   spx_word16_t *zeta;       /**< Smoothed a priori SNR */
+   spx_word32_t *echo_noise; /**< Echo-related estimate (presumably a power spectrum, like noise -- confirm) */
+   spx_word32_t *residual_echo; /**< Residual echo estimate (presumably a power spectrum, like noise -- confirm) */
+
+   /* Misc */
+   spx_word16_t *inbuf;      /**< Input buffer (overlapped analysis) */
+   spx_word16_t *outbuf;     /**< Output buffer (for overlap and add) */
+
+   /* AGC stuff, only for floating point for now */
+#ifndef FIXED_POINT
+   int    agc_enabled;       /**< AGC on/off flag; 0 at init */
+   float  agc_level;         /**< AGC target level; 8000 at init */
+   float  loudness_accum;    /**< Running weight of the loudness updates; normalises loudness in the AGC gain */
+   float *loudness_weight;   /**< Perceptual loudness curve */
+   float  loudness;          /**< Loudness estimate */
+   float  agc_gain;          /**< Current AGC gain */
+   int    nb_loudness_adapt; /**< Number of frames used for loudness adaptation so far */
+   float  max_gain;          /**< Maximum gain allowed */
+   float  max_increase_step; /**< Maximum increase in gain from one frame to another */
+   float  max_decrease_step; /**< Maximum decrease in gain from one frame to another */
+   float  prev_loudness;     /**< Loudness of previous frame */
+   float  init_max;          /**< Current gain limit during initialisation */
+#endif
+   int    nb_adapt;          /**< Number of frames used for adaptation so far */
+   int    was_speech;        /**< 0 at init; presumably the speech decision of the previous frame */
+   int    min_count;         /**< Number of frames processed so far */
+   void  *fft_lookup;        /**< Lookup table for the FFT */
+#ifdef FIXED_POINT
+   int    frame_shift;       /**< Left shift applied to the windowed frame before the FFT */
+#endif
+};
+
+
+static void conj_window(spx_word16_t *w, int len)
 {
    int i;
    for (i=0;i<len;i++)
    {
-      float x=4*((float)i)/len;
+      spx_word16_t tmp;
+#ifdef FIXED_POINT
+      spx_word16_t x = DIV32_16(MULT16_16(32767,i),len);
+#else      
+      spx_word16_t x = DIV32_16(MULT16_16(QCONST16(4.f,13),i),len);
+#endif
       int inv=0;
-      if (x<1)
+      if (x<QCONST16(1.f,13))
       {
-      } else if (x<2)
+      } else if (x<QCONST16(2.f,13))
       {
-         x=2-x;
+         x=QCONST16(2.f,13)-x;
          inv=1;
-      } else if (x<3)
+      } else if (x<QCONST16(3.f,13))
       {
-         x=x-2;
+         x=x-QCONST16(2.f,13);
          inv=1;
       } else {
-         x=4-x;
+         x=QCONST16(2.f,13)-x+QCONST16(2.f,13); /* 4 - x */
       }
-      x*=1.9979;
-      w[i]=(.5-.5*cos(x))*(.5-.5*cos(x));
+      x = MULT16_16_Q14(QCONST16(1.271903f,14), x);
+      tmp = SQR16_Q15(QCONST16(.5f,15)-MULT16_16_P15(QCONST16(.5f,15),spx_cos_norm(QCONST32(x,2))));
       if (inv)
-         w[i]=1-w[i];
-      w[i]=sqrt(w[i]);
+         tmp=SUB16(Q15_ONE,tmp);
+      w[i]=spx_sqrt(SHL32(EXTEND32(tmp),15));
    }
 }
 
+      
+#ifdef FIXED_POINT
 /* This function approximates the gain function 
    y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)  
    which multiplied by xi/(1+xi) is the optimal gain
    in the loudness domain ( sqrt[amplitude] )
+   Input in Q11 format, output in Q15
 */
-static inline float hypergeom_gain(float x)
+static inline spx_word32_t hypergeom_gain(spx_word32_t xx) /* Fixed-point version: xx is x in Q11, result is in Q15 */
+{
+   int ind;
+   spx_word16_t frac;
+   /* Q13 table, indexed by floor(2*x); same values as the floating-point table scaled by 8192 */
+   static const spx_word16_t table[21] = {
+       6730,  8357,  9868, 11267, 12563, 13770, 14898,
+      15959, 16961, 17911, 18816, 19682, 20512, 21311,
+      22082, 22827, 23549, 24250, 24931, 25594, 26241};
+      ind = SHR32(xx,10); /* floor(2*x) for a Q11 input */
+      if (ind<0)
+         return Q15_ONE;
+      if (ind>19) /* Beyond the table: 1 + .1296/x, as in the floating-point version */
+         return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT))));
+      frac = SHL32(xx-SHL32(ind,10),5); /* Fractional part of 2*x, rescaled from 10 bits to Q15 */
+      return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7); /* Linear interpolation between adjacent table entries, divided by a square-root term in xx */
+}
+
+static inline spx_word16_t qcurve(spx_word16_t x) /* Fixed-point 1/(1+.15/x): x scaled by 256 (SNR_SHIFT), result scaled by 32767 */
+{
+   x = MAX16(x, 1); /* Avoid dividing by zero below */
+   return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x)))); /* 32767*512/(512 + .6*32767/x) == 32767/(1 + ~.15/(x/256)) */
+}
+
+/* Compute the per-bin gain floor from separate suppression floors (in dB) for the background noise and the residual echo; writes len entries of gain_floor */
+static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
+{
+   int i;
+   
+   if (noise_suppress > effective_echo_suppress) /* Factor out the larger of the two floors so that gain_ratio stays <= 1 */
+   {
+      spx_word16_t noise_gain, gain_ratio;
+      noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1))); /* 0.11513 ~= ln(10)/20: dB -> linear amplitude gain, capped at Q15_ONE */
+      gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1))); /* 0.2302585 ~= ln(10)/10: dB difference -> linear power ratio */
+
+      /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ], with noise_floor taken out of the sqrt as noise_gain */
+      for (i=0;i<len;i++)
+         gain_floor[i] = MULT16_16_Q15(noise_gain,
+                                       spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(PSHR32(noise[i],NOISE_SHIFT) + MULT16_32_Q15(gain_ratio,echo[i]),
+                                             (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15))); /* the 1+ guards against a zero denominator */
+   } else {
+      spx_word16_t echo_gain, gain_ratio;
+      echo_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),effective_echo_suppress)),1))); /* 0.11513 ~= ln(10)/20: dB -> linear amplitude gain, capped at Q15_ONE */
+      gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),noise_suppress-effective_echo_suppress)),1))); /* 0.2302585 ~= ln(10)/10: dB difference -> linear power ratio */
+
+      /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ], with echo_floor taken out of the sqrt as echo_gain */
+      for (i=0;i<len;i++)
+         gain_floor[i] = MULT16_16_Q15(echo_gain,
+                                       spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(MULT16_32_Q15(gain_ratio,PSHR32(noise[i],NOISE_SHIFT)) + echo[i],
+                                             (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15))); /* the 1+ guards against a zero denominator */
+   }
+}
+
+#else
+/* This function approximates the gain function 
+   y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)  
+   which multiplied by xi/(1+xi) is the optimal gain
+   in the loudness domain ( sqrt[amplitude] )
+*/
+static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
 {
    int ind;
    float integer, frac;
+   float x;
    static const float table[21] = {
       0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
       1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
       2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
-      
-   integer = floor(2*x);
-   ind = (int)integer;
-   if (ind<0)
-      return 1;
-   if (ind>19)
-      return 1+.1296/x;
-   frac = 2*x-integer;
-   return ((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
+      x = EXPIN_SCALING_1*xx;
+      integer = floor(2*x);
+      ind = (int)integer;
+      if (ind<0)
+         return FRAC_SCALING;
+      if (ind>19)
+         return FRAC_SCALING*(1+.1296/x);
+      frac = 2*x-integer;
+      return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
 }
 
-static inline float qcurve(float x)
+static inline spx_word16_t qcurve(spx_word16_t x)
 {
-   return 1.f/(1.f+.1f/(x*x));
+   return 1.f/(1.f+.15f/(SNR_SCALING_1*x));
+}
+
+static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len) /* Floating-point version; writes len entries of gain_floor */
+{
+   int i;
+   float echo_floor;
+   float noise_floor;
+
+   noise_floor = exp(.2302585f*noise_suppress); /* dB -> linear power ratio: exp(x*ln(10)/10) == 10^(x/10) */
+   echo_floor = exp(.2302585f*effective_echo_suppress); /* same conversion for the echo floor */
+
+   /* Gain floor per bin: sqrt of the noise/echo-weighted average of the two power floors (the 1+ guards against a zero denominator) */
+   for (i=0;i<len;i++)
+      gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
 }
 
+#endif
 SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
 {
    int i;
-   int N, N3, N4;
+   int N, N3, N4, M;
 
    SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
    st->frame_size = frame_size;
@@ -153,49 +426,51 @@ SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_r
    
    st->sampling_rate = sampling_rate;
    st->denoise_enabled = 1;
-   st->agc_enabled = 0;
-   st->agc_level = 8000;
    st->vad_enabled = 0;
    st->dereverb_enabled = 0;
-   st->reverb_decay = .5;
-   st->reverb_level = .2;
-
-   st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
-   st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
-
-   st->frame = (float*)speex_alloc(2*N*sizeof(float));
-   st->ps = (float*)speex_alloc(N*sizeof(float));
-   st->gain2 = (float*)speex_alloc(N*sizeof(float));
-   st->window = (float*)speex_alloc(2*N*sizeof(float));
-   st->noise = (float*)speex_alloc(N*sizeof(float));
-   st->reverb_estimate = (float*)speex_alloc(N*sizeof(float));
-   st->old_ps = (float*)speex_alloc(N*sizeof(float));
-   st->gain = (float*)speex_alloc(N*sizeof(float));
-   st->prior = (float*)speex_alloc(N*sizeof(float));
-   st->post = (float*)speex_alloc(N*sizeof(float));
-   st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
-   st->inbuf = (float*)speex_alloc(N3*sizeof(float));
-   st->outbuf = (float*)speex_alloc(N3*sizeof(float));
-   st->echo_noise = (float*)speex_alloc(N*sizeof(float));
-
-   st->S = (float*)speex_alloc(N*sizeof(float));
-   st->Smin = (float*)speex_alloc(N*sizeof(float));
-   st->Stmp = (float*)speex_alloc(N*sizeof(float));
-   st->update_prob = (float*)speex_alloc(N*sizeof(float));
+   st->reverb_decay = 0;
+   st->reverb_level = 0;
+   st->noise_suppress = NOISE_SUPPRESS_DEFAULT;
+   st->echo_suppress = ECHO_SUPPRESS_DEFAULT;
+   st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT;
 
-   st->zeta = (float*)speex_alloc(N*sizeof(float));
-   st->Zpeak = 0;
-   st->Zlast = 0;
+   st->speech_prob_start = SPEECH_PROB_START_DEFAULT;
+   st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT;
 
-   st->noise_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
-   st->noise_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
-   st->speech_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
-   st->speech_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
-   st->noise_bandsN = st->speech_bandsN = 1;
+   st->echo_state = NULL;
+   
+   st->nbands = NB_BANDS;
+   M = st->nbands;
+   st->bank = filterbank_new(M, sampling_rate, N, 1);
+   
+   st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
+   st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
+   st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
+   
+   st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+   st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+   st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+   st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+   st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+   st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
+   st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+   st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+   st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+   st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+   st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+   st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
+   
+   st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
+   st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
+   st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
+   st->update_prob = (int*)speex_alloc(N*sizeof(int));
+   
+   st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
+   st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
 
    conj_window(st->window, 2*N3);
    for (i=2*N3;i<2*st->ps_size;i++)
-      st->window[i]=1;
+      st->window[i]=Q15_ONE;
    
    if (N4>0)
    {
@@ -205,51 +480,62 @@ SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_r
          st->window[i+N3]=1;
       }
    }
-   for (i=0;i<N;i++)
+   for (i=0;i<N+M;i++)
    {
-      st->noise[i]=1e4;
-      st->reverb_estimate[i]=0.;
-      st->old_ps[i]=1e4;
-      st->gain[i]=1;
-      st->post[i]=1;
-      st->prior[i]=1;
+      st->noise[i]=QCONST32(1.f,NOISE_SHIFT);
+      st->reverb_estimate[i]=0;
+      st->old_ps[i]=1;
+      st->gain[i]=Q15_ONE;
+      st->post[i]=SHL16(1, SNR_SHIFT);
+      st->prior[i]=SHL16(1, SNR_SHIFT);
    }
 
+   for (i=0;i<N;i++)
+      st->update_prob[i] = 1;
    for (i=0;i<N3;i++)
    {
       st->inbuf[i]=0;
       st->outbuf[i]=0;
    }
-
+#ifndef FIXED_POINT
+   st->agc_enabled = 0;
+   st->agc_level = 8000;
+   st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
    for (i=0;i<N;i++)
    {
       float ff=((float)i)*.5*sampling_rate/((float)N);
+      /*st->loudness_weight[i] = .5f*(1.f/(1.f+ff/8000.f))+1.f*exp(-.5f*(ff-3800.f)*(ff-3800.f)/9e5f);*/
       st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f);
       if (st->loudness_weight[i]<.01f)
          st->loudness_weight[i]=.01f;
       st->loudness_weight[i] *= st->loudness_weight[i];
    }
-
-   st->speech_prob = 0;
-   st->last_speech = 1000;
-   st->loudness = pow(6000,LOUDNESS_EXP);
-   st->loudness2 = 6000;
+   /*st->loudness = pow(AMP_SCALE*st->agc_level,LOUDNESS_EXP);*/
+   st->loudness = 1e-15;
+   st->agc_gain = 1;
    st->nb_loudness_adapt = 0;
+   st->max_gain = 30;
+   st->max_increase_step = exp(0.11513f * 12.*st->frame_size / st->sampling_rate);
+   st->max_decrease_step = exp(-0.11513f * 40.*st->frame_size / st->sampling_rate);
+   st->prev_loudness = 1;
+   st->init_max = 1;
+#endif
+   st->was_speech = 0;
 
-   st->fft_lookup = (struct drft_lookup*)speex_alloc(sizeof(struct drft_lookup));
-   spx_drft_init(st->fft_lookup,2*N);
+   st->fft_lookup = spx_fft_init(2*N);
 
    st->nb_adapt=0;
-   st->consec_noise=0;
-   st->nb_preprocess=0;
+   st->min_count=0;
    return st;
 }
 
 void speex_preprocess_state_destroy(SpeexPreprocessState *st)
 {
    speex_free(st->frame);
+   speex_free(st->ft);
    speex_free(st->ps);
    speex_free(st->gain2);
+   speex_free(st->gain_floor);
    speex_free(st->window);
    speex_free(st->noise);
    speex_free(st->reverb_estimate);
@@ -257,8 +543,11 @@ void speex_preprocess_state_destroy(SpeexPreprocessState *st)
    speex_free(st->gain);
    speex_free(st->prior);
    speex_free(st->post);
+#ifndef FIXED_POINT
    speex_free(st->loudness_weight);
+#endif
    speex_free(st->echo_noise);
+   speex_free(st->residual_echo);
 
    speex_free(st->S);
    speex_free(st->Smin);
@@ -266,298 +555,65 @@ void speex_preprocess_state_destroy(SpeexPreprocessState *st)
    speex_free(st->update_prob);
    speex_free(st->zeta);
 
-   speex_free(st->noise_bands);
-   speex_free(st->noise_bands2);
-   speex_free(st->speech_bands);
-   speex_free(st->speech_bands2);
-
    speex_free(st->inbuf);
    speex_free(st->outbuf);
 
-   spx_drft_clear(st->fft_lookup);
-   speex_free(st->fft_lookup);
-
+   spx_fft_destroy(st->fft_lookup);
+   filterbank_destroy(st->bank);
    speex_free(st);
 }
 
-static void update_noise(SpeexPreprocessState *st, float *ps, spx_int32_t *echo)
+/* FIXME: The AGC doesn't work yet with fixed-point*/
+#ifndef FIXED_POINT
+static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx_word16_t *ft)
 {
    int i;
-   float beta;
-   st->nb_adapt++;
-   beta=1.0f/st->nb_adapt;
-   if (beta < .05f)
-      beta=.05f;
-   
-   if (!echo)
-   {
-      for (i=0;i<st->ps_size;i++)
-         st->noise[i] = (1.f-beta)*st->noise[i] + beta*ps[i];
-   } else {
-      for (i=0;i<st->ps_size;i++)
-         st->noise[i] = (1.f-beta)*st->noise[i] + beta*max(1.f,ps[i]-st->frame_size*st->frame_size*1.0*echo[i]); 
-#if 0
-      for (i=0;i<st->ps_size;i++)
-         st->noise[i] = 0;
-#endif
-   }
-}
-
-static int speex_compute_vad(SpeexPreprocessState *st, float *ps, float mean_prior, float mean_post)
-{
-   int i, is_speech=0;
    int N = st->ps_size;
-   float scale=.5f/N;
-
-   /* FIXME: Clean this up a bit */
-   {
-      float bands[NB_BANDS];
-      int j;
-      float p0, p1;
-      float tot_loudness=0;
-      float x = sqrt(mean_post);
-
-      for (i=5;i<N-10;i++)
-      {
-         tot_loudness += scale*st->ps[i] * st->loudness_weight[i];
-      }
-
-      for (i=0;i<NB_BANDS;i++)
-      {
-         bands[i]=1e4f;
-         for (j=i*N/NB_BANDS;j<(i+1)*N/NB_BANDS;j++)
-         {
-            bands[i] += ps[j];
-         }
-         bands[i]=log(bands[i]);
-      }
-      
-      /*p1 = .0005+.6*exp(-.5*(x-.4)*(x-.4)*11)+.1*exp(-1.2*x);
-      if (x<1.5)
-         p0=.1*exp(2*(x-1.5));
-      else
-         p0=.02+.1*exp(-.2*(x-1.5));
-      */
-
-      p0=1.f/(1.f+exp(3.f*(1.5f-x)));
-      p1=1.f-p0;
-
-      /*fprintf (stderr, "%f %f ", p0, p1);*/
-      /*p0 *= .99*st->speech_prob + .01*(1-st->speech_prob);
-      p1 *= .01*st->speech_prob + .99*(1-st->speech_prob);
-      
-      st->speech_prob = p0/(p1+p0);
-      */
-
-      if (st->noise_bandsN < 50 || st->speech_bandsN < 50)
-      {
-         if (mean_post > 5.f)
-         {
-            float adapt = 1./st->speech_bandsN++;
-            if (adapt<.005f)
-               adapt = .005f;
-            for (i=0;i<NB_BANDS;i++)
-            {
-               st->speech_bands[i] = (1.f-adapt)*st->speech_bands[i] + adapt*bands[i];
-               /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
-               st->speech_bands2[i] = (1.f-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
-            }
-         } else {
-            float adapt = 1./st->noise_bandsN++;
-            if (adapt<.005f)
-               adapt = .005f;
-            for (i=0;i<NB_BANDS;i++)
-            {
-               st->noise_bands[i] = (1.f-adapt)*st->noise_bands[i] + adapt*bands[i];
-               /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
-               st->noise_bands2[i] = (1.f-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
-            }
-         }
-      }
-      p0=p1=1;
-      for (i=0;i<NB_BANDS;i++)
-      {
-         float noise_var, speech_var;
-         float noise_mean, speech_mean;
-         float tmp1, tmp2, pr;
-
-         /*noise_var = 1.01*st->noise_bands2[i] - st->noise_bands[i]*st->noise_bands[i];
-           speech_var = 1.01*st->speech_bands2[i] - st->speech_bands[i]*st->speech_bands[i];*/
-         noise_var = st->noise_bands2[i];
-         speech_var = st->speech_bands2[i];
-         if (noise_var < .1f)
-            noise_var = .1f;
-         if (speech_var < .1f)
-            speech_var = .1f;
-         
-         /*speech_var = sqrt(speech_var*noise_var);
-           noise_var = speech_var;*/
-         if (noise_var < .05f*speech_var)
-            noise_var = .05f*speech_var; 
-         if (speech_var < .05f*noise_var)
-            speech_var = .05f*noise_var;
-         
-         if (bands[i] < st->noise_bands[i])
-            speech_var = noise_var;
-         if (bands[i] > st->speech_bands[i])
-            noise_var = speech_var;
-
-         speech_mean = st->speech_bands[i];
-         noise_mean = st->noise_bands[i];
-         if (noise_mean < speech_mean - 5.f)
-            noise_mean = speech_mean - 5.f;
-
-         tmp1 = exp(-.5f*(bands[i]-speech_mean)*(bands[i]-speech_mean)/speech_var)/sqrt(2.f*M_PI*speech_var);
-         tmp2 = exp(-.5f*(bands[i]-noise_mean)*(bands[i]-noise_mean)/noise_var)/sqrt(2.f*M_PI*noise_var);
-         /*fprintf (stderr, "%f ", (float)(p0/(.01+p0+p1)));*/
-         /*fprintf (stderr, "%f ", (float)(bands[i]));*/
-         pr = tmp1/(1e-25+tmp1+tmp2);
-         /*if (bands[i] < st->noise_bands[i])
-            pr=.01;
-         if (bands[i] > st->speech_bands[i] && pr < .995)
-         pr=.995;*/
-         if (pr>.999f)
-            pr=.999f;
-         if (pr<.001f)
-            pr=.001f;
-         /*fprintf (stderr, "%f ", pr);*/
-         p0 *= pr;
-         p1 *= (1-pr);
-      }
-
-      p0 = pow(p0,.2);
-      p1 = pow(p1,.2);      
-      
-#if 1
-      p0 *= 2.f;
-      p0=p0/(p1+p0);
-      if (st->last_speech>20) 
-      {
-         float tmp = sqrt(tot_loudness)/st->loudness2;
-         tmp = 1.f-exp(-10.f*tmp);
-         if (p0>tmp)
-            p0=tmp;
-      }
-      p1=1-p0;
-#else
-      if (sqrt(tot_loudness) < .6f*st->loudness2 && p0>15.f*p1)
-         p0=15.f*p1;
-      if (sqrt(tot_loudness) < .45f*st->loudness2 && p0>7.f*p1)
-         p0=7.f*p1;
-      if (sqrt(tot_loudness) < .3f*st->loudness2 && p0>3.f*p1)
-         p0=3.f*p1;
-      if (sqrt(tot_loudness) < .15f*st->loudness2 && p0>p1)
-         p0=p1;
-      /*fprintf (stderr, "%f %f ", (float)(sqrt(tot_loudness) /( .25*st->loudness2)), p0/(p1+p0));*/
-#endif
-
-      p0 *= .99f*st->speech_prob + .01f*(1-st->speech_prob);
-      p1 *= .01f*st->speech_prob + .99f*(1-st->speech_prob);
-      
-      st->speech_prob = p0/(1e-25f+p1+p0);
-      /*fprintf (stderr, "%f %f %f ", tot_loudness, st->loudness2, st->speech_prob);*/
-
-      if (st->speech_prob > st->speech_prob_start
-         || (st->last_speech < 20 && st->speech_prob > st->speech_prob_continue))
-      {
-         is_speech = 1;
-         st->last_speech = 0;
-      } else {
-         st->last_speech++;
-         if (st->last_speech<20)
-           is_speech = 1;
-      }
-
-      if (st->noise_bandsN > 50 && st->speech_bandsN > 50)
-      {
-         if (mean_post > 5)
-         {
-            float adapt = 1./st->speech_bandsN++;
-            if (adapt<.005f)
-               adapt = .005f;
-            for (i=0;i<NB_BANDS;i++)
-            {
-               st->speech_bands[i] = (1-adapt)*st->speech_bands[i] + adapt*bands[i];
-               /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
-               st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
-            }
-         } else {
-            float adapt = 1./st->noise_bandsN++;
-            if (adapt<.005f)
-               adapt = .005f;
-            for (i=0;i<NB_BANDS;i++)
-            {
-               st->noise_bands[i] = (1-adapt)*st->noise_bands[i] + adapt*bands[i];
-               /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
-               st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
-            }
-         }
-      }
-
-
-   }
-
-   return is_speech;
-}
-
-static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior)
-{
-   int i;
-   int N = st->ps_size;
-   float scale=.5f/N;
-   float agc_gain;
-   int freq_start, freq_end;
-   float active_bands = 0;
-
-   freq_start = (int)(300.0f*2*N/st->sampling_rate);
-   freq_end   = (int)(2000.0f*2*N/st->sampling_rate);
-   for (i=freq_start;i<freq_end;i++)
+   float target_gain;
+   float loudness=1.f;
+   float rate;
+   
+   for (i=2;i<N;i++)
    {
-      if (st->S[i] > 20.f*st->Smin[i]+1000.f)
-         active_bands+=1;
+      loudness += 2.f*N*st->ps[i]* st->loudness_weight[i];
    }
-   active_bands /= (freq_end-freq_start+1);
-
-   if (active_bands > .2f)
+   loudness=sqrt(loudness);
+      /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
+   loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
+   if (Pframe>.3f)
    {
-      float loudness=0.f;
-      float rate, rate2=.2f;
       st->nb_loudness_adapt++;
-      rate=2.0f/(1+st->nb_loudness_adapt);
-      if (rate < .05f)
-         rate = .05f;
-      if (rate < .1f && pow(loudness, LOUDNESS_EXP) > st->loudness)
-         rate = .1f;
-      if (rate < .2f && pow(loudness, LOUDNESS_EXP) > 3.f*st->loudness)
-         rate = .2f;
-      if (rate < .4f && pow(loudness, LOUDNESS_EXP) > 10.f*st->loudness)
-         rate = .4f;
-
-      for (i=2;i<N;i++)
-      {
-         loudness += scale*st->ps[i] * st->gain2[i] * st->gain2[i] * st->loudness_weight[i];
-      }
-      loudness=sqrt(loudness);
-      /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
-        loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
-      st->loudness = (1-rate)*st->loudness + (rate)*pow(loudness, LOUDNESS_EXP);
-      
-      st->loudness2 = (1-rate2)*st->loudness2 + rate2*pow(st->loudness, 1.0f/LOUDNESS_EXP);
-
-      loudness = pow(st->loudness, 1.0f/LOUDNESS_EXP);
-
-      /*fprintf (stderr, "%f %f %f\n", loudness, st->loudness2, rate);*/
+      /*rate=2.0f*Pframe*Pframe/(1+st->nb_loudness_adapt);*/
+      rate = .03*Pframe*Pframe;
+      st->loudness = (1-rate)*st->loudness + (rate)*pow(AMP_SCALE*loudness, LOUDNESS_EXP);
+      st->loudness_accum = (1-rate)*st->loudness_accum + rate;
+      if (st->init_max < st->max_gain && st->nb_adapt > 20)
+         st->init_max *= 1.f + .1f*Pframe*Pframe;
    }
+   /*printf ("%f %f %f %f\n", Pframe, loudness, pow(st->loudness, 1.0f/LOUDNESS_EXP), st->loudness2);*/
    
-   agc_gain = st->agc_level/st->loudness2;
-   /*fprintf (stderr, "%f %f %f %f\n", active_bands, st->loudness, st->loudness2, agc_gain);*/
-   if (agc_gain>200)
-      agc_gain = 200;
+   target_gain = AMP_SCALE*st->agc_level*pow(st->loudness/(1e-4+st->loudness_accum), -1.0f/LOUDNESS_EXP);
 
-   for (i=0;i<N;i++)
-      st->gain2[i] *= agc_gain;
+   if ((Pframe>.5  && st->nb_adapt > 20) || target_gain < st->agc_gain)
+   {
+      if (target_gain > st->max_increase_step*st->agc_gain)
+         target_gain = st->max_increase_step*st->agc_gain;
+      if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10*st->prev_loudness)
+         target_gain = st->max_decrease_step*st->agc_gain;
+      if (target_gain > st->max_gain)
+         target_gain = st->max_gain;
+      if (target_gain > st->init_max)
+         target_gain = st->init_max;
    
+      st->agc_gain = target_gain;
+   }
+   /*fprintf (stderr, "%f %f %f\n", loudness, (float)AMP_SCALE_1*pow(st->loudness, 1.0f/LOUDNESS_EXP), st->agc_gain);*/
+      
+   for (i=0;i<2*N;i++)
+      ft[i] *= st->agc_gain;
+   st->prev_loudness = loudness;
 }
+#endif
 
 static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
 {
@@ -565,7 +621,7 @@ static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
    int N = st->ps_size;
    int N3 = 2*N - st->frame_size;
    int N4 = st->frame_size - N3;
-   float *ps=st->ps;
+   spx_word32_t *ps=st->ps;
 
    /* 'Build' input frame */
    for (i=0;i<N3;i++)
@@ -579,295 +635,333 @@ static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
 
    /* Windowing */
    for (i=0;i<2*N;i++)
-      st->frame[i] *= st->window[i];
+      st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
 
+#ifdef FIXED_POINT
+   {
+      spx_word16_t max_val=0;
+      for (i=0;i<2*N;i++)
+         max_val = MAX16(max_val, ABS16(st->frame[i]));
+      st->frame_shift = 14-spx_ilog2(EXTEND32(max_val));
+      for (i=0;i<2*N;i++)
+         st->frame[i] = SHL16(st->frame[i], st->frame_shift);
+   }
+#endif
+   
    /* Perform FFT */
-   spx_drft_forward(st->fft_lookup, st->frame);
-
+   spx_fft(st->fft_lookup, st->frame, st->ft);
+         
    /* Power spectrum */
-   ps[0]=1;
+   ps[0]=MULT16_16(st->ft[0],st->ft[0]);
    for (i=1;i<N;i++)
-      ps[i]=1+st->frame[2*i-1]*st->frame[2*i-1] + st->frame[2*i]*st->frame[2*i];
+      ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]);
+   for (i=0;i<N;i++)
+      st->ps[i] = PSHR32(st->ps[i], 2*st->frame_shift);
 
+   filterbank_compute_bank32(st->bank, ps, ps+N);
 }
 
 static void update_noise_prob(SpeexPreprocessState *st)
 {
    int i;
+   int min_range; /* frames to wait between restarts of the minimum tracker; picked from nb_adapt below */
    int N = st->ps_size;
 
    for (i=1;i<N-1;i++)
-      st->S[i] = 100.f+ .8f*st->S[i] + .05f*st->ps[i-1]+.1f*st->ps[i]+.05f*st->ps[i+1];
+      st->S[i] =  MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1]) 
+                      + MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]); /* S = ps smoothed in time (.8) and over adjacent bins (.05/.1/.05) */
+   st->S[0] =  MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]); /* first bin has no lower neighbour: the whole .2 weight goes to ps[0] */
+   st->S[N-1] =  MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]); /* last bin: same treatment */
    
-   if (st->nb_preprocess<1)
+   if (st->nb_adapt==1)
    {
-      for (i=1;i<N-1;i++)
-         st->Smin[i] = st->Stmp[i] = st->S[i]+100.f;
+      for (i=0;i<N;i++)
+         st->Smin[i] = st->Stmp[i] = 0; /* start both minimum trackers from zero */
    }
 
-   if (st->nb_preprocess%200==0)
+   if (st->nb_adapt < 100) /* the more frames have been counted, the longer the tracking window */
+      min_range = 15;
+   else if (st->nb_adapt < 1000)
+      min_range = 50;
+   else if (st->nb_adapt < 10000)
+      min_range = 150;
+   else
+      min_range = 300;
+   if (st->min_count > min_range) /* window elapsed: restart it */
    {
-      for (i=1;i<N-1;i++)
+      st->min_count = 0;
+      for (i=0;i<N;i++)
       {
-         st->Smin[i] = min(st->Stmp[i], st->S[i]);
+         st->Smin[i] = MIN32(st->Stmp[i], st->S[i]); /* Smin restarts from the minimum Stmp gathered, Stmp restarts from the current S */
          st->Stmp[i] = st->S[i];
       }
    } else {
-      for (i=1;i<N-1;i++)
+      for (i=0;i<N;i++)
       {
-         st->Smin[i] = min(st->Smin[i], st->S[i]);
-         st->Stmp[i] = min(st->Stmp[i], st->S[i]);      
+         st->Smin[i] = MIN32(st->Smin[i], st->S[i]); /* inside a window both trackers follow the running minimum of S */
+         st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]);      
       }
    }
-   for (i=1;i<N-1;i++)
+   for (i=0;i<N;i++)
    {
-      st->update_prob[i] *= .2f;
-      if (st->S[i] > 2.5*st->Smin[i])
-         st->update_prob[i] += .8f;
+      if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > ADD32(st->Smin[i],EXTEND32(20))) /* i.e. S above 2.5*(Smin+20) */
+         st->update_prob[i] = 1; /* callers skip the noise update for flagged bins unless ps is below the noise estimate */
+      else
+         st->update_prob[i] = 0;
       /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
       /*fprintf (stderr, "%f ", st->update_prob[i]);*/
    }
 
 }
 
-#define NOISE_OVERCOMPENS 1.4
+#define NOISE_OVERCOMPENS 1.
+
+void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
 
 int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
 {
+   return speex_preprocess_run(st, x); /* old three-argument entry point: echo is ignored and the call is forwarded */
+}
+
+int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
+{
    int i;
-   int is_speech=1;
-   float mean_post=0;
-   float mean_prior=0;
+   int M;
    int N = st->ps_size;
    int N3 = 2*N - st->frame_size;
    int N4 = st->frame_size - N3;
-   float scale=.5f/N;
-   float *ps=st->ps;
-   float Zframe=0, Pframe;
-
+   spx_word32_t *ps=st->ps;
+   spx_word32_t Zframe;
+   spx_word16_t Pframe;
+   spx_word16_t beta, beta_1;
+   spx_word16_t effective_echo_suppress;
+   
+   if (st->nb_adapt < 20000) st->nb_adapt++; /* saturate the frame count: it is the divisor of DIV32_16() just below and must stay within 16-bit range */
+   st->min_count++;
+   
+   beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
+   beta_1 = Q15_ONE-beta;
+   M = st->nbands;
+   /* Deal with residual echo if provided */
+   if (st->echo_state)
+   {
+      speex_echo_get_residual(st->echo_state, st->residual_echo, N);
+#ifndef FIXED_POINT
+      /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */
+      if (!(st->residual_echo[0] >=0 && st->residual_echo[0]<N*1e9f))
+      {
+         for (i=0;i<N;i++)
+            st->residual_echo[i] = 0;
+      }
+#endif
+      for (i=0;i<N;i++)
+         st->echo_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]);
+      filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N);
+   } else {
+      for (i=0;i<N+M;i++)
+         st->echo_noise[i] = 0;
+   }
    preprocess_analysis(st, x);
 
    update_noise_prob(st);
 
-   st->nb_preprocess++;
-
-   /* Noise estimation always updated for the 20 first times */
-   if (st->nb_adapt<10)
+   /* Noise estimation always updated for the 10 first frames */
+   /*if (st->nb_adapt<10)
    {
-      update_noise(st, ps, echo);
+      for (i=1;i<N-1;i++)
+         st->update_prob[i] = 0;
    }
-
-   /* Deal with residual echo if provided */
-   if (echo)
-      for (i=1;i<N;i++)
-         st->echo_noise[i] = (.3f*st->echo_noise[i] + st->frame_size*st->frame_size*1.0*echo[i]);
-
-   /* Compute a posteriori SNR */
-   for (i=1;i<N;i++)
+   */
+   
+   /* Update the noise estimate for the frequencies where it can be */
+   for (i=0;i<N;i++)
    {
-      float tot_noise = 1.f+ NOISE_OVERCOMPENS*st->noise[i] + st->echo_noise[i] + st->reverb_estimate[i];
-      st->post[i] = ps[i]/tot_noise - 1.f;
-      if (st->post[i]>100.f)
-         st->post[i]=100.f;
-      /*if (st->post[i]<0)
-        st->post[i]=0;*/
-      mean_post+=st->post[i];
+      if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT))
+         st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT)));
    }
-   mean_post /= N;
-   if (mean_post<0.f)
-      mean_post=0.f;
+   filterbank_compute_bank32(st->bank, st->noise, st->noise+N);
 
    /* Special case for first frame */
    if (st->nb_adapt==1)
-      for (i=1;i<N;i++)
+      for (i=0;i<N+M;i++)
          st->old_ps[i] = ps[i];
 
-   /* Compute a priori SNR */
-   {
-      /* A priori update rate */
-      for (i=1;i<N;i++)
-      {
-         float gamma = .15+.85*st->prior[i]*st->prior[i]/((1+st->prior[i])*(1+st->prior[i]));
-         float tot_noise = 1.f+ NOISE_OVERCOMPENS*st->noise[i] + st->echo_noise[i] + st->reverb_estimate[i];
-         /* A priori SNR update */
-         st->prior[i] = gamma*max(0.0f,st->post[i]) +
-               (1.f-gamma)* (.8*st->gain[i]*st->gain[i]*st->old_ps[i]/tot_noise + .2*st->prior[i]);
-         
-         if (st->prior[i]>100.f)
-            st->prior[i]=100.f;
-         
-         mean_prior+=st->prior[i];
-      }
-   }
-   mean_prior /= N;
-
-#if 0
-   for (i=0;i<N;i++)
-   {
-      fprintf (stderr, "%f ", st->prior[i]);
-   }
-   fprintf (stderr, "\n");
-#endif
-   /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/
-
-   if (st->nb_preprocess>=20)
+   /* Compute a posteriori SNR */
+   for (i=0;i<N+M;i++)
    {
-      int do_update = 0;
-      float noise_ener=0, sig_ener=0;
-      /* If SNR is low (both a priori and a posteriori), update the noise estimate*/
-      /*if (mean_prior<.23 && mean_post < .5)*/
-      if (mean_prior<.23f && mean_post < .5f)
-         do_update = 1;
-      for (i=1;i<N;i++)
-      {
-         noise_ener += st->noise[i];
-         sig_ener += ps[i];
-      }
-      if (noise_ener > 3.f*sig_ener)
-         do_update = 1;
-      /*do_update = 0;*/
-      if (do_update)
-      {
-         st->consec_noise++;
-      } else {
-         st->consec_noise=0;
-      }
+      spx_word16_t gamma;
+      
+      /* Total noise estimate including residual echo and reverberation */
+      spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]);
+      
+      /* A posteriori SNR = ps/noise - 1*/
+      st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT));
+      st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT));
+      
+      /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */
+      gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise))));
+      
+      /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */
+      st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15));
+      st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT));
    }
 
-   if (st->vad_enabled)
-      is_speech = speex_compute_vad(st, ps, mean_prior, mean_post);
-
+   /*print_vec(st->post, N+M, "");*/
 
-   if (st->consec_noise>=3)
+   /* Recursive average of the a priori SNR. A bit smoothed for the psd components */
+   st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15);
+   for (i=1;i<N-1;i++)
+      st->zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])),
+                           MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15);
+   for (i=N-1;i<N+M;i++)
+      st->zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15);
+
+   /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */
+   Zframe = 0;
+   for (i=N;i<N+M;i++)
+      Zframe = ADD32(Zframe, EXTEND32(st->zeta[i]));
+   Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands)));
+   
+   effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15));
+   
+   compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M);
+         
+   /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale) 
+      Technically this is actually wrong because the EM gain assumes a slightly different probability 
+      distribution */
+   for (i=N;i<N+M;i++)
    {
-      update_noise(st, st->old_ps, echo);
-   } else {
-      for (i=1;i<N-1;i++)
-      {
-         if (st->update_prob[i]<.5f/* || st->ps[i] < st->noise[i]*/)
-         {
-            if (echo)
-               st->noise[i] = .95f*st->noise[i] + .05f*max(1.0f,st->ps[i]-st->frame_size*st->frame_size*1.0*echo[i]);
-            else
-               st->noise[i] = .95f*st->noise[i] + .05f*st->ps[i];
-         }
-      }
-   }
+      /* See EM and Cohen papers*/
+      spx_word32_t theta;
+      /* Gain from hypergeometric function */
+      spx_word32_t MM;
+      /* Wiener filter gain */
+      spx_word16_t prior_ratio;
+      /* A priori probability of speech presence based on Bark sub-band alone */
+      spx_word16_t P1;
+      /* Speech absence a priori probability (considering sub-band and frame) */
+      spx_word16_t q;
+#ifdef FIXED_POINT
+      spx_word16_t tmp;
+#endif
+      
+      prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
+      theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
 
-   for (i=1;i<N;i++)
-   {
-      st->zeta[i] = .7f*st->zeta[i] + .3f*st->prior[i];
+      MM = hypergeom_gain(theta);
+      /* Gain with bound */
+      st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
+      /* Save old Bark power spectrum */
+      st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
+
+      P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i]));
+      q = Q15_ONE-MULT16_16_Q15(Pframe,P1);
+#ifdef FIXED_POINT
+      theta = MIN32(theta, EXTEND32(32767));
+/*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1))));
+      tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/
+/*Q8*/tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8));
+      st->gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
+#else
+      st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta));
+#endif
    }
-
+   /* Convert the EM gains and speech prob to linear frequency */
+   filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
+   filterbank_compute_psd16(st->bank,st->gain+N, st->gain);
+   
+   /* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */
+   if (1)
    {
-      int freq_start = (int)(300.0f*2.f*N/st->sampling_rate);
-      int freq_end   = (int)(2000.0f*2.f*N/st->sampling_rate);
-      for (i=freq_start;i<freq_end;i++)
+      filterbank_compute_psd16(st->bank,st->gain_floor+N, st->gain_floor);
+   
+      /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */
+      for (i=0;i<N;i++)
       {
-         Zframe += st->zeta[i];         
-      }
-      Zframe /= (freq_end-freq_start);
-   }
-   st->Zlast = Zframe;
-
-   Pframe = qcurve(Zframe);
+         spx_word32_t MM;
+         spx_word32_t theta;
+         spx_word16_t prior_ratio;
+         spx_word16_t tmp;
+         spx_word16_t p;
+         spx_word16_t g;
+         
+         /* Wiener filter gain */
+         prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
+         theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
+
+         /* Optimal estimator for loudness domain */
+         MM = hypergeom_gain(theta);
+         /* EM gain with bound */
+         g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
+         /* Interpolated speech probability of presence */
+         p = st->gain2[i];
+                  
+         /* Constrain the gain to be close to the Bark scale gain */
+         if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i])
+            g = MULT16_16(3,st->gain[i]);
+         st->gain[i] = g;
+         
+         /* Save old power spectrum */
+         st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
+         
+         /* Apply gain floor */
+         if (st->gain[i] < st->gain_floor[i])
+            st->gain[i] = st->gain_floor[i];
 
-   /*fprintf (stderr, "%f\n", Pframe);*/
-   /* Compute gain according to the Ephraim-Malah algorithm */
-   for (i=1;i<N;i++)
-   {
-      float MM;
-      float theta;
-      float prior_ratio;
-      float p, q;
-      float zeta1;
-      float P1;
-
-      prior_ratio = st->prior[i]/(1.0001f+st->prior[i]);
-      theta = (1.f+st->post[i])*prior_ratio;
-
-      if (i==1 || i==N-1)
-         zeta1 = st->zeta[i];
-      else
-         zeta1 = .25f*st->zeta[i-1] + .5f*st->zeta[i] + .25f*st->zeta[i+1];
-      P1 = qcurve (zeta1);
-      
-      /* FIXME: add global prob (P2) */
-      q = 1-Pframe*P1;
-      q = 1-P1;
-      if (q>.95f)
-         q=.95f;
-      p=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*exp(-theta));
-      /*p=1;*/
-
-      /* Optimal estimator for loudness domain */
-      MM = hypergeom_gain(theta);
+         /* Exponential decay model for reverberation (unused) */
+         /*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/
+         
+         /* Take into account speech probability of presence (loudness domain MMSE estimator) */
+         /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain_floor)]^2 */
+         tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
+         st->gain2[i]=SQR16_Q15(tmp);
 
-      st->gain[i] = prior_ratio * MM;
-      /*Put some (very arbitraty) limit on the gain*/
-      if (st->gain[i]>2.f)
-      {
-         st->gain[i]=2.f;
+         /* Use this if you want a log-domain MMSE estimator instead */
+         /*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/
       }
-      
-      st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];
-      if (st->denoise_enabled)
+   } else {
+      for (i=N;i<N+M;i++)
       {
-         /*st->gain2[i] = p*p*st->gain[i];*/
-         st->gain2[i]=(p*sqrt(st->gain[i])+.2*(1-p)) * (p*sqrt(st->gain[i])+.2*(1-p));
-         /*st->gain2[i] = pow(st->gain[i], p) * pow(.1f,1.f-p);*/
-      } else {
-         st->gain2[i]=1.f;
+         spx_word16_t tmp;
+         spx_word16_t p = st->gain2[i];
+         st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]);         
+         tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
+         st->gain2[i]=SQR16_Q15(tmp);
       }
+      filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
    }
    
-   st->gain2[0]=st->gain[0]=0.f;
-   st->gain2[N-1]=st->gain[N-1]=0.f;
-   /*
-   for (i=30;i<N-2;i++)
+   /* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */
+   if (!st->denoise_enabled)
    {
-      st->gain[i] = st->gain2[i]*st->gain2[i] + (1-st->gain2[i])*.333*(.6*st->gain2[i-1]+st->gain2[i]+.6*st->gain2[i+1]+.4*st->gain2[i-2]+.4*st->gain2[i+2]);
-   }
-   for (i=30;i<N-2;i++)
-      st->gain2[i] = st->gain[i];
-   */
-   if (st->agc_enabled)
-      speex_compute_agc(st, mean_prior);
-
-#if 0
-   if (!is_speech)
-   {
-      for (i=0;i<N;i++)
-         st->gain2[i] = 0;
+      for (i=0;i<N+M;i++)
+         st->gain2[i]=Q15_ONE;
    }
-#if 0
- else {
-      for (i=0;i<N;i++)
-         st->gain2[i] = 1;
-   }
-#endif
-#endif
-
+      
    /* Apply computed gain */
    for (i=1;i<N;i++)
    {
-      st->frame[2*i-1] *= st->gain2[i];
-      st->frame[2*i] *= st->gain2[i];
+      st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]);
+      st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]);
    }
-
-   /* Get rid of the DC and very low frequencies */
-   st->frame[0]=0;
-   st->frame[1]=0;
-   st->frame[2]=0;
-   /* Nyquist frequency is mostly useless too */
-   st->frame[2*N-1]=0;
+   st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]);
+   st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]);
+   
+   /*FIXME: This *will* not work for fixed-point */
+#ifndef FIXED_POINT
+   if (st->agc_enabled)
+      speex_compute_agc(st, Pframe, st->ft);
+#endif
 
    /* Inverse FFT with 1/N scaling */
-   spx_drft_backward(st->fft_lookup, st->frame);
-
+   spx_ifft(st->fft_lookup, st->ft, st->frame);
+   /* Scale back to original (lower) amplitude */
    for (i=0;i<2*N;i++)
-      st->frame[i] *= scale;
+      st->frame[i] = PSHR16(st->frame[i], st->frame_shift);
 
+   /*FIXME: This *will* not work for fixed-point */
+#ifndef FIXED_POINT
+   if (st->agc_enabled)
    {
       float max_sample=0;
       for (i=0;i<2*N;i++)
@@ -880,9 +974,11 @@ int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo
             st->frame[i] *= damp;
       }
    }
-
+#endif
+   
+   /* Synthesis window (for WOLA) */
    for (i=0;i<2*N;i++)
-      st->frame[i] *= st->window[i];
+      st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
 
    /* Perform overlap and add */
    for (i=0;i<N3;i++)
@@ -894,47 +990,55 @@ int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo
    for (i=0;i<N3;i++)
       st->outbuf[i] = st->frame[st->frame_size+i];
 
-   /* Save old power spectrum */
-   for (i=1;i<N;i++)
-      st->old_ps[i] = ps[i];
-
-   return is_speech;
+   /* FIXME: This VAD is a kludge */
+   if (st->vad_enabled)
+   {
+      if (Pframe > st->speech_prob_start || (st->was_speech && Pframe > st->speech_prob_continue))
+      {
+         st->was_speech=1;
+         return 1;
+      } else
+      {
+         st->was_speech=0;
+         return 0;
+      }
+   } else {
+      return 1;
+   }
 }
 
-void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
+void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x)
 {
    int i;
    int N = st->ps_size;
    int N3 = 2*N - st->frame_size;
+   int M; /* number of filterbank bands; ps/old_ps hold N linear bins followed by M bands */
+   spx_word32_t *ps=st->ps;
 
-   float *ps=st->ps;
-
+   M = st->nbands;
+   st->min_count++; /* frame counter consumed (and reset) by update_noise_prob() */
+   
    preprocess_analysis(st, x);
 
    update_noise_prob(st);
-
-   st->nb_preprocess++;
    
    for (i=1;i<N-1;i++)
    {
-      if (st->update_prob[i]<.5f || st->ps[i] < st->noise[i])
+      if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT)) /* noise[] is kept NOISE_SHIFT bits above ps[]; NOTE(review): bins 0 and N-1 are not updated here - confirm intended */
       {
-         if (echo)
-            st->noise[i] = .95f*st->noise[i] + .1f*max(1.0f,st->ps[i]-st->frame_size*st->frame_size*1.0*echo[i]);
-         else
-            st->noise[i] = .95f*st->noise[i] + .1f*st->ps[i];
+         st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT)); /* fixed .95/.05 recursive average */
       }
    }
 
    for (i=0;i<N3;i++)
-      st->outbuf[i] = x[st->frame_size-N3+i]*st->window[st->frame_size+i];
+      st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]); /* last N3 input samples, weighted by the window values from index frame_size on */
 
    /* Save old power spectrum */
-   for (i=1;i<N;i++)
+   for (i=0;i<N+M;i++) /* linear bins and filterbank bands alike */
       st->old_ps[i] = ps[i];
 
-   for (i=1;i<N;i++)
-      st->reverb_estimate[i] *= st->reverb_decay;
+   for (i=0;i<N;i++)
+      st->reverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]); /* scale the reverberation estimate by reverb_decay */
 }
 
 
@@ -946,17 +1050,17 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
    switch(request)
    {
    case SPEEX_PREPROCESS_SET_DENOISE:
-      st->denoise_enabled = (*(int*)ptr);
+      st->denoise_enabled = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_PREPROCESS_GET_DENOISE:
-      (*(int*)ptr) = st->denoise_enabled;
+      (*(spx_int32_t*)ptr) = st->denoise_enabled;
       break;
-
+#ifndef FIXED_POINT
    case SPEEX_PREPROCESS_SET_AGC:
-      st->agc_enabled = (*(int*)ptr);
+      st->agc_enabled = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_PREPROCESS_GET_AGC:
-      (*(int*)ptr) = st->agc_enabled;
+      (*(spx_int32_t*)ptr) = st->agc_enabled;
       break;
 
    case SPEEX_PREPROCESS_SET_AGC_LEVEL:
@@ -969,21 +1073,40 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
    case SPEEX_PREPROCESS_GET_AGC_LEVEL:
       (*(float*)ptr) = st->agc_level;
       break;
-
+   case SPEEX_PREPROCESS_SET_AGC_INCREMENT:
+      st->max_increase_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
+      break;
+   case SPEEX_PREPROCESS_GET_AGC_INCREMENT:
+      (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_increase_step)*st->sampling_rate/st->frame_size);
+      break;
+   case SPEEX_PREPROCESS_SET_AGC_DECREMENT:
+      st->max_decrease_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
+      break;
+   case SPEEX_PREPROCESS_GET_AGC_DECREMENT:
+      (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_decrease_step)*st->sampling_rate/st->frame_size);
+      break;
+   case SPEEX_PREPROCESS_SET_AGC_MAX_GAIN:
+      st->max_gain = exp(0.11513f * (*(spx_int32_t*)ptr));
+      break;
+   case SPEEX_PREPROCESS_GET_AGC_MAX_GAIN:
+      (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_gain));
+      break;
+#endif
    case SPEEX_PREPROCESS_SET_VAD:
-      st->vad_enabled = (*(int*)ptr);
+      speex_warning("The VAD has been replaced by a hack pending a complete rewrite");
+      st->vad_enabled = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_PREPROCESS_GET_VAD:
-      (*(int*)ptr) = st->vad_enabled;
+      (*(spx_int32_t*)ptr) = st->vad_enabled;
       break;
    
    case SPEEX_PREPROCESS_SET_DEREVERB:
-      st->dereverb_enabled = (*(int*)ptr);
+      st->dereverb_enabled = (*(spx_int32_t*)ptr);
       for (i=0;i<st->ps_size;i++)
          st->reverb_estimate[i]=0;
       break;
    case SPEEX_PREPROCESS_GET_DEREVERB:
-      (*(int*)ptr) = st->dereverb_enabled;
+      (*(spx_int32_t*)ptr) = st->dereverb_enabled;
       break;
 
    case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
@@ -1001,24 +1124,47 @@ int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
       break;
 
    case SPEEX_PREPROCESS_SET_PROB_START:
-      st->speech_prob_start = (*(int*)ptr) / 100.0;
-      if ( st->speech_prob_start > 1 || st->speech_prob_start < 0 )
-         st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
+      *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr)); /* the request is a percentage: clamp it to 0..100 */
+      st->speech_prob_start = DIV32_16(MULT16_16(Q15_ONE,*(spx_int32_t*)ptr), 100); /* rescale to the Q15_ONE-based range that Pframe is compared in */
       break;
    case SPEEX_PREPROCESS_GET_PROB_START:
-      (*(int*)ptr) = st->speech_prob_start * 100;
+      (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100);
       break;
 
    case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
-      st->speech_prob_continue = (*(int*)ptr) / 100.0;
-      if ( st->speech_prob_continue > 1 || st->speech_prob_continue < 0 )
-         st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
+      *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr)); /* the request is a percentage: clamp it to 0..100 */
+      st->speech_prob_continue = DIV32_16(MULT16_16(Q15_ONE,*(spx_int32_t*)ptr), 100); /* rescale to the Q15_ONE-based range that Pframe is compared in */
       break;
    case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
-      (*(int*)ptr) = st->speech_prob_continue * 100;
+      (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100);
+      break;
+
+   case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS:
+      st->noise_suppress = -ABS(*(spx_int32_t*)ptr);
+      break;
+   case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS:
+      (*(spx_int32_t*)ptr) = st->noise_suppress;
+      break;
+   case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS:
+      st->echo_suppress = -ABS(*(spx_int32_t*)ptr);
+      break;
+   case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS:
+      (*(spx_int32_t*)ptr) = st->echo_suppress;
+      break;
+   case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE:
+      st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr);
+      break;
+   case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE:
+      (*(spx_int32_t*)ptr) = st->echo_suppress_active;
+      break;
+   case SPEEX_PREPROCESS_SET_ECHO_STATE:
+      st->echo_state = (SpeexEchoState*)ptr;
+      break;
+   case SPEEX_PREPROCESS_GET_ECHO_STATE:
+      (*(SpeexEchoState**)ptr) = (SpeexEchoState*)st->echo_state; /* store through ptr: assigning to the local ptr itself never reaches the caller */
       break;
 
-      default:
+   default:
       speex_warning_int("Unknown speex_preprocess_ctl request: ", request);
       return -1;
    }
diff --git a/libspeex/pseudofloat.h b/libspeex/pseudofloat.h
index 9ff1b75..a6c4762 100644
--- a/libspeex/pseudofloat.h
+++ b/libspeex/pseudofloat.h
@@ -2,6 +2,15 @@
 /**
    @file pseudofloat.h
    @brief Pseudo-floating point
+ * This header file provides a lightweight floating point type for
+ * use on fixed-point platforms when a large dynamic range is 
+ * required. The new type is not compatible with the 32-bit IEEE format,
+ * it is not even remotely as accurate as 32-bit floats, and is not
+ * even guaranteed to produce even remotely correct results for code
+ * other than Speex. It makes all kinds of shortcuts that are acceptable
+ * for Speex, but may not be acceptable for your application. You're
+ * quite welcome to reuse this code and improve it, but don't assume
+ * it works out of the box. Most likely, it doesn't.
  */
 /*
    Redistribution and use in source and binary forms, with or without
@@ -65,18 +74,8 @@ static inline spx_float_t PSEUDOFLOAT(spx_int32_t x)
       spx_float_t r = {0,0};
       return r;
    }
-   while (x>32767)
-   {
-      x >>= 1;
-      /*x *= .5;*/
-      e++;
-   }
-   while (x<16383)
-   {
-      x <<= 1;
-      /*x *= 2;*/
-      e--;
-   }
+   e = spx_ilog2(ABS32(x))-14;
+   x = VSHR32(x, e);
    if (sign)
    {
       spx_float_t r;
@@ -167,9 +166,9 @@ static inline spx_float_t FLOAT_SUB(spx_float_t a, spx_float_t b)
 static inline int FLOAT_LT(spx_float_t a, spx_float_t b)
 {
    if (a.m==0)
-      return b.m<0;
+      return b.m>0;
    else if (b.m==0)
-      return a.m>0;   
+      return a.m<0;   
    if ((a).e > (b).e)
       return ((a).m>>1) < ((b).m>>MIN(15,(a).e-(b).e+1));
    else 
@@ -205,6 +204,14 @@ static inline spx_float_t FLOAT_MULT(spx_float_t a, spx_float_t b)
    return r;   
 }
 
+static inline spx_float_t FLOAT_AMULT(spx_float_t a, spx_float_t b)
+{ /* Multiply two pseudo-floats; the result mantissa is not renormalised here */
+   spx_float_t r;
+   r.m = (spx_int16_t)((spx_int32_t)(a).m*(b).m>>15); /* 32-bit product of the mantissas, scaled down by 15 bits */
+   r.e = (a).e+(b).e+15; /* exponents add; +15 puts back the bits shifted out of the mantissa */
+   return r;   
+}
+
 
 static inline spx_float_t FLOAT_SHL(spx_float_t a, int b)
 {
@@ -217,68 +224,53 @@ static inline spx_float_t FLOAT_SHL(spx_float_t a, int b)
 static inline spx_int16_t FLOAT_EXTRACT16(spx_float_t a)
 {
    if (a.e<0)
-      return EXTRACT16((EXTEND32(a.m)+(1<<(-a.e-1)))>>-a.e);
+      return EXTRACT16((EXTEND32(a.m)+(EXTEND32(1)<<(-a.e-1)))>>-a.e); /* add half of the discarded range before shifting (rounding); the 1 is widened before it is shifted */
    else
       return a.m<<a.e;
 }
 
-static inline spx_int32_t FLOAT_MUL32(spx_float_t a, spx_word32_t b)
+static inline spx_int32_t FLOAT_EXTRACT32(spx_float_t a)
 {
-   if (a.e<-15)
-      return SHR32(MULT16_32_Q15(a.m, b),-a.e-15);
+   if (a.e<0)
+      return (EXTEND32(a.m)+(EXTEND32(1)<<(-a.e-1)))>>-a.e; /* negative exponent: add half of the discarded range, then shift right */
    else
-      return SHL32(MULT16_32_Q15(a.m, b),15+a.e);
+      return EXTEND32(a.m)<<a.e; /* non-negative exponent: widen first, then shift left */
+}
+
+static inline spx_int32_t FLOAT_MUL32(spx_float_t a, spx_word32_t b)
+{
+   return VSHR32(MULT16_32_Q15(a.m, b),-a.e-15); /* one variable-direction shift replaces the old SHR32/SHL32 branches - presumably VSHR32 shifts left for negative counts; confirm */
 }
 
 static inline spx_float_t FLOAT_MUL32U(spx_word32_t a, spx_word32_t b)
 {
-   int e=0;
+   int e1, e2; /* spx_ilog2() results for |a| and |b| */
    spx_float_t r;
-   /* FIXME: Handle the sign */
-   if (a==0)
+   if (a==0 || b==0) /* a zero operand gives FLOAT_ZERO without calling spx_ilog2() */
    {
       return FLOAT_ZERO;
    }
-   while (a>32767)
-   {
-      a >>= 1;
-      e++;
-   }
-   while (a<16384)
-   {
-      a <<= 1;
-      e--;
-   }
-   while (b>32767)
-   {
-      b >>= 1;
-      e++;
-   }
-   while (b<16384)
-   {
-      b <<= 1;
-      e--;
-   }
+   e1 = spx_ilog2(ABS32(a)); /* presumably the index of the top set bit of |a| - confirm against spx_ilog2() */
+   a = VSHR32(a, e1-14); /* replaces the removed shift loops: bring a into the 16-bit mantissa range in one step */
+   e2 = spx_ilog2(ABS32(b));
+   b = VSHR32(b, e2-14); /* same alignment for b */
    r.m = MULT16_16_Q15(a,b);
-   r.e = e+15;
+   r.e = e1+e2-13; /* (e1-14)+(e2-14) from the two alignments, +15 as in the old e+15 */
    return r;
 }
 
+/* Do NOT attempt to divide by a negative number */
 static inline spx_float_t FLOAT_DIV32_FLOAT(spx_word32_t a, spx_float_t b)
 {
    int e=0;
    spx_float_t r;
-   /* FIXME: Handle the sign */
    if (a==0)
    {
       return FLOAT_ZERO;
    }
-   while (a<SHL32(EXTEND32(b.m),14))
-   {
-      a <<= 1;
-      e--;
-   }
-   while (a>=SHL32(EXTEND32(b.m-1),15))
+   e = spx_ilog2(ABS32(a))-spx_ilog2(b.m-1)-15;
+   a = VSHR32(a, e);
+   if (ABS32(a)>=SHL32(EXTEND32(b.m-1),15))
    {
       a >>= 1;
       e++;
@@ -289,41 +281,47 @@ static inline spx_float_t FLOAT_DIV32_FLOAT(spx_word32_t a, spx_float_t b)
 }
 
 
+/* Do NOT attempt to divide by a negative number */
 static inline spx_float_t FLOAT_DIV32(spx_word32_t a, spx_word32_t b)
 {
-   int e=0;
+   int e0=0,e=0;
    spx_float_t r;
-   /* FIXME: Handle the sign */
    if (a==0)
    {
       return FLOAT_ZERO;
    }
-   while (b>32767)
+   if (b>32767)
    {
-      b >>= 1;
-      e--;
+      e0 = spx_ilog2(b)-14;
+      b = VSHR32(b, e0);
+      e0 = -e0;
    }
-   while (a<SHL32(b,14))
-   {
-      a <<= 1;
-      e--;
-   }
-   while (a>=SHL32(b-1,15))
+   e = spx_ilog2(ABS32(a))-spx_ilog2(b-1)-15;
+   a = VSHR32(a, e);
+   if (ABS32(a)>=SHL32(EXTEND32(b-1),15))
    {
       a >>= 1;
       e++;
    }
+   e += e0;
    r.m = DIV32_16(a,b);
    r.e = e;
    return r;
 }
 
+/* Do NOT attempt to divide by a negative number */
 static inline spx_float_t FLOAT_DIVU(spx_float_t a, spx_float_t b)
 {
    int e=0;
    spx_int32_t num;
    spx_float_t r;
+   if (b.m<=0)
+   {
+      speex_warning_int("Attempted to divide by", b.m);
+      return FLOAT_ONE;
+   }
    num = a.m;
+   a.m = ABS16(a.m);
    while (a.m >= b.m)
    {
       e++;
@@ -339,7 +337,7 @@ static inline spx_float_t FLOAT_SQRT(spx_float_t a)
 {
    spx_float_t r;
    spx_int32_t m;
-   m = a.m << 14;
+   m = SHL32(EXTEND32(a.m), 14);
    r.e = a.e - 14;
    if (r.e & 1)
    {
@@ -359,9 +357,11 @@ static inline spx_float_t FLOAT_SQRT(spx_float_t a)
 #define FLOAT_HALF 0.5f
 #define PSEUDOFLOAT(x) (x)
 #define FLOAT_MULT(a,b) ((a)*(b))
+#define FLOAT_AMULT(a,b) ((a)*(b))
 #define FLOAT_MUL32(a,b) ((a)*(b))
 #define FLOAT_DIV32(a,b) ((a)/(b))
 #define FLOAT_EXTRACT16(a) (a)
+#define FLOAT_EXTRACT32(a) (a)
 #define FLOAT_ADD(a,b) ((a)+(b))
 #define FLOAT_SUB(a,b) ((a)-(b))
 #define REALFLOAT(x) (x)
diff --git a/libspeex/quant_lsp.c b/libspeex/quant_lsp.c
index bfca587..d907b98 100644
--- a/libspeex/quant_lsp.c
+++ b/libspeex/quant_lsp.c
@@ -417,7 +417,7 @@ void lsp_quant_48k(spx_lsp_t *lsp, spx_lsp_t *qlsp, int order, SpeexBits *bits)
 
 #ifdef FIXED_POINT
    for (i=0;i<order;i++)
-      qlsp[i]=PSHR(qlsp[i],2);
+      qlsp[i]=PSHR16(qlsp[i],2);
 #else
    for (i=0;i<order;i++)
       qlsp[i]=qlsp[i]*0.00097655;
diff --git a/libspeex/resample.c b/libspeex/resample.c
new file mode 100644
index 0000000..7135a29
--- /dev/null
+++ b/libspeex/resample.c
@@ -0,0 +1,1062 @@
+/* Copyright (C) 2007 Jean-Marc Valin
+      
+   File: resample.c
+   Arbitrary resampling code
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+   The design goals of this code are:
+      - Very fast algorithm
+      - SIMD-friendly algorithm
+      - Low memory requirement
+      - Good *perceptual* quality (and not best SNR)
+
+   The code is working, but it's in a very early stage, so it may have
+   artifacts, noise or subliminal messages from satan. Also, the API 
+   isn't stable and I can actually promise that I *will* change the API
+   some time in the future.
+
+TODO list:
+      - Variable calculation resolution depending on quality setting
+         - Single vs double in float mode
+         - 16-bit vs 32-bit (sinc only) in fixed-point mode
+      - Make sure the filter update works even when changing params 
+             after only a few samples processed
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef OUTSIDE_SPEEX
+#include <stdlib.h>
+static void *speex_alloc (int size) {return calloc(size,1);}
+static void *speex_realloc (void *ptr, int size) {return realloc(ptr, size);}
+static void speex_free (void *ptr) {free(ptr);}
+#include "speex_resampler.h"
+#include "arch.h"
+#else /* OUTSIDE_SPEEX */
+               
+#include "speex/speex_resampler.h"
+#include "misc.h"
+#endif /* OUTSIDE_SPEEX */
+
+#include <math.h>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846 /* fallback when <math.h> does not provide M_PI */
+#endif
+
+#ifdef FIXED_POINT
+#define WORD2INT(x) ((x) < -32767 ? -32768 : ((x) > 32766 ? 32767 : (x)))  
+#else
+#define WORD2INT(x) ((x) < -32767.5f ? -32768 : ((x) > 32766.5f ? 32767 : floor(.5+(x))))  
+#endif
+               
+/*#define float double*/
+#define FILTER_SIZE 64
+#define OVERSAMPLE 8
+
+#define IMAX(a,b) ((a) > (b) ? (a) : (b))
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+typedef int (*resampler_basic_func)(SpeexResamplerState *, spx_uint32_t , const spx_word16_t *, spx_uint32_t *, spx_word16_t *, spx_uint32_t *);
+
+struct SpeexResamplerState_ {
+   spx_uint32_t in_rate;
+   spx_uint32_t out_rate;
+   spx_uint32_t num_rate;
+   spx_uint32_t den_rate;
+   
+   int    quality;
+   spx_uint32_t nb_channels;
+   spx_uint32_t filt_len;
+   spx_uint32_t mem_alloc_size;
+   int          int_advance;
+   int          frac_advance;
+   float  cutoff;
+   spx_uint32_t oversample;
+   int          initialised;
+   int          started;
+   
+   /* These are per-channel */
+   spx_int32_t  *last_sample;
+   spx_uint32_t *samp_frac_num;
+   spx_uint32_t *magic_samples;
+   
+   spx_word16_t *mem;
+   spx_word16_t *sinc_table;
+   spx_uint32_t sinc_table_length;
+   resampler_basic_func resampler_ptr;
+         
+   int    in_stride;
+   int    out_stride;
+} ;
+
+static double kaiser12_table[68] = {
+   0.99859849, 1.00000000, 0.99859849, 0.99440475, 0.98745105, 0.97779076,
+   0.96549770, 0.95066529, 0.93340547, 0.91384741, 0.89213598, 0.86843014,
+   0.84290116, 0.81573067, 0.78710866, 0.75723148, 0.72629970, 0.69451601,
+   0.66208321, 0.62920216, 0.59606986, 0.56287762, 0.52980938, 0.49704014,
+   0.46473455, 0.43304576, 0.40211431, 0.37206735, 0.34301800, 0.31506490,
+   0.28829195, 0.26276832, 0.23854851, 0.21567274, 0.19416736, 0.17404546,
+   0.15530766, 0.13794294, 0.12192957, 0.10723616, 0.09382272, 0.08164178,
+   0.07063950, 0.06075685, 0.05193064, 0.04409466, 0.03718069, 0.03111947,
+   0.02584161, 0.02127838, 0.01736250, 0.01402878, 0.01121463, 0.00886058,
+   0.00691064, 0.00531256, 0.00401805, 0.00298291, 0.00216702, 0.00153438,
+   0.00105297, 0.00069463, 0.00043489, 0.00025272, 0.00013031, 0.0000527734,
+   0.00001000, 0.00000000};
+/*
+static double kaiser12_table[36] = {
+   0.99440475, 1.00000000, 0.99440475, 0.97779076, 0.95066529, 0.91384741,
+   0.86843014, 0.81573067, 0.75723148, 0.69451601, 0.62920216, 0.56287762,
+   0.49704014, 0.43304576, 0.37206735, 0.31506490, 0.26276832, 0.21567274,
+   0.17404546, 0.13794294, 0.10723616, 0.08164178, 0.06075685, 0.04409466,
+   0.03111947, 0.02127838, 0.01402878, 0.00886058, 0.00531256, 0.00298291,
+   0.00153438, 0.00069463, 0.00025272, 0.0000527734, 0.00000500, 0.00000000};
+*/
+static double kaiser10_table[36] = {
+   0.99537781, 1.00000000, 0.99537781, 0.98162644, 0.95908712, 0.92831446,
+   0.89005583, 0.84522401, 0.79486424, 0.74011713, 0.68217934, 0.62226347,
+   0.56155915, 0.50119680, 0.44221549, 0.38553619, 0.33194107, 0.28205962,
+   0.23636152, 0.19515633, 0.15859932, 0.12670280, 0.09935205, 0.07632451,
+   0.05731132, 0.04193980, 0.02979584, 0.02044510, 0.01345224, 0.00839739,
+   0.00488951, 0.00257636, 0.00115101, 0.00035515, 0.00000000, 0.00000000};
+
+static double kaiser8_table[36] = {
+   0.99635258, 1.00000000, 0.99635258, 0.98548012, 0.96759014, 0.94302200,
+   0.91223751, 0.87580811, 0.83439927, 0.78875245, 0.73966538, 0.68797126,
+   0.63451750, 0.58014482, 0.52566725, 0.47185369, 0.41941150, 0.36897272,
+   0.32108304, 0.27619388, 0.23465776, 0.19672670, 0.16255380, 0.13219758,
+   0.10562887, 0.08273982, 0.06335451, 0.04724088, 0.03412321, 0.02369490,
+   0.01563093, 0.00959968, 0.00527363, 0.00233883, 0.00050000, 0.00000000};
+   
+static double kaiser6_table[36] = {
+   0.99733006, 1.00000000, 0.99733006, 0.98935595, 0.97618418, 0.95799003,
+   0.93501423, 0.90755855, 0.87598009, 0.84068475, 0.80211977, 0.76076565,
+   0.71712752, 0.67172623, 0.62508937, 0.57774224, 0.53019925, 0.48295561,
+   0.43647969, 0.39120616, 0.34752997, 0.30580127, 0.26632152, 0.22934058,
+   0.19505503, 0.16360756, 0.13508755, 0.10953262, 0.08693120, 0.06722600,
+   0.05031820, 0.03607231, 0.02432151, 0.01487334, 0.00752000, 0.00000000};
+
+struct FuncDef {
+   double *table;
+   int oversample;
+};
+      
+static struct FuncDef _KAISER12 = {kaiser12_table, 64};
+#define KAISER12 (&_KAISER12)
+/*static struct FuncDef _KAISER12 = {kaiser12_table, 32};
+#define KAISER12 (&_KAISER12)*/
+static struct FuncDef _KAISER10 = {kaiser10_table, 32};
+#define KAISER10 (&_KAISER10)
+static struct FuncDef _KAISER8 = {kaiser8_table, 32};
+#define KAISER8 (&_KAISER8)
+static struct FuncDef _KAISER6 = {kaiser6_table, 32};
+#define KAISER6 (&_KAISER6)
+
+struct QualityMapping {
+   int base_length;
+   int oversample;
+   float downsample_bandwidth;
+   float upsample_bandwidth;
+   struct FuncDef *window_func;
+};
+
+
+/* This table maps conversion quality to internal parameters. There are two
+   reasons that explain why the up-sampling bandwidth is larger than the 
+   down-sampling bandwidth:
+   1) When up-sampling, we can assume that the spectrum is already attenuated
+      close to the Nyquist rate (from an A/D or a previous resampling filter)
+   2) Any aliasing that occurs very close to the Nyquist rate will be masked
+      by the sinusoids/noise just below the Nyquist rate (guaranteed only for
+      up-sampling).
+*/
+static const struct QualityMapping quality_map[11] = {
+   {  8,  4, 0.830f, 0.860f, KAISER6 }, /* Q0 */
+   { 16,  4, 0.850f, 0.880f, KAISER6 }, /* Q1 */
+   { 32,  4, 0.882f, 0.910f, KAISER6 }, /* Q2 */  /* 82.3% cutoff ( ~60 dB stop) 6  */
+   { 48,  8, 0.895f, 0.917f, KAISER8 }, /* Q3 */  /* 84.9% cutoff ( ~80 dB stop) 8  */
+   { 64,  8, 0.921f, 0.940f, KAISER8 }, /* Q4 */  /* 88.7% cutoff ( ~80 dB stop) 8  */
+   { 80, 16, 0.922f, 0.940f, KAISER10}, /* Q5 */  /* 89.1% cutoff (~100 dB stop) 10 */
+   { 96, 16, 0.940f, 0.945f, KAISER10}, /* Q6 */  /* 91.5% cutoff (~100 dB stop) 10 */
+   {128, 16, 0.950f, 0.950f, KAISER10}, /* Q7 */  /* 93.1% cutoff (~100 dB stop) 10 */
+   {160, 16, 0.960f, 0.960f, KAISER10}, /* Q8 */  /* 94.5% cutoff (~100 dB stop) 10 */
+   {192, 32, 0.968f, 0.968f, KAISER12}, /* Q9 */  /* 95.5% cutoff (~100 dB stop) 10 */
+   {256, 32, 0.975f, 0.975f, KAISER12}, /* Q10 */ /* 96.6% cutoff (~100 dB stop) 10 */
+};
+/*8,24,40,56,80,104,128,160,200,256,320*/
+static double compute_func(float x, struct FuncDef *func)
+{
+   float y, frac;
+   double interp[4];
+   int ind; 
+   y = x*func->oversample;
+   ind = (int)floor(y);
+   frac = (y-ind);
+   /* CSE will handle the repeated powers */
+   interp[3] =  -0.1666666667*frac + 0.1666666667*(frac*frac*frac);
+   interp[2] = frac + 0.5*(frac*frac) - 0.5*(frac*frac*frac);
+   /*interp[2] = 1.f - 0.5f*frac - frac*frac + 0.5f*frac*frac*frac;*/
+   interp[0] = -0.3333333333*frac + 0.5*(frac*frac) - 0.1666666667*(frac*frac*frac);
+   /* Just to make sure we don't have rounding problems */
+   interp[1] = 1.f-interp[3]-interp[2]-interp[0];
+   
+   /*sum = frac*accum[1] + (1-frac)*accum[2];*/
+   return interp[0]*func->table[ind] + interp[1]*func->table[ind+1] + interp[2]*func->table[ind+2] + interp[3]*func->table[ind+3];
+}
+
+#if 0
+#include <stdio.h>
+int main(int argc, char **argv)
+{
+   int i;
+   for (i=0;i<256;i++)
+   {
+      printf ("%f\n", compute_func(i/256., KAISER12));
+   }
+   return 0;
+}
+#endif
+
+#ifdef FIXED_POINT
+/* The slow way of computing a sinc for the table. Should improve that some day */
+static spx_word16_t sinc(float cutoff, float x, int N, struct FuncDef *window_func)
+{
+   /*fprintf (stderr, "%f ", x);*/
+   float xx = x * cutoff;
+   if (fabs(x)<1e-6f)
+      return WORD2INT(32768.*cutoff);
+   else if (fabs(x) > .5f*N)
+      return 0;
+   /*FIXME: Can it really be any slower than this? */
+   return WORD2INT(32768.*cutoff*sin(M_PI*xx)/(M_PI*xx) * compute_func(fabs(2.*x/N), window_func));
+}
+#else
+/* The slow way of computing a sinc for the table. Should improve that some day */
+static spx_word16_t sinc(float cutoff, float x, int N, struct FuncDef *window_func)
+{
+   /*fprintf (stderr, "%f ", x);*/
+   float xx = x * cutoff;
+   if (fabs(x)<1e-6)
+      return cutoff;
+   else if (fabs(x) > .5*N)
+      return 0;
+   /*FIXME: Can it really be any slower than this? */
+   return cutoff*sin(M_PI*xx)/(M_PI*xx) * compute_func(fabs(2.*x/N), window_func);
+}
+#endif
+
+#ifdef FIXED_POINT
+static void cubic_coef(spx_word16_t x, spx_word16_t interp[4])
+{
+   /* Compute interpolation coefficients. I'm not sure whether this corresponds to cubic interpolation
+   but I know it's MMSE-optimal on a sinc */
+   spx_word16_t x2, x3;
+   x2 = MULT16_16_P15(x, x);
+   x3 = MULT16_16_P15(x, x2);
+   interp[0] = PSHR32(MULT16_16(QCONST16(-0.16667f, 15),x) + MULT16_16(QCONST16(0.16667f, 15),x3),15);
+   interp[1] = EXTRACT16(EXTEND32(x) + SHR32(SUB32(EXTEND32(x2),EXTEND32(x3)),1));
+   interp[3] = PSHR32(MULT16_16(QCONST16(-0.33333f, 15),x) + MULT16_16(QCONST16(.5f,15),x2) - MULT16_16(QCONST16(0.16667f, 15),x3),15);
+   /* Just to make sure we don't have rounding problems */
+   interp[2] = Q15_ONE-interp[0]-interp[1]-interp[3];
+   if (interp[2]<32767)
+      interp[2]+=1;
+}
+#else
+static void cubic_coef(spx_word16_t frac, spx_word16_t interp[4])
+{
+   /* Compute interpolation coefficients. I'm not sure whether this corresponds to cubic interpolation
+   but I know it's MMSE-optimal on a sinc */
+   interp[0] =  -0.16667f*frac + 0.16667f*frac*frac*frac;
+   interp[1] = frac + 0.5f*frac*frac - 0.5f*frac*frac*frac;
+   /*interp[2] = 1.f - 0.5f*frac - frac*frac + 0.5f*frac*frac*frac;*/
+   interp[3] = -0.33333f*frac + 0.5f*frac*frac - 0.16667f*frac*frac*frac;
+   /* Just to make sure we don't have rounding problems */
+   interp[2] = 1.-interp[0]-interp[1]-interp[3];
+}
+#endif
+
+static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t channel_index, const spx_word16_t *in, spx_uint32_t *in_len, spx_word16_t *out, spx_uint32_t *out_len)
+{
+   int N = st->filt_len;
+   int out_sample = 0;
+   spx_word16_t *mem;
+   int last_sample = st->last_sample[channel_index];
+   spx_uint32_t samp_frac_num = st->samp_frac_num[channel_index];
+   mem = st->mem + channel_index * st->mem_alloc_size;
+   while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
+   {
+      int j;
+      spx_word32_t sum=0;
+      
+      /* We already have all the filter coefficients pre-computed in the table */
+      const spx_word16_t *ptr;
+      /* Do the memory part */
+      for (j=0;last_sample-N+1+j < 0;j++)
+      {
+         sum += MULT16_16(mem[last_sample+j],st->sinc_table[samp_frac_num*st->filt_len+j]);
+      }
+      
+      /* Do the new part */
+      ptr = in+st->in_stride*(last_sample-N+1+j);
+      for (;j<N;j++)
+      {
+         sum += MULT16_16(*ptr,st->sinc_table[samp_frac_num*st->filt_len+j]);
+         ptr += st->in_stride;
+      }
+   
+      *out = PSHR32(sum,15);
+      out += st->out_stride;
+      out_sample++;
+      last_sample += st->int_advance;
+      samp_frac_num += st->frac_advance;
+      if (samp_frac_num >= st->den_rate)
+      {
+         samp_frac_num -= st->den_rate;
+         last_sample++;
+      }
+   }
+   st->last_sample[channel_index] = last_sample;
+   st->samp_frac_num[channel_index] = samp_frac_num;
+   return out_sample;
+}
+
+#ifdef FIXED_POINT
+#else
+/* This is the same as the previous function, except with a double-precision accumulator */
+static int resampler_basic_direct_double(SpeexResamplerState *st, spx_uint32_t channel_index, const spx_word16_t *in, spx_uint32_t *in_len, spx_word16_t *out, spx_uint32_t *out_len)
+{
+   int N = st->filt_len;
+   int out_sample = 0;
+   spx_word16_t *mem;
+   int last_sample = st->last_sample[channel_index];
+   spx_uint32_t samp_frac_num = st->samp_frac_num[channel_index];
+   mem = st->mem + channel_index * st->mem_alloc_size;
+   while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
+   {
+      int j;
+      double sum=0;
+      
+      /* We already have all the filter coefficients pre-computed in the table */
+      const spx_word16_t *ptr;
+      /* Do the memory part */
+      for (j=0;last_sample-N+1+j < 0;j++)
+      {
+         sum += MULT16_16(mem[last_sample+j],(double)st->sinc_table[samp_frac_num*st->filt_len+j]);
+      }
+      
+      /* Do the new part */
+      ptr = in+st->in_stride*(last_sample-N+1+j);
+      for (;j<N;j++)
+      {
+         sum += MULT16_16(*ptr,(double)st->sinc_table[samp_frac_num*st->filt_len+j]);
+         ptr += st->in_stride;
+      }
+   
+      *out = sum;
+      out += st->out_stride;
+      out_sample++;
+      last_sample += st->int_advance;
+      samp_frac_num += st->frac_advance;
+      if (samp_frac_num >= st->den_rate)
+      {
+         samp_frac_num -= st->den_rate;
+         last_sample++;
+      }
+   }
+   st->last_sample[channel_index] = last_sample;
+   st->samp_frac_num[channel_index] = samp_frac_num;
+   return out_sample;
+}
+#endif
+
+static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint32_t channel_index, const spx_word16_t *in, spx_uint32_t *in_len, spx_word16_t *out, spx_uint32_t *out_len)
+{
+   int N = st->filt_len;
+   int out_sample = 0;
+   spx_word16_t *mem;
+   int last_sample = st->last_sample[channel_index];
+   spx_uint32_t samp_frac_num = st->samp_frac_num[channel_index];
+   mem = st->mem + channel_index * st->mem_alloc_size;
+   while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
+   {
+      int j;
+      spx_word32_t sum=0;
+      
+      /* We need to interpolate the sinc filter */
+      spx_word32_t accum[4] = {0.f,0.f, 0.f, 0.f};
+      spx_word16_t interp[4];
+      const spx_word16_t *ptr;
+      int offset;
+      spx_word16_t frac;
+      offset = samp_frac_num*st->oversample/st->den_rate;
+#ifdef FIXED_POINT
+      frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
+#else
+      frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
+#endif
+         /* This code is written like this to make it easy to optimise with SIMD.
+      For most DSPs, it would be best to split the loops in two because most DSPs 
+      have only two accumulators */
+      for (j=0;last_sample-N+1+j < 0;j++)
+      {
+         spx_word16_t curr_mem = mem[last_sample+j];
+         accum[0] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+         accum[1] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+         accum[2] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+      }
+      ptr = in+st->in_stride*(last_sample-N+1+j);
+      /* Do the new part */
+      for (;j<N;j++)
+      {
+         spx_word16_t curr_in = *ptr;
+         ptr += st->in_stride;
+         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+      }
+      cubic_coef(frac, interp);
+      sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
+   
+      *out = PSHR32(sum,15);
+      out += st->out_stride;
+      out_sample++;
+      last_sample += st->int_advance;
+      samp_frac_num += st->frac_advance;
+      if (samp_frac_num >= st->den_rate)
+      {
+         samp_frac_num -= st->den_rate;
+         last_sample++;
+      }
+   }
+   st->last_sample[channel_index] = last_sample;
+   st->samp_frac_num[channel_index] = samp_frac_num;
+   return out_sample;
+}
+
+#ifdef FIXED_POINT
+#else
+/* This is the same as the previous function, except with a double-precision accumulator */
+static int resampler_basic_interpolate_double(SpeexResamplerState *st, spx_uint32_t channel_index, const spx_word16_t *in, spx_uint32_t *in_len, spx_word16_t *out, spx_uint32_t *out_len)
+{
+   int N = st->filt_len;
+   int out_sample = 0;
+   spx_word16_t *mem;
+   int last_sample = st->last_sample[channel_index];
+   spx_uint32_t samp_frac_num = st->samp_frac_num[channel_index];
+   mem = st->mem + channel_index * st->mem_alloc_size;
+   while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
+   {
+      int j;
+      spx_word32_t sum=0;
+      
+      /* We need to interpolate the sinc filter */
+      double accum[4] = {0.f,0.f, 0.f, 0.f};
+      float interp[4];
+      const spx_word16_t *ptr;
+      float alpha = ((float)samp_frac_num)/st->den_rate;
+      int offset = samp_frac_num*st->oversample/st->den_rate;
+      float frac = alpha*st->oversample - offset;
+         /* This code is written like this to make it easy to optimise with SIMD.
+      For most DSPs, it would be best to split the loops in two because most DSPs 
+      have only two accumulators */
+      for (j=0;last_sample-N+1+j < 0;j++)
+      {
+         double curr_mem = mem[last_sample+j];
+         accum[0] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+         accum[1] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+         accum[2] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_mem,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+      }
+      ptr = in+st->in_stride*(last_sample-N+1+j);
+      /* Do the new part */
+      for (;j<N;j++)
+      {
+         double curr_in = *ptr;
+         ptr += st->in_stride;
+         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+      }
+      cubic_coef(frac, interp);
+      sum = interp[0]*accum[0] + interp[1]*accum[1] + interp[2]*accum[2] + interp[3]*accum[3];
+   
+      *out = PSHR32(sum,15);
+      out += st->out_stride;
+      out_sample++;
+      last_sample += st->int_advance;
+      samp_frac_num += st->frac_advance;
+      if (samp_frac_num >= st->den_rate)
+      {
+         samp_frac_num -= st->den_rate;
+         last_sample++;
+      }
+   }
+   st->last_sample[channel_index] = last_sample;
+   st->samp_frac_num[channel_index] = samp_frac_num;
+   return out_sample;
+}
+#endif
+
+static void update_filter(SpeexResamplerState *st)
+{
+   spx_uint32_t old_length;
+   
+   old_length = st->filt_len;
+   st->oversample = quality_map[st->quality].oversample;
+   st->filt_len = quality_map[st->quality].base_length;
+   
+   if (st->num_rate > st->den_rate)
+   {
+      /* down-sampling */
+      st->cutoff = quality_map[st->quality].downsample_bandwidth * st->den_rate / st->num_rate;
+      /* FIXME: divide the numerator and denominator by a certain amount if they're too large */
+      st->filt_len = st->filt_len*st->num_rate / st->den_rate;
+      /* Round down to make sure we have a multiple of 4 */
+      st->filt_len &= (~0x3);
+      if (2*st->den_rate < st->num_rate)
+         st->oversample >>= 1;
+      if (4*st->den_rate < st->num_rate)
+         st->oversample >>= 1;
+      if (8*st->den_rate < st->num_rate)
+         st->oversample >>= 1;
+      if (16*st->den_rate < st->num_rate)
+         st->oversample >>= 1;
+      if (st->oversample < 1)
+         st->oversample = 1;
+   } else {
+      /* up-sampling */
+      st->cutoff = quality_map[st->quality].upsample_bandwidth;
+   }
+
+   /* Choose the resampling type that requires the least amount of memory */
+   if (st->den_rate <= st->oversample)
+   {
+      spx_uint32_t i;
+      if (!st->sinc_table)
+         st->sinc_table = (spx_word16_t *)speex_alloc(st->filt_len*st->den_rate*sizeof(spx_word16_t));
+      else if (st->sinc_table_length < st->filt_len*st->den_rate)
+      {
+         st->sinc_table = (spx_word16_t *)speex_realloc(st->sinc_table,st->filt_len*st->den_rate*sizeof(spx_word16_t));
+         st->sinc_table_length = st->filt_len*st->den_rate;
+      }
+      for (i=0;i<st->den_rate;i++)
+      {
+         spx_uint32_t j;
+         for (j=0;j<st->filt_len;j++)
+         {
+            st->sinc_table[i*st->filt_len+j] = sinc(st->cutoff,((j-st->filt_len/2+1)-((float)i)/st->den_rate), st->filt_len, quality_map[st->quality].window_func);
+         }
+      }
+#ifdef FIXED_POINT
+      st->resampler_ptr = resampler_basic_direct_single;
+#else
+      if (st->quality>8)
+         st->resampler_ptr = resampler_basic_direct_double;
+      else
+         st->resampler_ptr = resampler_basic_direct_single;
+#endif
+      /*fprintf (stderr, "resampler uses direct sinc table and normalised cutoff %f\n", cutoff);*/
+   } else {
+      spx_int32_t i;
+      if (!st->sinc_table)
+         st->sinc_table = (spx_word16_t *)speex_alloc((st->filt_len*st->oversample+8)*sizeof(spx_word16_t));
+      else if (st->sinc_table_length < st->filt_len*st->oversample+8)
+      {
+         st->sinc_table = (spx_word16_t *)speex_realloc(st->sinc_table,(st->filt_len*st->oversample+8)*sizeof(spx_word16_t));
+         st->sinc_table_length = st->filt_len*st->oversample+8;
+      }
+      for (i=-4;i<(spx_int32_t)(st->oversample*st->filt_len+4);i++)
+         st->sinc_table[i+4] = sinc(st->cutoff,(i/(float)st->oversample - st->filt_len/2), st->filt_len, quality_map[st->quality].window_func);
+#ifdef FIXED_POINT
+      st->resampler_ptr = resampler_basic_interpolate_single;
+#else
+      if (st->quality>8)
+         st->resampler_ptr = resampler_basic_interpolate_double;
+      else
+         st->resampler_ptr = resampler_basic_interpolate_single;
+#endif
+      /*fprintf (stderr, "resampler uses interpolated sinc table and normalised cutoff %f\n", cutoff);*/
+   }
+   st->int_advance = st->num_rate/st->den_rate;
+   st->frac_advance = st->num_rate%st->den_rate;
+
+   if (!st->mem)
+   {
+      spx_uint32_t i;
+      st->mem = (spx_word16_t*)speex_alloc(st->nb_channels*(st->filt_len-1) * sizeof(spx_word16_t));
+      for (i=0;i<st->nb_channels*(st->filt_len-1);i++)
+         st->mem[i] = 0;
+      st->mem_alloc_size = st->filt_len-1;
+      /*speex_warning("init filter");*/
+   } else if (!st->started)
+   {
+      spx_uint32_t i;
+      st->mem = (spx_word16_t*)speex_realloc(st->mem, st->nb_channels*(st->filt_len-1) * sizeof(spx_word16_t));
+      for (i=0;i<st->nb_channels*(st->filt_len-1);i++)
+         st->mem[i] = 0;
+      st->mem_alloc_size = st->filt_len-1;
+      /*speex_warning("reinit filter");*/
+   } else if (st->filt_len > old_length)
+   {
+      spx_uint32_t i;
+      /* Increase the filter length */
+      /*speex_warning("increase filter size");*/
+      int old_alloc_size = st->mem_alloc_size;
+      if (st->filt_len-1 > st->mem_alloc_size)
+      {
+         st->mem = (spx_word16_t*)speex_realloc(st->mem, st->nb_channels*(st->filt_len-1) * sizeof(spx_word16_t));
+         st->mem_alloc_size = st->filt_len-1;
+      }
+      for (i=0;i<st->nb_channels;i++)
+      {
+         spx_uint32_t j;
+         /* Copy data going backward */
+         for (j=0;j<old_length-1;j++)
+            st->mem[i*st->mem_alloc_size+(st->filt_len-2-j)] = st->mem[i*old_alloc_size+(old_length-2-j)];
+         /* Then put zeros for lack of anything better */
+         for (;j<st->filt_len-1;j++)
+            st->mem[i*st->mem_alloc_size+(st->filt_len-2-j)] = 0;
+         /* Adjust last_sample */
+         st->last_sample[i] += (st->filt_len - old_length)/2;
+      }
+   } else if (st->filt_len < old_length)
+   {
+      spx_uint32_t i;
+      /* Reduce filter length, this is a bit tricky */
+      /*speex_warning("decrease filter size (unimplemented)");*/
+      /* Adjust last_sample (which will likely end up negative) */
+      /*st->last_sample += (st->filt_len - old_length)/2;*/
+      for (i=0;i<st->nb_channels;i++)
+      {
+         spx_uint32_t j;
+         st->magic_samples[i] = (old_length - st->filt_len)/2;
+         /* Copy data going forward, shifting it down by magic_samples */
+         for (j=0;j<st->filt_len-1+st->magic_samples[i];j++)
+            st->mem[i*st->mem_alloc_size+j] = st->mem[i*st->mem_alloc_size+j+st->magic_samples[i]];
+      }
+   }
+
+}
+
+SpeexResamplerState *speex_resampler_init(spx_uint32_t nb_channels, spx_uint32_t in_rate, spx_uint32_t out_rate, int quality, int *err)
+{
+   return speex_resampler_init_frac(nb_channels, in_rate, out_rate, in_rate, out_rate, quality, err);
+}
+
+/* Allocate and initialise a resampler state for nb_channels channels. quality must be in
+   0..10; otherwise NULL is returned and *err (when non-NULL) is RESAMPLER_ERR_INVALID_ARG */
+SpeexResamplerState *speex_resampler_init_frac(spx_uint32_t nb_channels, spx_uint32_t ratio_num, spx_uint32_t ratio_den, spx_uint32_t in_rate, spx_uint32_t out_rate, int quality, int *err)
+{
+   spx_uint32_t i;
+   SpeexResamplerState *st;
+   if (quality > 10 || quality < 0)
+   {
+      if (err)
+         *err = RESAMPLER_ERR_INVALID_ARG;
+      return NULL;
+   }
+   st = (SpeexResamplerState *)speex_alloc(sizeof(SpeexResamplerState));
+   st->initialised = 0;
+   st->started = 0;
+   st->in_rate = 0;
+   st->out_rate = 0;
+   st->num_rate = 0;
+   st->den_rate = 0;
+   st->quality = -1;
+   st->sinc_table_length = 0;
+   st->mem_alloc_size = 0;
+   st->filt_len = 0;
+   st->mem = 0;
+   st->sinc_table = 0; /* update_filter() tests this pointer to decide whether to allocate */
+   st->resampler_ptr = 0;
+   st->cutoff = 1.f;
+   st->nb_channels = nb_channels;
+   st->in_stride = 1;
+   st->out_stride = 1;
+
+   /* Per channel data, sized by the element type of each array rather than by int */
+   st->last_sample = (spx_int32_t*)speex_alloc(nb_channels*sizeof(spx_int32_t));
+   st->magic_samples = (spx_uint32_t*)speex_alloc(nb_channels*sizeof(spx_uint32_t));
+   st->samp_frac_num = (spx_uint32_t*)speex_alloc(nb_channels*sizeof(spx_uint32_t));
+   for (i=0;i<nb_channels;i++)
+   {
+      st->last_sample[i] = 0;
+      st->magic_samples[i] = 0;
+      st->samp_frac_num[i] = 0;
+   }
+
+   speex_resampler_set_quality(st, quality);
+   speex_resampler_set_rate_frac(st, ratio_num, ratio_den, in_rate, out_rate);
+   update_filter(st);
+
+   st->initialised = 1;
+   if (err)
+      *err = RESAMPLER_ERR_SUCCESS;
+
+   return st;
+}
+
+void speex_resampler_destroy(SpeexResamplerState *st)
+{
+   speex_free(st->mem);
+   speex_free(st->sinc_table);
+   speex_free(st->last_sample);
+   speex_free(st->magic_samples);
+   speex_free(st->samp_frac_num);
+   speex_free(st);
+}
+
+
+
+/* Resample one channel. On return *in_len holds the input samples consumed and *out_len the
+   samples written (including any produced from leftover "magic" samples); always succeeds */
+static int speex_resampler_process_native(SpeexResamplerState *st, spx_uint32_t channel_index, const spx_word16_t *in, spx_uint32_t *in_len, spx_word16_t *out, spx_uint32_t *out_len)
+{
+   int j=0;
+   int N = st->filt_len;
+   int out_sample = 0;
+   spx_word16_t *mem;
+   spx_uint32_t tmp_out_len = 0;
+   mem = st->mem + channel_index * st->mem_alloc_size;
+   st->started = 1;
+   /* Handle the case where we have samples left from a reduction in filter length */
+   if (st->magic_samples[channel_index])
+   {
+      spx_uint32_t tmp_in_len;
+      spx_uint32_t tmp_magic;
+      int saved_in_stride = st->in_stride;
+      tmp_in_len = st->magic_samples[channel_index];
+      tmp_out_len = *out_len;
+      /* magic_samples needs to be set to zero to avoid infinite recursion */
+      tmp_magic = st->magic_samples[channel_index];
+      st->magic_samples[channel_index] = 0;
+      st->in_stride = 1; /* the magic samples are read from mem, which is contiguous */
+      speex_resampler_process_native(st, channel_index, mem+N-1, &tmp_in_len, out, &tmp_out_len);
+      st->in_stride = saved_in_stride;
+      /* If we couldn't process all "magic" input samples, save the rest for next time */
+      if (tmp_in_len < tmp_magic)
+      {
+         spx_uint32_t i;
+         st->magic_samples[channel_index] = tmp_magic-tmp_in_len;
+         for (i=0;i<st->magic_samples[channel_index];i++)
+            mem[N-1+i]=mem[N-1+i+tmp_in_len];
+      }
+      /* Skip what was just written (honouring out_stride) and shrink the room left in out */
+      out += tmp_out_len*st->out_stride;
+      *out_len -= tmp_out_len;
+   }
+   /* Call the right resampler through the function ptr */
+   out_sample = st->resampler_ptr(st, channel_index, in, in_len, out, out_len);
+   if (st->last_sample[channel_index] < (spx_int32_t)*in_len)
+      *in_len = st->last_sample[channel_index];
+   *out_len = out_sample+tmp_out_len;
+   st->last_sample[channel_index] -= *in_len;
+   for (j=0;j<N-1-(spx_int32_t)*in_len;j++)
+      mem[j] = mem[j+*in_len];
+   for (;j<N-1;j++)
+      mem[j] = in[st->in_stride*(j+*in_len-N+1)];
+   return RESAMPLER_ERR_SUCCESS;
+}
+
+#define FIXED_STACK_ALLOC 1024
+
+#ifdef FIXED_POINT
+/* Fixed-point build: resamples one channel of float samples.
+ *
+ * The input is read with st->in_stride, converted to spx_word16_t with
+ * WORD2INT, resampled on contiguous temporaries (strides forced to 1 during
+ * the native call and restored afterwards), and written to out with
+ * st->out_stride.
+ *
+ * in_len  : in: samples available per channel; out: samples consumed
+ * out_len : in: room in out per channel;      out: samples written
+ *
+ * Always returns RESAMPLER_ERR_SUCCESS. */
+int speex_resampler_process_float(SpeexResamplerState *st, spx_uint32_t channel_index, const float *in, spx_uint32_t *in_len, float *out, spx_uint32_t *out_len)
+{
+   spx_uint32_t i;
+   int istride_save, ostride_save;
+#ifdef VAR_ARRAYS
+   spx_word16_t x[*in_len];
+   spx_word16_t y[*out_len];
+   istride_save = st->in_stride;
+   ostride_save = st->out_stride;
+   for (i=0;i<*in_len;i++)
+      x[i] = WORD2INT(in[i*st->in_stride]);
+   st->in_stride = st->out_stride = 1;
+   speex_resampler_process_native(st, channel_index, x, in_len, y, out_len);
+   st->in_stride = istride_save;
+   st->out_stride = ostride_save;
+   for (i=0;i<*out_len;i++)
+      out[i*st->out_stride] = y[i];
+#else
+   /* No variable-size arrays: process in chunks of at most
+      FIXED_STACK_ALLOC samples */
+   spx_word16_t x[FIXED_STACK_ALLOC];
+   spx_word16_t y[FIXED_STACK_ALLOC];
+   spx_uint32_t ilen=*in_len, olen=*out_len;
+   istride_save = st->in_stride;
+   ostride_save = st->out_stride;
+   while (ilen && olen)
+   {
+      spx_uint32_t ichunk, ochunk;
+      ichunk = ilen;
+      ochunk = olen;
+      if (ichunk>FIXED_STACK_ALLOC)
+         ichunk=FIXED_STACK_ALLOC;
+      if (ochunk>FIXED_STACK_ALLOC)
+         ochunk=FIXED_STACK_ALLOC;
+      for (i=0;i<ichunk;i++)
+         x[i] = WORD2INT(in[i*st->in_stride]);
+      st->in_stride = st->out_stride = 1;
+      /* On return ichunk/ochunk hold the samples consumed/produced */
+      speex_resampler_process_native(st, channel_index, x, &ichunk, y, &ochunk);
+      st->in_stride = istride_save;
+      st->out_stride = ostride_save;
+      for (i=0;i<ochunk;i++)
+         out[i*st->out_stride] = y[i];
+      /* Advance by whole strides: the caller's buffers are indexed with
+         in_stride/out_stride, so stepping by the bare sample count would
+         misplace every chunk after the first when a stride is not 1 */
+      out += ochunk*ostride_save;
+      in += ichunk*istride_save;
+      ilen -= ichunk;
+      olen -= ochunk;
+   }
+   *in_len -= ilen;
+   *out_len -= olen;
+#endif
+   return RESAMPLER_ERR_SUCCESS;
+}
+/* Fixed-point build: the samples are handed to the native resampler
+   unchanged, so no conversion buffers are needed. */
+int speex_resampler_process_int(SpeexResamplerState *st, spx_uint32_t channel_index, const spx_int16_t *in, spx_uint32_t *in_len, spx_int16_t *out, spx_uint32_t *out_len)
+{
+   int status;
+   status = speex_resampler_process_native(st, channel_index, in, in_len, out, out_len);
+   return status;
+}
+#else
+/* Floating-point build: the samples are handed to the native resampler
+   unchanged, so no conversion buffers are needed. */
+int speex_resampler_process_float(SpeexResamplerState *st, spx_uint32_t channel_index, const float *in, spx_uint32_t *in_len, float *out, spx_uint32_t *out_len)
+{
+   int status;
+   status = speex_resampler_process_native(st, channel_index, in, in_len, out, out_len);
+   return status;
+}
+/* Floating-point build: resamples one channel of 16-bit integer samples.
+ *
+ * The input is read with st->in_stride into spx_word16_t temporaries,
+ * resampled on contiguous buffers (strides forced to 1 during the native
+ * call and restored afterwards), and written to out with st->out_stride
+ * after conversion with WORD2INT.
+ *
+ * in_len  : in: samples available per channel; out: samples consumed
+ * out_len : in: room in out per channel;      out: samples written
+ *
+ * Always returns RESAMPLER_ERR_SUCCESS. */
+int speex_resampler_process_int(SpeexResamplerState *st, spx_uint32_t channel_index, const spx_int16_t *in, spx_uint32_t *in_len, spx_int16_t *out, spx_uint32_t *out_len)
+{
+   spx_uint32_t i;
+   int istride_save, ostride_save;
+#ifdef VAR_ARRAYS
+   spx_word16_t x[*in_len];
+   spx_word16_t y[*out_len];
+   istride_save = st->in_stride;
+   ostride_save = st->out_stride;
+   for (i=0;i<*in_len;i++)
+      x[i] = in[i*st->in_stride];
+   st->in_stride = st->out_stride = 1;
+   speex_resampler_process_native(st, channel_index, x, in_len, y, out_len);
+   st->in_stride = istride_save;
+   st->out_stride = ostride_save;
+   for (i=0;i<*out_len;i++)
+      out[i*st->out_stride] = WORD2INT(y[i]);
+#else
+   /* No variable-size arrays: process in chunks of at most
+      FIXED_STACK_ALLOC samples */
+   spx_word16_t x[FIXED_STACK_ALLOC];
+   spx_word16_t y[FIXED_STACK_ALLOC];
+   spx_uint32_t ilen=*in_len, olen=*out_len;
+   istride_save = st->in_stride;
+   ostride_save = st->out_stride;
+   while (ilen && olen)
+   {
+      spx_uint32_t ichunk, ochunk;
+      ichunk = ilen;
+      ochunk = olen;
+      if (ichunk>FIXED_STACK_ALLOC)
+         ichunk=FIXED_STACK_ALLOC;
+      if (ochunk>FIXED_STACK_ALLOC)
+         ochunk=FIXED_STACK_ALLOC;
+      for (i=0;i<ichunk;i++)
+         x[i] = in[i*st->in_stride];
+      st->in_stride = st->out_stride = 1;
+      /* On return ichunk/ochunk hold the samples consumed/produced */
+      speex_resampler_process_native(st, channel_index, x, &ichunk, y, &ochunk);
+      st->in_stride = istride_save;
+      st->out_stride = ostride_save;
+      for (i=0;i<ochunk;i++)
+         out[i*st->out_stride] = WORD2INT(y[i]);
+      /* Advance by whole strides: the caller's buffers are indexed with
+         in_stride/out_stride, so stepping by the bare sample count would
+         misplace every chunk after the first when a stride is not 1 */
+      out += ochunk*ostride_save;
+      in += ichunk*istride_save;
+      ilen -= ichunk;
+      olen -= ochunk;
+   }
+   *in_len -= ilen;
+   *out_len -= olen;
+#endif
+   return RESAMPLER_ERR_SUCCESS;
+}
+#endif
+
+/* Resamples every channel of an interleaved float buffer.
+ *
+ * Both strides are temporarily set to nb_channels so that channel i is
+ * processed in place starting at in+i / out+i; the configured strides are
+ * restored before returning.
+ *
+ * in_len  : in: samples available per channel; out: value reported by the
+ *           last channel processed
+ * out_len : in: room in out per channel;      out: value reported by the
+ *           last channel processed
+ *
+ * Always returns RESAMPLER_ERR_SUCCESS. */
+int speex_resampler_process_interleaved_float(SpeexResamplerState *st, const float *in, spx_uint32_t *in_len, float *out, spx_uint32_t *out_len)
+{
+   spx_uint32_t i;
+   int istride_save, ostride_save;
+   spx_uint32_t bak_in_len = *in_len;
+   spx_uint32_t bak_out_len = *out_len;
+   istride_save = st->in_stride;
+   ostride_save = st->out_stride;
+   st->in_stride = st->out_stride = st->nb_channels;
+   for (i=0;i<st->nb_channels;i++)
+   {
+      /* The per-channel call overwrites both lengths with what it actually
+         consumed/produced; hand every channel the caller's original budget
+         instead of the previous channel's result */
+      *in_len = bak_in_len;
+      *out_len = bak_out_len;
+      speex_resampler_process_float(st, i, in+i, in_len, out+i, out_len);
+   }
+   st->in_stride = istride_save;
+   st->out_stride = ostride_save;
+   return RESAMPLER_ERR_SUCCESS;
+}
+
+/* Resamples every channel of an interleaved 16-bit integer buffer.
+ *
+ * Both strides are temporarily set to nb_channels so that channel i is
+ * processed in place starting at in+i / out+i; the configured strides are
+ * restored before returning.
+ *
+ * in_len  : in: samples available per channel; out: value reported by the
+ *           last channel processed
+ * out_len : in: room in out per channel;      out: value reported by the
+ *           last channel processed
+ *
+ * Always returns RESAMPLER_ERR_SUCCESS. */
+int speex_resampler_process_interleaved_int(SpeexResamplerState *st, const spx_int16_t *in, spx_uint32_t *in_len, spx_int16_t *out, spx_uint32_t *out_len)
+{
+   spx_uint32_t i;
+   int istride_save, ostride_save;
+   spx_uint32_t bak_in_len = *in_len;
+   spx_uint32_t bak_out_len = *out_len;
+   istride_save = st->in_stride;
+   ostride_save = st->out_stride;
+   st->in_stride = st->out_stride = st->nb_channels;
+   for (i=0;i<st->nb_channels;i++)
+   {
+      /* The per-channel call overwrites both lengths with what it actually
+         consumed/produced; hand every channel the caller's original budget
+         instead of the previous channel's result */
+      *in_len = bak_in_len;
+      *out_len = bak_out_len;
+      speex_resampler_process_int(st, i, in+i, in_len, out+i, out_len);
+   }
+   st->in_stride = istride_save;
+   st->out_stride = ostride_save;
+   return RESAMPLER_ERR_SUCCESS;
+}
+
+/* Convenience form of speex_resampler_set_rate_frac(): the two rates are
+   also used as the numerator and denominator of the ratio. Returns whatever
+   speex_resampler_set_rate_frac() returns. */
+int speex_resampler_set_rate(SpeexResamplerState *st, spx_uint32_t in_rate, spx_uint32_t out_rate)
+{
+   int status;
+   status = speex_resampler_set_rate_frac(st, in_rate, out_rate, in_rate, out_rate);
+   return status;
+}
+
+/* Copies the input and output rates stored in the state (as last passed to
+   speex_resampler_set_rate_frac()) into *in_rate and *out_rate. */
+void speex_resampler_get_rate(SpeexResamplerState *st, spx_uint32_t *in_rate, spx_uint32_t *out_rate)
+{
+   *in_rate = st->in_rate;
+   *out_rate = st->out_rate;
+}
+
+/* Sets the sampling rates and the resampling ratio.
+ *
+ * ratio_num, ratio_den : ratio terms; stored in lowest terms in
+ *                        st->num_rate / st->den_rate
+ * in_rate, out_rate    : stored as given in st->in_rate / st->out_rate
+ *
+ * Nothing happens when all four stored values already match the arguments.
+ * Once the state is initialised the filter is rebuilt through
+ * update_filter(). Always returns RESAMPLER_ERR_SUCCESS. */
+int speex_resampler_set_rate_frac(SpeexResamplerState *st, spx_uint32_t ratio_num, spx_uint32_t ratio_den, spx_uint32_t in_rate, spx_uint32_t out_rate)
+{
+   spx_uint32_t gcd, rem;
+   if (st->in_rate == in_rate && st->out_rate == out_rate && st->num_rate == ratio_num && st->den_rate == ratio_den)
+      return RESAMPLER_ERR_SUCCESS;
+
+   st->in_rate = in_rate;
+   st->out_rate = out_rate;
+   st->num_rate = ratio_num;
+   st->den_rate = ratio_den;
+
+   /* Reduce the fraction with Euclid's algorithm. Trial division bounded by
+      sqrt(max(in_rate, out_rate)) misses any common prime factor above that
+      bound (e.g. 22/33 would not become 2/3) and ties the bound to the rates
+      instead of the ratio terms. */
+   gcd = ratio_num;
+   rem = ratio_den;
+   while (rem != 0)
+   {
+      spx_uint32_t next = gcd % rem;
+      gcd = rem;
+      rem = next;
+   }
+   /* gcd is 0 only when both terms are 0; leave them untouched then */
+   if (gcd > 1)
+   {
+      st->num_rate /= gcd;
+      st->den_rate /= gcd;
+   }
+
+   if (st->initialised)
+      update_filter(st);
+   return RESAMPLER_ERR_SUCCESS;
+}
+
+/* Copies the ratio currently held in the state into *ratio_num and
+   *ratio_den. These are the values after the common-factor reduction done
+   by speex_resampler_set_rate_frac(), so they may differ from what the
+   caller originally passed in. */
+void speex_resampler_get_ratio(SpeexResamplerState *st, spx_uint32_t *ratio_num, spx_uint32_t *ratio_den)
+{
+   *ratio_num = st->num_rate;
+   *ratio_den = st->den_rate;
+}
+
+/* Changes the quality setting.
+ *
+ * Returns RESAMPLER_ERR_INVALID_ARG when quality is outside 0..10 and
+ * RESAMPLER_ERR_SUCCESS otherwise. The filter is rebuilt only when the
+ * value really changes and the state is already initialised. */
+int speex_resampler_set_quality(SpeexResamplerState *st, int quality)
+{
+   if (quality < 0 || quality > 10)
+      return RESAMPLER_ERR_INVALID_ARG;
+
+   if (st->quality != quality)
+   {
+      st->quality = quality;
+      if (st->initialised)
+         update_filter(st);
+   }
+   return RESAMPLER_ERR_SUCCESS;
+}
+
+/* Reports the quality setting currently stored in the state. */
+void speex_resampler_get_quality(SpeexResamplerState *st, int *quality)
+{
+   int current;
+   current = st->quality;
+   *quality = current;
+}
+
+/* Stores the input stride. The processing functions in this file read input
+   sample i of a channel at in[i*st->in_stride]. */
+void speex_resampler_set_input_stride(SpeexResamplerState *st, spx_uint32_t stride)
+{
+   st->in_stride = stride;
+}
+
+/* Reports the input stride currently stored in the state. */
+void speex_resampler_get_input_stride(SpeexResamplerState *st, spx_uint32_t *stride)
+{
+   spx_uint32_t current;
+   current = st->in_stride;
+   *stride = current;
+}
+
+/* Stores the output stride. The processing functions in this file write
+   output sample i of a channel at out[i*st->out_stride]. */
+void speex_resampler_set_output_stride(SpeexResamplerState *st, spx_uint32_t stride)
+{
+   st->out_stride = stride;
+}
+
+/* Reports the output stride currently stored in the state. */
+void speex_resampler_get_output_stride(SpeexResamplerState *st, spx_uint32_t *stride)
+{
+   spx_uint32_t current;
+   current = st->out_stride;
+   *stride = current;
+}
+
+/* Sets last_sample of every channel to half the filter length.
+   Always returns RESAMPLER_ERR_SUCCESS. */
+int speex_resampler_skip_zeros(SpeexResamplerState *st)
+{
+   spx_uint32_t ch = 0;
+   while (ch < st->nb_channels)
+   {
+      st->last_sample[ch] = st->filt_len/2;
+      ch++;
+   }
+   return RESAMPLER_ERR_SUCCESS;
+}
+
+/* Zeroes the first nb_channels*(filt_len-1) entries of the sample history.
+   Always returns RESAMPLER_ERR_SUCCESS. */
+int speex_resampler_reset_mem(SpeexResamplerState *st)
+{
+   spx_uint32_t idx = 0;
+   while (idx < st->nb_channels*(st->filt_len-1))
+   {
+      st->mem[idx] = 0;
+      idx++;
+   }
+   return RESAMPLER_ERR_SUCCESS;
+}
+
+/* Maps a RESAMPLER_ERR_* code to a constant, human-readable message.
+   Codes that are not recognised yield a generic "unknown error" text. */
+const char *speex_resampler_strerror(int err)
+{
+   const char *msg;
+   switch (err)
+   {
+      case RESAMPLER_ERR_SUCCESS:
+         msg = "Success.";
+         break;
+      case RESAMPLER_ERR_ALLOC_FAILED:
+         msg = "Memory allocation failed.";
+         break;
+      case RESAMPLER_ERR_BAD_STATE:
+         msg = "Bad resampler state.";
+         break;
+      case RESAMPLER_ERR_INVALID_ARG:
+         msg = "Invalid argument.";
+         break;
+      case RESAMPLER_ERR_PTR_OVERLAP:
+         msg = "Input and output buffers overlap.";
+         break;
+      default:
+         msg = "Unknown error. Bad error code or strange version mismatch.";
+         break;
+   }
+   return msg;
+}
diff --git a/libspeex/sb_celp.c b/libspeex/sb_celp.c
index 89ba473..50b9824 100644
--- a/libspeex/sb_celp.c
+++ b/libspeex/sb_celp.c
@@ -45,6 +45,7 @@
 #include "vq.h"
 #include "ltp.h"
 #include "misc.h"
+#include "math_approx.h"
 
 /* Default size for the encoder and decoder stack (can be changed at compile time).
    This does not apply when using variable-size arrays or alloca. */
@@ -109,12 +110,26 @@ int sb_decoder_ctl(void *state, int request, void *ptr)
 
 #ifdef FIXED_POINT
 static const spx_word16_t gc_quant_bound[16] = {125, 164, 215, 282, 370, 484, 635, 832, 1090, 1428, 1871, 2452, 3213, 4210, 5516, 7228};
+static const spx_word16_t fold_quant_bound[32] = {
+   39, 44, 50, 57, 64, 73, 83, 94,
+   106, 120, 136, 154, 175, 198, 225, 255,
+   288, 327, 370, 420, 476, 539, 611, 692,
+   784, 889, 1007, 1141, 1293, 1465, 1660, 1881};
 #define LSP_MARGIN 410
 #define LSP_DELTA1 6553
 #define LSP_DELTA2 1638
 
 #else
 
+static const spx_word16_t gc_quant_bound[16] = {
+      0.97979, 1.28384, 1.68223, 2.20426, 2.88829, 3.78458, 4.95900, 6.49787, 
+      8.51428, 11.15642, 14.61846, 19.15484, 25.09895, 32.88761, 43.09325, 56.46588};
+static const spx_word16_t fold_quant_bound[32] = {
+   0.30498, 0.34559, 0.39161, 0.44375, 0.50283, 0.56979, 0.64565, 0.73162,
+   0.82903, 0.93942, 1.06450, 1.20624, 1.36685, 1.54884, 1.75506, 1.98875,
+   2.25355, 2.55360, 2.89361, 3.27889, 3.71547, 4.21018, 4.77076, 5.40598,
+   6.12577, 6.94141, 7.86565, 8.91295, 10.09969, 11.44445, 12.96826, 14.69497};
+
 #define LSP_MARGIN .05
 #define LSP_DELTA1 .2
 #define LSP_DELTA2 .05
@@ -126,122 +141,69 @@ static const spx_word16_t gc_quant_bound[16] = {125, 164, 215, 282, 370, 484, 63
 #ifdef FIXED_POINT
 static const spx_word16_t h0[64] = {2, -7, -7, 18, 15, -39, -25, 75, 35, -130, -41, 212, 38, -327, -17, 483, -32, -689, 124, 956, -283, -1307, 543, 1780, -973, -2467, 1733, 3633, -3339, -6409, 9059, 30153, 30153, 9059, -6409, -3339, 3633, 1733, -2467, -973, 1780, 543, -1307, -283, 956, 124, -689, -32, 483, -17, -327, 38, 212, -41, -130, 35, 75, -25, -39, 15, 18, -7, -7, 2};
 
-static const spx_word16_t h1[64] = {2, 7, -7, -18, 15, 39, -25, -75, 35, 130, -41, -212, 38, 327, -17, -483, -32, 689, 124, -956, -283, 1307, 543, -1780, -973, 2467, 1733, -3633, -3339, 6409, 9059, -30153, 30153, -9059, -6409, 3339, 3633, -1733, -2467, 973, 1780, -543, -1307, 283, 956, -124, -689, 32, 483, 17, -327, -38, 212, 41, -130, -35, 75, 25, -39, -15, 18, 7, -7, -2};
-
-
 #else
 static const float h0[64] = {
-   3.596189e-05, -0.0001123515,
-   -0.0001104587, 0.0002790277,
-   0.0002298438, -0.0005953563,
-   -0.0003823631, 0.00113826,
-   0.0005308539, -0.001986177,
-   -0.0006243724, 0.003235877,
-   0.0005743159, -0.004989147,
-   -0.0002584767, 0.007367171,
-   -0.0004857935, -0.01050689,
-   0.001894714, 0.01459396,
-   -0.004313674, -0.01994365,
-   0.00828756, 0.02716055,
-   -0.01485397, -0.03764973,
-   0.026447, 0.05543245,
-   -0.05095487, -0.09779096,
-   0.1382363, 0.4600981,
-   0.4600981, 0.1382363,
-   -0.09779096, -0.05095487,
-   0.05543245, 0.026447,
-   -0.03764973, -0.01485397,
-   0.02716055, 0.00828756,
-   -0.01994365, -0.004313674,
-   0.01459396, 0.001894714,
-   -0.01050689, -0.0004857935,
-   0.007367171, -0.0002584767,
-   -0.004989147, 0.0005743159,
-   0.003235877, -0.0006243724,
-   -0.001986177, 0.0005308539,
-   0.00113826, -0.0003823631,
-   -0.0005953563, 0.0002298438,
-   0.0002790277, -0.0001104587,
-   -0.0001123515, 3.596189e-05
+   3.596189e-05f, -0.0001123515f,
+   -0.0001104587f, 0.0002790277f,
+   0.0002298438f, -0.0005953563f,
+   -0.0003823631f, 0.00113826f,
+   0.0005308539f, -0.001986177f,
+   -0.0006243724f, 0.003235877f,
+   0.0005743159f, -0.004989147f,
+   -0.0002584767f, 0.007367171f,
+   -0.0004857935f, -0.01050689f,
+   0.001894714f, 0.01459396f,
+   -0.004313674f, -0.01994365f,
+   0.00828756f, 0.02716055f,
+   -0.01485397f, -0.03764973f,
+   0.026447f, 0.05543245f,
+   -0.05095487f, -0.09779096f,
+   0.1382363f, 0.4600981f,
+   0.4600981f, 0.1382363f,
+   -0.09779096f, -0.05095487f,
+   0.05543245f, 0.026447f,
+   -0.03764973f, -0.01485397f,
+   0.02716055f, 0.00828756f,
+   -0.01994365f, -0.004313674f,
+   0.01459396f, 0.001894714f,
+   -0.01050689f, -0.0004857935f,
+   0.007367171f, -0.0002584767f,
+   -0.004989147f, 0.0005743159f,
+   0.003235877f, -0.0006243724f,
+   -0.001986177f, 0.0005308539f,
+   0.00113826f, -0.0003823631f,
+   -0.0005953563f, 0.0002298438f,
+   0.0002790277f, -0.0001104587f,
+   -0.0001123515f, 3.596189e-05f
 };
 
-static const float h1[64] = {
-   3.596189e-05, 0.0001123515,
-   -0.0001104587, -0.0002790277,
-   0.0002298438, 0.0005953563,
-   -0.0003823631, -0.00113826,
-   0.0005308539, 0.001986177,
-   -0.0006243724, -0.003235877,
-   0.0005743159, 0.004989147,
-   -0.0002584767, -0.007367171,
-   -0.0004857935, 0.01050689,
-   0.001894714, -0.01459396,
-   -0.004313674, 0.01994365,
-   0.00828756, -0.02716055,
-   -0.01485397, 0.03764973,
-   0.026447, -0.05543245,
-   -0.05095487, 0.09779096,
-   0.1382363, -0.4600981,
-   0.4600981, -0.1382363,
-   -0.09779096, 0.05095487,
-   0.05543245, -0.026447,
-   -0.03764973, 0.01485397,
-   0.02716055, -0.00828756,
-   -0.01994365, 0.004313674,
-   0.01459396, -0.001894714,
-   -0.01050689, 0.0004857935,
-   0.007367171, 0.0002584767,
-   -0.004989147, -0.0005743159,
-   0.003235877, 0.0006243724,
-   -0.001986177, -0.0005308539,
-   0.00113826, 0.0003823631,
-   -0.0005953563, -0.0002298438,
-   0.0002790277, 0.0001104587,
-   -0.0001123515, -3.596189e-05
-};
 #endif
 
 extern const spx_word16_t lpc_window[];
 
-static void mix_and_saturate(spx_word32_t *x0, spx_word32_t *x1, spx_word16_t *out, int len)
-{
-   int i;
-   for (i=0;i<len;i++)
-   {
-      spx_word32_t tmp;
-#ifdef FIXED_POINT
-      tmp=PSHR(x0[i]-x1[i],SIG_SHIFT-1);
-#else
-      tmp=2*(x0[i]-x1[i]);
-#endif
-      if (tmp>32767)
-         out[i] = 32767;
-      else if (tmp<-32767)
-         out[i] = -32767;
-      else
-         out[i] = tmp;
-   }
-}
 
 void *sb_encoder_init(const SpeexMode *m)
 {
    int i;
+   spx_int32_t tmp;
    SBEncState *st;
    const SpeexSBMode *mode;
 
    st = (SBEncState*)speex_alloc(sizeof(SBEncState));
    if (!st)
       return NULL;
-#if defined(VAR_ARRAYS) || defined (USE_ALLOCA)
-   st->stack = NULL;
-#else
-   st->stack = (char*)speex_alloc_scratch(SB_ENC_STACK);
-#endif
    st->mode = m;
    mode = (const SpeexSBMode*)m->mode;
 
 
    st->st_low = speex_encoder_init(mode->nb_mode);
+#if defined(VAR_ARRAYS) || defined (USE_ALLOCA)
+   st->stack = NULL;
+#else
+   /*st->stack = (char*)speex_alloc_scratch(SB_ENC_STACK);*/
+   speex_encoder_ctl(st->st_low, SPEEX_GET_STACK, &st->stack);
+#endif
+
    st->full_frame_size = 2*mode->frameSize;
    st->frame_size = mode->frameSize;
    st->subframeSize = mode->subframeSize;
@@ -254,10 +216,10 @@ void *sb_encoder_init(const SpeexMode *m)
    st->submodes=mode->submodes;
    st->submodeSelect = st->submodeID=mode->defaultSubmode;
    
-   i=9;
-   speex_encoder_ctl(st->st_low, SPEEX_SET_QUALITY, &i);
-   i=1;
-   speex_encoder_ctl(st->st_low, SPEEX_SET_WIDEBAND, &i);
+   tmp=9;
+   speex_encoder_ctl(st->st_low, SPEEX_SET_QUALITY, &tmp);
+   tmp=1;
+   speex_encoder_ctl(st->st_low, SPEEX_SET_WIDEBAND, &tmp);
 
    st->lag_factor = mode->lag_factor;
    st->lpc_floor = mode->lpc_floor;
@@ -265,49 +227,33 @@ void *sb_encoder_init(const SpeexMode *m)
    st->gamma2=mode->gamma2;
    st->first=1;
 
-   st->x0d=(spx_sig_t*)speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   st->x1d=(spx_sig_t*)speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   st->high=(spx_sig_t*)speex_alloc((st->full_frame_size)*sizeof(spx_sig_t));
-   st->y0=(spx_sig_t*)speex_alloc((st->full_frame_size)*sizeof(spx_sig_t));
-   st->y1=(spx_sig_t*)speex_alloc((st->full_frame_size)*sizeof(spx_sig_t));
+   st->high=(spx_word16_t*)speex_alloc((st->windowSize-st->frame_size)*sizeof(spx_word16_t));
 
    st->h0_mem=(spx_word16_t*)speex_alloc((QMF_ORDER)*sizeof(spx_word16_t));
    st->h1_mem=(spx_word16_t*)speex_alloc((QMF_ORDER)*sizeof(spx_word16_t));
-   st->g0_mem=(spx_word32_t*)speex_alloc((QMF_ORDER)*sizeof(spx_word32_t));
-   st->g1_mem=(spx_word32_t*)speex_alloc((QMF_ORDER)*sizeof(spx_word32_t));
-
-   st->excBuf=(spx_sig_t*)speex_alloc((st->bufSize)*sizeof(spx_sig_t));
-   st->exc = st->excBuf + st->bufSize - st->windowSize;
 
-   st->res=(spx_sig_t*)speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   st->sw=(spx_sig_t*)speex_alloc((st->frame_size)*sizeof(spx_sig_t));
    st->window= lpc_window;
 
    st->lagWindow = (spx_word16_t*)speex_alloc((st->lpcSize+1)*sizeof(spx_word16_t));
    for (i=0;i<st->lpcSize+1;i++)
       st->lagWindow[i]=16384*exp(-.5*sqr(2*M_PI*st->lag_factor*i));
 
-   st->autocorr = (spx_word16_t*)speex_alloc((st->lpcSize+1)*sizeof(spx_word16_t));
-   st->lpc = (spx_coef_t*)speex_alloc(st->lpcSize*sizeof(spx_coef_t));
-   st->bw_lpc1 = (spx_coef_t*)speex_alloc(st->lpcSize*sizeof(spx_coef_t));
-   st->bw_lpc2 = (spx_coef_t*)speex_alloc(st->lpcSize*sizeof(spx_coef_t));
-   st->lsp = (spx_lsp_t*)speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
-   st->qlsp = (spx_lsp_t*)speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
    st->old_lsp = (spx_lsp_t*)speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
    st->old_qlsp = (spx_lsp_t*)speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
-   st->interp_lsp = (spx_lsp_t*)speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
-   st->interp_qlsp = (spx_lsp_t*)speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
-   st->interp_lpc = (spx_coef_t*)speex_alloc(st->lpcSize*sizeof(spx_coef_t));
    st->interp_qlpc = (spx_coef_t*)speex_alloc(st->lpcSize*sizeof(spx_coef_t));
    st->pi_gain = (spx_word32_t*)speex_alloc((st->nbSubframes)*sizeof(spx_word32_t));
-   st->low_innov = (spx_word32_t*)speex_alloc((st->frame_size)*sizeof(spx_word32_t));
-   speex_encoder_ctl(st->st_low, SPEEX_SET_INNOVATION_SAVE, st->low_innov);
-   st->innov_save = NULL;
+   st->exc_rms = (spx_word16_t*)speex_alloc((st->nbSubframes)*sizeof(spx_word16_t));
+   st->innov_rms_save = NULL;
    
    st->mem_sp = (spx_mem_t*)speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
    st->mem_sp2 = (spx_mem_t*)speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
    st->mem_sw = (spx_mem_t*)speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
 
+   for (i=0;i<st->lpcSize;i++)
+   {
+      st->old_lsp[i]=LSP_SCALING*(M_PI*((float)(i+1)))/(st->lpcSize+1);
+   }
+
    st->vbr_quality = 8;
    st->vbr_enabled = 0;
    st->vbr_max = 0;
@@ -331,38 +277,21 @@ void sb_encoder_destroy(void *state)
 
    speex_encoder_destroy(st->st_low);
 #if !(defined(VAR_ARRAYS) || defined (USE_ALLOCA))
-   speex_free_scratch(st->stack);
+   /*speex_free_scratch(st->stack);*/
 #endif
 
-   speex_free(st->x0d);
-   speex_free(st->x1d);
    speex_free(st->high);
-   speex_free(st->y0);
-   speex_free(st->y1);
 
    speex_free(st->h0_mem);
    speex_free(st->h1_mem);
-   speex_free(st->g0_mem);
-   speex_free(st->g1_mem);
 
-   speex_free(st->excBuf);
-   speex_free(st->res);
-   speex_free(st->sw);
    speex_free(st->lagWindow);
 
-   speex_free(st->autocorr);
-   speex_free(st->lpc);
-   speex_free(st->bw_lpc1);
-   speex_free(st->bw_lpc2);
-   speex_free(st->lsp);
-   speex_free(st->qlsp);
    speex_free(st->old_lsp);
    speex_free(st->old_qlsp);
-   speex_free(st->interp_lsp);
-   speex_free(st->interp_qlsp);
-   speex_free(st->interp_lpc);
    speex_free(st->interp_qlpc);
    speex_free(st->pi_gain);
+   speex_free(st->exc_rms);
 
    speex_free(st->mem_sp);
    speex_free(st->mem_sp2);
@@ -383,44 +312,56 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
    VARDECL(spx_word16_t *target);
    VARDECL(spx_word16_t *syn_resp);
    VARDECL(spx_word32_t *low_pi_gain);
-   VARDECL(spx_word16_t *low_exc);
+   spx_word16_t *low;
+   spx_word16_t *high;
+   VARDECL(spx_word16_t *low_exc_rms);
+   VARDECL(spx_word16_t *low_innov_rms);
    const SpeexSBMode *mode;
-   int dtx;
+   spx_int32_t dtx;
    spx_word16_t *in = (spx_word16_t*)vin;
-
+   spx_word16_t e_low=0, e_high=0;
+   VARDECL(spx_coef_t *lpc);
+   VARDECL(spx_coef_t *interp_lpc);
+   VARDECL(spx_coef_t *bw_lpc1);
+   VARDECL(spx_coef_t *bw_lpc2);
+   VARDECL(spx_lsp_t *lsp);
+   VARDECL(spx_lsp_t *qlsp);
+   VARDECL(spx_lsp_t *interp_lsp);
+   VARDECL(spx_lsp_t *interp_qlsp);
+      
    st = (SBEncState*)state;
    stack=st->stack;
    mode = (const SpeexSBMode*)(st->mode->mode);
-
+   low = in;
+   high = in+st->frame_size;
+   
+   /* High-band buffering / sync with low band */
+   /* Compute the two sub-bands by filtering with QMF h0*/
+   qmf_decomp(in, h0, low, high, st->full_frame_size, QMF_ORDER, st->h0_mem, stack);
+   
+   if (st->vbr_enabled || st->vad_enabled)
    {
-      VARDECL(spx_word16_t *low);
-      ALLOC(low, st->frame_size, spx_word16_t);
-
-      /* Compute the two sub-bands by filtering with h0 and h1*/
-      qmf_decomp(in, h0, st->x0d, st->x1d, st->full_frame_size, QMF_ORDER, st->h0_mem, stack);
-      
-      for (i=0;i<st->frame_size;i++)
-         low[i] = SATURATE(PSHR(st->x0d[i],SIG_SHIFT),32767);
-      
-      /* Encode the narrowband part*/
-      speex_encode_native(st->st_low, low, bits);
-
-      for (i=0;i<st->frame_size;i++)
-         st->x0d[i] = SHL(low[i],SIG_SHIFT);
+      /* Need to compute things here before the signal is trashed by the encoder */
+      /*FIXME: Are the two signals (low, high) in sync? */
+      e_low = compute_rms16(low, st->frame_size);
+      e_high = compute_rms16(high, st->frame_size);
    }
-   /* High-band buffering / sync with low band */
-   for (i=0;i<st->windowSize-st->frame_size;i++)
-      st->high[i] = st->high[st->frame_size+i];
-   for (i=0;i<st->frame_size;i++)
-      st->high[st->windowSize-st->frame_size+i]=SATURATE(st->x1d[i],536854528);
-
-   speex_move(st->excBuf, st->excBuf+st->frame_size, (st->bufSize-st->frame_size)*sizeof(spx_sig_t));
+   ALLOC(low_innov_rms, st->nbSubframes, spx_word16_t);
+   speex_encoder_ctl(st->st_low, SPEEX_SET_INNOVATION_SAVE, low_innov_rms);
+   /* Encode the narrowband part*/
+   speex_encode_native(st->st_low, low, bits);
 
+   high = high - (st->windowSize-st->frame_size);
+   for (i=0;i<st->windowSize-st->frame_size;i++)
+      high[i] = st->high[i];
+   for (i=0;i<st->windowSize-st->frame_size;i++)
+      st->high[i] = high[i+st->frame_size];
+   
 
    ALLOC(low_pi_gain, st->nbSubframes, spx_word32_t);
-   ALLOC(low_exc, st->frame_size, spx_word16_t);
+   ALLOC(low_exc_rms, st->nbSubframes, spx_word16_t);
    speex_encoder_ctl(st->st_low, SPEEX_GET_PI_GAIN, low_pi_gain);
-   speex_encoder_ctl(st->st_low, SPEEX_GET_EXC, low_exc);
+   speex_encoder_ctl(st->st_low, SPEEX_GET_EXC, low_exc_rms);
    
    speex_encoder_ctl(st->st_low, SPEEX_GET_LOW_MODE, &dtx);
 
@@ -429,35 +370,53 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
    else
       dtx=0;
 
+   ALLOC(lpc, st->lpcSize, spx_coef_t);
+   ALLOC(interp_lpc, st->lpcSize, spx_coef_t);
+   ALLOC(bw_lpc1, st->lpcSize, spx_coef_t);
+   ALLOC(bw_lpc2, st->lpcSize, spx_coef_t);
+   
+   ALLOC(lsp, st->lpcSize, spx_lsp_t);
+   ALLOC(qlsp, st->lpcSize, spx_lsp_t);
+   ALLOC(interp_lsp, st->lpcSize, spx_lsp_t);
+   ALLOC(interp_qlsp, st->lpcSize, spx_lsp_t);
+   
    {
+      VARDECL(spx_word16_t *autocorr);
       VARDECL(spx_word16_t *w_sig);
+      ALLOC(autocorr, st->lpcSize+1, spx_word16_t);
       ALLOC(w_sig, st->windowSize, spx_word16_t);
       /* Window for analysis */
-      for (i=0;i<st->windowSize;i++)
-         w_sig[i] = SHR(MULT16_16(SHR((spx_word32_t)(st->high[i]),SIG_SHIFT),st->window[i]),SIG_SHIFT);
-
+      /* FIXME: This is a kludge */
+      if (st->subframeSize==80)
+      {
+         for (i=0;i<st->windowSize;i++)
+            w_sig[i] = EXTRACT16(SHR32(MULT16_16(high[i],st->window[i>>1]),SIG_SHIFT));
+      } else {
+         for (i=0;i<st->windowSize;i++)
+            w_sig[i] = EXTRACT16(SHR32(MULT16_16(high[i],st->window[i]),SIG_SHIFT));
+      }
       /* Compute auto-correlation */
-      _spx_autocorr(w_sig, st->autocorr, st->lpcSize+1, st->windowSize);
-   }
-   st->autocorr[0] = ADD16(st->autocorr[0],MULT16_16_Q15(st->autocorr[0],st->lpc_floor)); /* Noise floor in auto-correlation domain */
+      _spx_autocorr(w_sig, autocorr, st->lpcSize+1, st->windowSize);
+      autocorr[0] = ADD16(autocorr[0],MULT16_16_Q15(autocorr[0],st->lpc_floor)); /* Noise floor in auto-correlation domain */
 
-   /* Lag windowing: equivalent to filtering in the power-spectrum domain */
-   for (i=0;i<st->lpcSize+1;i++)
-      st->autocorr[i] = MULT16_16_Q14(st->autocorr[i],st->lagWindow[i]);
+      /* Lag windowing: equivalent to filtering in the power-spectrum domain */
+      for (i=0;i<st->lpcSize+1;i++)
+         autocorr[i] = MULT16_16_Q14(autocorr[i],st->lagWindow[i]);
 
-   /* Levinson-Durbin */
-   _spx_lpc(st->lpc, st->autocorr, st->lpcSize);
+      /* Levinson-Durbin */
+      _spx_lpc(lpc, autocorr, st->lpcSize);
+   }
 
    /* LPC to LSPs (x-domain) transform */
-   roots=lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 10, LSP_DELTA1, stack);
+   roots=lpc_to_lsp (lpc, st->lpcSize, lsp, 10, LSP_DELTA1, stack);
    if (roots!=st->lpcSize)
    {
-      roots = lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 10, LSP_DELTA2, stack);
+      roots = lpc_to_lsp (lpc, st->lpcSize, lsp, 10, LSP_DELTA2, stack);
       if (roots!=st->lpcSize) {
          /*If we can't find all LSP's, do some damage control and use a flat filter*/
          for (i=0;i<st->lpcSize;i++)
          {
-            st->lsp[i]=LSP_SCALING*M_PI*((float)(i+1))/(st->lpcSize+1);
+            lsp[i]=st->old_lsp[i];
          }
       }
    }
@@ -465,7 +424,6 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
    /* VBR code */
    if ((st->vbr_enabled || st->vad_enabled) && !dtx)
    {
-      float e_low=0, e_high=0;
       float ratio;
       if (st->abr_enabled)
       {
@@ -487,10 +445,7 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
       }
 
 
-      /*FIXME: Are the two signals (low, high) in sync? */
-      e_low = compute_rms(st->x0d, st->frame_size);
-      e_high = compute_rms(st->high, st->frame_size);
-      ratio = 2*log((1+e_high)/(1+e_low));
+      ratio = 2*log((1.f+e_high)/(1.f+e_low));
       
       speex_encoder_ctl(st->st_low, SPEEX_GET_RELATIVE_QUALITY, &st->relative_quality);
       if (ratio<-4)
@@ -500,7 +455,7 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
       /*if (ratio>-2)*/
       if (st->vbr_enabled) 
       {
-         int modeid;
+         spx_int32_t modeid;
          modeid = mode->nb_modes-1;
          st->relative_quality+=1.0*(ratio+2);
 	 if (st->relative_quality<-1)
@@ -522,7 +477,7 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
          speex_encoder_ctl(state, SPEEX_SET_HIGH_MODE, &modeid);
          if (st->abr_enabled)
          {
-            int bitrate;
+            spx_int32_t bitrate;
             speex_encoder_ctl(state, SPEEX_GET_BITRATE, &bitrate);
             st->abr_drift+=(bitrate-st->abr_enabled);
             st->abr_drift2 = .95*st->abr_drift2 + .05*(bitrate-st->abr_enabled);
@@ -556,23 +511,14 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
    if (dtx || st->submodes[st->submodeID] == NULL)
    {
       for (i=0;i<st->frame_size;i++)
-         st->exc[i]=st->sw[i]=VERY_SMALL;
+         high[i]=VERY_SMALL;
 
       for (i=0;i<st->lpcSize;i++)
          st->mem_sw[i]=0;
       st->first=1;
 
       /* Final signal synthesis from excitation */
-      iir_mem2(st->exc, st->interp_qlpc, st->high, st->frame_size, st->lpcSize, st->mem_sp);
-
-#ifdef RESYNTH
-      /* Reconstruct the original */
-      fir_mem_up(st->x0d, h0, st->y0, st->full_frame_size, QMF_ORDER, st->g0_mem, stack);
-      fir_mem_up(st->high, h1, st->y1, st->full_frame_size, QMF_ORDER, st->g1_mem, stack);
-
-      for (i=0;i<st->full_frame_size;i++)
-         in[i]=SHR(st->y0[i]-st->y1[i], SIG_SHIFT-1);
-#endif
+      iir_mem16(high, st->interp_qlpc, high, st->frame_size, st->lpcSize, st->mem_sp, stack);
 
       if (dtx)
          return 0;
@@ -582,14 +528,14 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
 
 
    /* LSP quantization */
-   SUBMODE(lsp_quant)(st->lsp, st->qlsp, st->lpcSize, bits);   
+   SUBMODE(lsp_quant)(lsp, qlsp, st->lpcSize, bits);   
 
    if (st->first)
    {
       for (i=0;i<st->lpcSize;i++)
-         st->old_lsp[i] = st->lsp[i];
+         st->old_lsp[i] = lsp[i];
       for (i=0;i<st->lpcSize;i++)
-         st->old_qlsp[i] = st->qlsp[i];
+         st->old_qlsp[i] = qlsp[i];
    }
    
    ALLOC(mem, st->lpcSize, spx_mem_t);
@@ -599,37 +545,33 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
 
    for (sub=0;sub<st->nbSubframes;sub++)
    {
-      spx_sig_t *exc, *sp, *res, *sw, *innov_save=NULL;
-      spx_word16_t filter_ratio;
+      VARDECL(spx_word16_t *exc);
+      VARDECL(spx_word16_t *res);
+      VARDECL(spx_word16_t *sw);
+      spx_word16_t *sp;
+      spx_word16_t filter_ratio;     /*Q7*/
       int offset;
-      spx_word32_t rl, rh;
+      spx_word32_t rl, rh;           /*Q13*/
       spx_word16_t eh=0;
 
       offset = st->subframeSize*sub;
-      sp=st->high+offset;
-      exc=st->exc+offset;
-      res=st->res+offset;
-      sw=st->sw+offset;
-      /* Pointer for saving innovation */
-      if (st->innov_save)
-      {
-         innov_save = st->innov_save+2*offset;
-         for (i=0;i<2*st->subframeSize;i++)
-            innov_save[i]=0;
-      }
+      sp=high+offset;
+      ALLOC(exc, st->subframeSize, spx_word16_t);
+      ALLOC(res, st->subframeSize, spx_word16_t);
+      ALLOC(sw, st->subframeSize, spx_word16_t);
       
       /* LSP interpolation (quantized and unquantized) */
-      lsp_interpolate(st->old_lsp, st->lsp, st->interp_lsp, st->lpcSize, sub, st->nbSubframes);
-      lsp_interpolate(st->old_qlsp, st->qlsp, st->interp_qlsp, st->lpcSize, sub, st->nbSubframes);
+      lsp_interpolate(st->old_lsp, lsp, interp_lsp, st->lpcSize, sub, st->nbSubframes);
+      lsp_interpolate(st->old_qlsp, qlsp, interp_qlsp, st->lpcSize, sub, st->nbSubframes);
 
-      lsp_enforce_margin(st->interp_lsp, st->lpcSize, LSP_MARGIN);
-      lsp_enforce_margin(st->interp_qlsp, st->lpcSize, LSP_MARGIN);
+      lsp_enforce_margin(interp_lsp, st->lpcSize, LSP_MARGIN);
+      lsp_enforce_margin(interp_qlsp, st->lpcSize, LSP_MARGIN);
 
-      lsp_to_lpc(st->interp_lsp, st->interp_lpc, st->lpcSize,stack);
-      lsp_to_lpc(st->interp_qlsp, st->interp_qlpc, st->lpcSize, stack);
+      lsp_to_lpc(interp_lsp, interp_lpc, st->lpcSize,stack);
+      lsp_to_lpc(interp_qlsp, st->interp_qlpc, st->lpcSize, stack);
 
-      bw_lpc(st->gamma1, st->interp_lpc, st->bw_lpc1, st->lpcSize);
-      bw_lpc(st->gamma2, st->interp_lpc, st->bw_lpc2, st->lpcSize);
+      bw_lpc(st->gamma1, interp_lpc, bw_lpc1, st->lpcSize);
+      bw_lpc(st->gamma2, interp_lpc, bw_lpc2, st->lpcSize);
 
       /* Compute mid-band (4000 Hz for wideband) response of low-band and high-band
          filters */
@@ -643,24 +585,24 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
       
       rl = low_pi_gain[sub];
 #ifdef FIXED_POINT
-      filter_ratio=PDIV32_16(SHL(rl+82,2),SHR(82+rh,5));
+      filter_ratio=EXTRACT16(SATURATE(PDIV32(SHL32(ADD32(rl,82),7),ADD32(82,rh)),32767));
 #else
       filter_ratio=(rl+.01)/(rh+.01);
 #endif
       
       /* Compute "real excitation" */
-      fir_mem2(sp, st->interp_qlpc, exc, st->subframeSize, st->lpcSize, st->mem_sp2);
+      fir_mem16(sp, st->interp_qlpc, exc, st->subframeSize, st->lpcSize, st->mem_sp2, stack);
       /* Compute energy of low-band and high-band excitation */
 
-      eh = compute_rms(exc, st->subframeSize);
+      eh = compute_rms16(exc, st->subframeSize);
 
       if (!SUBMODE(innovation_quant)) {/* 1 for spectral folding excitation, 0 for stochastic */
-         float g;
-         spx_word16_t el;
-         el = compute_rms(st->low_innov+offset, st->subframeSize);
+         spx_word32_t g;   /*Q7*/
+         spx_word16_t el;  /*Q0*/
+         el = low_innov_rms[sub];
 
          /* Gain to use if we want to use the low-band excitation for high-band */
-         g=eh/(1.+el);
+         g=PDIV32(MULT16_16(filter_ratio,eh),EXTEND32(ADD16(1,el)));
          
 #if 0
          {
@@ -678,15 +620,10 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
          }
 #endif
 
-#ifdef FIXED_POINT
-         g *= filter_ratio/128.;
-#else
-         g *= filter_ratio;
-#endif
          /*print_vec(&g, 1, "gain factor");*/
          /* Gain quantization */
          {
-            int quant = (int) floor(.5 + 10 + 8.0 * log((g+.0001)));
+            int quant = scal_quant(g, fold_quant_bound, 32);
             /*speex_warning_int("tata", quant);*/
             if (quant<0)
                quant=0;
@@ -694,68 +631,57 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
                quant=31;
             speex_bits_pack(bits, quant, 5);
          }
-
+         if (st->innov_rms_save)
+         {
+            st->innov_rms_save[sub] = eh;
+         }
+         st->exc_rms[sub] = eh;
       } else {
-         spx_word16_t gc;
-         spx_word32_t scale;
-         spx_word16_t el;
-         el = compute_rms16(low_exc+offset, st->subframeSize);
+         spx_word16_t gc;       /*Q7*/
+         spx_word32_t scale;    /*Q14*/
+         spx_word16_t el;       /*Q0*/
+         el = low_exc_rms[sub]; /*Q0*/
 
          gc = PDIV32_16(MULT16_16(filter_ratio,1+eh),1+el);
 
          /* This is a kludge that cleans up a historical bug */
          if (st->subframeSize==80)
-            gc *= 0.70711;
+            gc = MULT16_16_P15(QCONST16(0.70711f,15),gc);
          /*printf ("%f %f %f %f\n", el, eh, filter_ratio, gc);*/
-#ifdef FIXED_POINT
          {
             int qgc = scal_quant(gc, gc_quant_bound, 16);
             speex_bits_pack(bits, qgc, 4);
-            gc = MULT16_32_Q15(28626,gc_quant_bound[qgc]);
+            gc = MULT16_16_Q15(QCONST16(0.87360,15),gc_quant_bound[qgc]);
          }
-#else
-         {
-            int qgc = (int)floor(.5+3.7*(log(gc)+0.15556));
-            if (qgc<0)
-               qgc=0;
-            if (qgc>15)
-               qgc=15;
-            speex_bits_pack(bits, qgc, 4);
-            gc = exp((1/3.7)*qgc-0.15556);
-         }         
-#endif
          if (st->subframeSize==80)
-            gc *= 1.4142;
+            gc = MULT16_16_P14(QCONST16(1.4142f,14), gc);
 
          scale = SHL32(MULT16_16(PDIV32_16(SHL32(EXTEND32(gc),SIG_SHIFT-6),filter_ratio),(1+el)),6);
 
-         compute_impulse_response(st->interp_qlpc, st->bw_lpc1, st->bw_lpc2, syn_resp, st->subframeSize, st->lpcSize, stack);
+         compute_impulse_response(st->interp_qlpc, bw_lpc1, bw_lpc2, syn_resp, st->subframeSize, st->lpcSize, stack);
 
          
          /* Reset excitation */
          for (i=0;i<st->subframeSize;i++)
-            exc[i]=VERY_SMALL;
+            res[i]=VERY_SMALL;
          
          /* Compute zero response (ringing) of A(z/g1) / ( A(z/g2) * Aq(z) ) */
          for (i=0;i<st->lpcSize;i++)
             mem[i]=st->mem_sp[i];
-         iir_mem2(exc, st->interp_qlpc, exc, st->subframeSize, st->lpcSize, mem);
+         iir_mem16(res, st->interp_qlpc, res, st->subframeSize, st->lpcSize, mem, stack);
 
          for (i=0;i<st->lpcSize;i++)
             mem[i]=st->mem_sw[i];
-         filter_mem2(exc, st->bw_lpc1, st->bw_lpc2, res, st->subframeSize, st->lpcSize, mem);
+         filter_mem16(res, bw_lpc1, bw_lpc2, res, st->subframeSize, st->lpcSize, mem, stack);
 
          /* Compute weighted signal */
          for (i=0;i<st->lpcSize;i++)
             mem[i]=st->mem_sw[i];
-         filter_mem2(sp, st->bw_lpc1, st->bw_lpc2, sw, st->subframeSize, st->lpcSize, mem);
+         filter_mem16(sp, bw_lpc1, bw_lpc2, sw, st->subframeSize, st->lpcSize, mem, stack);
 
          /* Compute target signal */
          for (i=0;i<st->subframeSize;i++)
-            target[i]=PSHR32(sw[i]-res[i],SIG_SHIFT);
-
-         for (i=0;i<st->subframeSize;i++)
-           exc[i]=0;
+            target[i]=SUB16(sw[i],res[i]);
 
          signal_div(target, target, scale, st->subframeSize);
 
@@ -764,22 +690,13 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
             innov[i]=0;
 
          /*print_vec(target, st->subframeSize, "\ntarget");*/
-         SUBMODE(innovation_quant)(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2, 
+         SUBMODE(innovation_quant)(target, st->interp_qlpc, bw_lpc1, bw_lpc2, 
                                    SUBMODE(innovation_params), st->lpcSize, st->subframeSize, 
                                    innov, syn_resp, bits, stack, st->complexity, SUBMODE(double_codebook));
          /*print_vec(target, st->subframeSize, "after");*/
 
          signal_mul(innov, innov, scale, st->subframeSize);
 
-         for (i=0;i<st->subframeSize;i++)
-            exc[i] = ADD32(exc[i], innov[i]);
-
-         if (st->innov_save)
-         {
-            for (i=0;i<st->subframeSize;i++)
-               innov_save[2*i]=innov[i];
-         }
-         
          if (SUBMODE(double_codebook)) {
             char *tmp_stack=stack;
             VARDECL(spx_sig_t *innov2);
@@ -787,42 +704,44 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
             for (i=0;i<st->subframeSize;i++)
                innov2[i]=0;
             for (i=0;i<st->subframeSize;i++)
-               target[i]*=2.5;
-            SUBMODE(innovation_quant)(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2, 
+               target[i]=MULT16_16_P13(QCONST16(2.5f,13), target[i]);
+
+            SUBMODE(innovation_quant)(target, st->interp_qlpc, bw_lpc1, bw_lpc2, 
                                       SUBMODE(innovation_params), st->lpcSize, st->subframeSize, 
                                       innov2, syn_resp, bits, stack, st->complexity, 0);
+            signal_mul(innov2, innov2, MULT16_32_P15(QCONST16(0.4f,15),scale), st->subframeSize);
+
             for (i=0;i<st->subframeSize;i++)
-               innov2[i]*=scale*(1/2.5)/SIG_SCALING;
-            for (i=0;i<st->subframeSize;i++)
-               exc[i] = ADD32(exc[i],innov2[i]);
+               innov[i] = ADD32(innov[i],innov2[i]);
             stack = tmp_stack;
          }
+         for (i=0;i<st->subframeSize;i++)
+            exc[i] = PSHR32(innov[i],SIG_SHIFT);
+
+         if (st->innov_rms_save)
+         {
+            st->innov_rms_save[sub] = MULT16_16_Q15(QCONST16(.70711f, 15), compute_rms(innov, st->subframeSize));
+         }
+         st->exc_rms[sub] = compute_rms16(exc, st->subframeSize);
+         
 
       }
 
+      
       /*Keep the previous memory*/
       for (i=0;i<st->lpcSize;i++)
          mem[i]=st->mem_sp[i];
       /* Final signal synthesis from excitation */
-      iir_mem2(exc, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, st->mem_sp);
+      iir_mem16(exc, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, st->mem_sp, stack);
       
       /* Compute weighted signal again, from synthesized speech (not sure it's the right thing) */
-      filter_mem2(sp, st->bw_lpc1, st->bw_lpc2, sw, st->subframeSize, st->lpcSize, st->mem_sw);
+      filter_mem16(sp, bw_lpc1, bw_lpc2, sw, st->subframeSize, st->lpcSize, st->mem_sw, stack);
    }
 
-
-#ifdef RESYNTH
-   /* Reconstruct the original */
-   fir_mem_up(st->x0d, h0, st->y0, st->full_frame_size, QMF_ORDER, st->g0_mem, stack);
-   fir_mem_up(st->high, h1, st->y1, st->full_frame_size, QMF_ORDER, st->g1_mem, stack);
-
-   for (i=0;i<st->full_frame_size;i++)
-      in[i]=SHR(st->y0[i]-st->y1[i], SIG_SHIFT-1);
-#endif
    for (i=0;i<st->lpcSize;i++)
-      st->old_lsp[i] = st->lsp[i];
+      st->old_lsp[i] = lsp[i];
    for (i=0;i<st->lpcSize;i++)
-      st->old_qlsp[i] = st->qlsp[i];
+      st->old_qlsp[i] = qlsp[i];
 
    st->first=0;
 
@@ -835,26 +754,24 @@ int sb_encode(void *state, void *vin, SpeexBits *bits)
 
 void *sb_decoder_init(const SpeexMode *m)
 {
-   int tmp;
+   spx_int32_t tmp;
    SBDecState *st;
    const SpeexSBMode *mode;
    st = (SBDecState*)speex_alloc(sizeof(SBDecState));
    if (!st)
       return NULL;
-#if defined(VAR_ARRAYS) || defined (USE_ALLOCA)
-   st->stack = NULL;
-#else
-   st->stack = (char*)speex_alloc_scratch(SB_DEC_STACK);
-#endif
    st->mode = m;
    mode=(const SpeexSBMode*)m->mode;
-
    st->encode_submode = 1;
 
-
-
-
    st->st_low = speex_decoder_init(mode->nb_mode);
+#if defined(VAR_ARRAYS) || defined (USE_ALLOCA)
+   st->stack = NULL;
+#else
+   /*st->stack = (char*)speex_alloc_scratch(SB_DEC_STACK);*/
+   speex_decoder_ctl(st->st_low, SPEEX_GET_STACK, &st->stack);
+#endif
+
    st->full_frame_size = 2*mode->frameSize;
    st->frame_size = mode->frameSize;
    st->subframeSize = mode->subframeSize;
@@ -870,29 +787,18 @@ void *sb_decoder_init(const SpeexMode *m)
 
    st->first=1;
 
-
-   st->x0d = (spx_sig_t*)speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   st->x1d = (spx_sig_t*)speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   st->high = (spx_sig_t*)speex_alloc((st->full_frame_size)*sizeof(spx_sig_t));
-   st->y0 = (spx_sig_t*)speex_alloc((st->full_frame_size)*sizeof(spx_sig_t));
-   st->y1 = (spx_sig_t*)speex_alloc((st->full_frame_size)*sizeof(spx_sig_t));
-
    st->g0_mem = (spx_word32_t*)speex_alloc((QMF_ORDER)*sizeof(spx_word32_t));
    st->g1_mem = (spx_word32_t*)speex_alloc((QMF_ORDER)*sizeof(spx_word32_t));
 
-   st->exc = (spx_sig_t*)speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   st->excBuf = (spx_sig_t*)speex_alloc((st->subframeSize)*sizeof(spx_sig_t));
+   st->excBuf = (spx_word16_t*)speex_alloc((st->subframeSize)*sizeof(spx_word16_t));
 
-   st->qlsp = (spx_lsp_t*)speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
    st->old_qlsp = (spx_lsp_t*)speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
-   st->interp_qlsp = (spx_lsp_t*)speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
    st->interp_qlpc = (spx_coef_t*)speex_alloc(st->lpcSize*sizeof(spx_coef_t));
 
    st->pi_gain = (spx_word32_t*)speex_alloc((st->nbSubframes)*sizeof(spx_word32_t));
+   st->exc_rms = (spx_word16_t*)speex_alloc((st->nbSubframes)*sizeof(spx_word16_t));
    st->mem_sp = (spx_mem_t*)speex_alloc((2*st->lpcSize)*sizeof(spx_mem_t));
    
-   st->low_innov = (spx_word32_t*)speex_alloc((st->frame_size)*sizeof(spx_word32_t));
-   speex_decoder_ctl(st->st_low, SPEEX_SET_INNOVATION_SAVE, st->low_innov);
    st->innov_save = NULL;
 
 
@@ -911,23 +817,16 @@ void sb_decoder_destroy(void *state)
    st = (SBDecState*)state;
    speex_decoder_destroy(st->st_low);
 #if !(defined(VAR_ARRAYS) || defined (USE_ALLOCA))
-   speex_free_scratch(st->stack);
+   /*speex_free_scratch(st->stack);*/
 #endif
 
-   speex_free(st->x0d);
-   speex_free(st->x1d);
-   speex_free(st->high);
-   speex_free(st->y0);
-   speex_free(st->y1);
    speex_free(st->g0_mem);
    speex_free(st->g1_mem);
-   speex_free(st->exc);
    speex_free(st->excBuf);
-   speex_free(st->qlsp);
    speex_free(st->old_qlsp);
-   speex_free(st->interp_qlsp);
    speex_free(st->interp_qlpc);
    speex_free(st->pi_gain);
+   speex_free(st->exc_rms);
    speex_free(st->mem_sp);
 
    speex_free(state);
@@ -943,7 +842,7 @@ static void sb_decode_lost(SBDecState *st, spx_word16_t *out, int dtx, char *sta
       saved_modeid=st->submodeID;
       st->submodeID=1;
    } else {
-      bw_lpc(GAMMA_SCALING*0.99, st->interp_qlpc, st->interp_qlpc, st->lpcSize);
+      bw_lpc(QCONST16(0.99f,15), st->interp_qlpc, st->interp_qlpc, st->lpcSize);
    }
 
    st->first=1;
@@ -952,25 +851,17 @@ static void sb_decode_lost(SBDecState *st, spx_word16_t *out, int dtx, char *sta
    /* Final signal synthesis from excitation */
    if (!dtx)
    {
-      spx_word16_t low_ener;
-      low_ener = .9*compute_rms(st->exc, st->frame_size);
-      for (i=0;i<st->frame_size;i++)
-         st->exc[i] = speex_rand(low_ener, &st->seed);
+      st->last_ener =  MULT16_16_Q15(QCONST16(.9f,15),st->last_ener);
    }
-
    for (i=0;i<st->frame_size;i++)
-      st->high[i]=st->exc[i];
+      out[i+st->frame_size] = speex_rand(st->last_ener, &st->seed);
 
-   iir_mem2(st->high, st->interp_qlpc, st->high, st->frame_size, st->lpcSize, 
-            st->mem_sp);
+   iir_mem16(out+st->frame_size, st->interp_qlpc, out+st->frame_size, st->frame_size, st->lpcSize, 
+            st->mem_sp, stack);
    
    
    /* Reconstruct the original */
-   fir_mem_up(st->x0d, h0, st->y0, st->full_frame_size, QMF_ORDER, st->g0_mem, stack);
-   fir_mem_up(st->high, h1, st->y1, st->full_frame_size, QMF_ORDER, st->g1_mem, stack);
-
-   mix_and_saturate(st->y0, st->y1, out, st->full_frame_size);
-
+   qmf_synth(out, out+st->frame_size, h0, out, st->full_frame_size, QMF_ORDER, st->g0_mem, st->g1_mem, stack);
    if (dtx)
    {
       st->submodeID=saved_modeid;
@@ -987,26 +878,24 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
    int ret;
    char *stack;
    VARDECL(spx_word32_t *low_pi_gain);
-   VARDECL(spx_word16_t *low_exc);
+   VARDECL(spx_word16_t *low_exc_rms);
    VARDECL(spx_coef_t *ak);
-   int dtx;
+   VARDECL(spx_lsp_t *qlsp);
+   VARDECL(spx_lsp_t *interp_qlsp);
+   spx_int32_t dtx;
    const SpeexSBMode *mode;
    spx_word16_t *out = (spx_word16_t*)vout;
+   spx_word16_t *low_innov_alias;
+   spx_word32_t exc_ener_sum = 0;
    
    st = (SBDecState*)state;
    stack=st->stack;
    mode = (const SpeexSBMode*)(st->mode->mode);
 
-   {
-      VARDECL(spx_word16_t *low);
-      ALLOC(low, st->frame_size, spx_word16_t);
-      
-      /* Decode the low-band */
-      ret = speex_decode_native(st->st_low, bits, low);
-      
-      for (i=0;i<st->frame_size;i++)
-         st->x0d[i] = SHL((spx_sig_t)low[i], SIG_SHIFT);
-   }
+   low_innov_alias = out+st->frame_size;
+   speex_decoder_ctl(st->st_low, SPEEX_SET_INNOVATION_SAVE, low_innov_alias);
+   /* Decode the low-band */
+   ret = speex_decode_native(st->st_low, bits, out);
 
    speex_decoder_ctl(st->st_low, SPEEX_GET_DTX_STATUS, &dtx);
 
@@ -1042,7 +931,7 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
       }
       if (st->submodeID != 0 && st->submodes[st->submodeID] == NULL)
       {
-         speex_warning("Invalid mode encountered: corrupted stream?");
+         speex_notify("Invalid mode encountered. The stream is corrupted.");
          return -2;
       }
    }
@@ -1057,51 +946,49 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
       }
 
       for (i=0;i<st->frame_size;i++)
-         st->exc[i]=VERY_SMALL;
+         out[st->frame_size+i]=VERY_SMALL;
 
       st->first=1;
 
       /* Final signal synthesis from excitation */
-      iir_mem2(st->exc, st->interp_qlpc, st->high, st->frame_size, st->lpcSize, st->mem_sp);
-
-      fir_mem_up(st->x0d, h0, st->y0, st->full_frame_size, QMF_ORDER, st->g0_mem, stack);
-      fir_mem_up(st->high, h1, st->y1, st->full_frame_size, QMF_ORDER, st->g1_mem, stack);
+      iir_mem16(out+st->frame_size, st->interp_qlpc, out+st->frame_size, st->frame_size, st->lpcSize, st->mem_sp, stack);
 
-      mix_and_saturate(st->y0, st->y1, out, st->full_frame_size);
+      qmf_synth(out, out+st->frame_size, h0, out, st->full_frame_size, QMF_ORDER, st->g0_mem, st->g1_mem, stack);
 
       return 0;
 
    }
 
-   for (i=0;i<st->frame_size;i++)
-      st->exc[i]=0;
-
    ALLOC(low_pi_gain, st->nbSubframes, spx_word32_t);
-   ALLOC(low_exc, st->frame_size, spx_word16_t);
+   ALLOC(low_exc_rms, st->nbSubframes, spx_word16_t);
    speex_decoder_ctl(st->st_low, SPEEX_GET_PI_GAIN, low_pi_gain);
-   speex_decoder_ctl(st->st_low, SPEEX_GET_EXC, low_exc);
+   speex_decoder_ctl(st->st_low, SPEEX_GET_EXC, low_exc_rms);
 
-   SUBMODE(lsp_unquant)(st->qlsp, st->lpcSize, bits);
+   ALLOC(qlsp, st->lpcSize, spx_lsp_t);
+   ALLOC(interp_qlsp, st->lpcSize, spx_lsp_t);
+   SUBMODE(lsp_unquant)(qlsp, st->lpcSize, bits);
    
    if (st->first)
    {
       for (i=0;i<st->lpcSize;i++)
-         st->old_qlsp[i] = st->qlsp[i];
+         st->old_qlsp[i] = qlsp[i];
    }
    
    ALLOC(ak, st->lpcSize, spx_coef_t);
 
    for (sub=0;sub<st->nbSubframes;sub++)
    {
-      spx_sig_t *exc, *sp, *innov_save=NULL;
+      VARDECL(spx_word32_t *exc);
+      spx_word16_t *innov_save=NULL;
+      spx_word16_t *sp;
       spx_word16_t filter_ratio;
       spx_word16_t el=0;
       int offset;
       spx_word32_t rl=0,rh=0;
       
       offset = st->subframeSize*sub;
-      sp=st->high+offset;
-      exc=st->exc+offset;
+      sp=out+st->frame_size+offset;
+      ALLOC(exc, st->subframeSize, spx_word32_t);
       /* Pointer for saving innovation */
       if (st->innov_save)
       {
@@ -1111,12 +998,12 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
       }
       
       /* LSP interpolation */
-      lsp_interpolate(st->old_qlsp, st->qlsp, st->interp_qlsp, st->lpcSize, sub, st->nbSubframes);
+      lsp_interpolate(st->old_qlsp, qlsp, interp_qlsp, st->lpcSize, sub, st->nbSubframes);
 
-      lsp_enforce_margin(st->interp_qlsp, st->lpcSize, LSP_MARGIN);
+      lsp_enforce_margin(interp_qlsp, st->lpcSize, LSP_MARGIN);
 
       /* LSP to LPC */
-      lsp_to_lpc(st->interp_qlsp, ak, st->lpcSize, stack);
+      lsp_to_lpc(interp_qlsp, ak, st->lpcSize, stack);
 
       /* Calculate reponse ratio between the low and high filter in the middle
          of the band (4000 Hz) */
@@ -1125,13 +1012,13 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
          rh = LPC_SCALING;
          for (i=0;i<st->lpcSize;i+=2)
          {
-            rh += st->interp_qlpc[i+1] - st->interp_qlpc[i];
-            st->pi_gain[sub] += st->interp_qlpc[i] + st->interp_qlpc[i+1];
+            rh += ak[i+1] - ak[i];
+            st->pi_gain[sub] += ak[i] + ak[i+1];
          }
 
          rl = low_pi_gain[sub];
 #ifdef FIXED_POINT
-         filter_ratio=PDIV32_16(SHL(rl+82,2),SHR(82+rh,5));
+         filter_ratio=EXTRACT16(SATURATE(PDIV32(SHL32(ADD32(rl,82),7),ADD32(82,rh)),32767));
 #else
          filter_ratio=(rl+.01)/(rh+.01);
 #endif
@@ -1140,60 +1027,32 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
          exc[i]=0;
       if (!SUBMODE(innovation_unquant))
       {
-         float g;
+         spx_word32_t g;
          int quant;
 
          quant = speex_bits_unpack_unsigned(bits, 5);
-         g= exp(((float)quant-10)/8.0);
+         g= spx_exp(MULT16_16(QCONST16(.125f,11),(quant-10)));
          
-#ifdef FIXED_POINT
-         g /= filter_ratio/128.;
-#else
-         g /= filter_ratio;
-#endif
-         /* High-band excitation using the low-band excitation and a gain */
+         g = PDIV32(g, filter_ratio);
          
-#if 0
-         for (i=0;i<st->subframeSize;i++)
-            exc[i]=mode->folding_gain*g*st->low_innov[offset+i];
-#else
+         for (i=0;i<st->subframeSize;i+=2)
          {
-            float tmp=1;
-            /*static tmp1=0,tmp2=0;
-            static int seed=1;
-            el = compute_rms(low_innov+offset, st->subframeSize);*/
-            for (i=0;i<st->subframeSize;i++)
-            {
-               float e=tmp*g*mode->folding_gain*st->low_innov[offset+i];
-               tmp *= -1;
-               exc[i] = e;
-               /*float r = speex_rand(g*el,&seed);
-               exc[i] = .5*(r+tmp2 + e-tmp1);
-               tmp1 = e;
-               tmp2 = r;*/               
-            }
-            
+            exc[i]=SHL32(MULT16_32_P15(MULT16_16_Q15(mode->folding_gain,low_innov_alias[offset+i]),SHL32(g,6)),SIG_SHIFT);
+            exc[i+1]=NEG32(SHL32(MULT16_32_P15(MULT16_16_Q15(mode->folding_gain,low_innov_alias[offset+i+1]),SHL32(g,6)),SIG_SHIFT));
          }
          
-#endif    
       } else {
          spx_word16_t gc;
          spx_word32_t scale;
          int qgc = speex_bits_unpack_unsigned(bits, 4);
-
-         el = compute_rms16(low_exc+offset, st->subframeSize);
-
-#ifdef FIXED_POINT
-         gc = MULT16_32_Q15(28626,gc_quant_bound[qgc]);
-#else
-         gc = exp((1/3.7)*qgc-0.15556);
-#endif
+         
+         el = low_exc_rms[sub];
+         gc = MULT16_16_Q15(QCONST16(0.87360,15),gc_quant_bound[qgc]);
 
          if (st->subframeSize==80)
-            gc *= 1.4142;
-
-         scale = SHL(MULT16_16(PDIV32_16(SHL(gc,SIG_SHIFT-6),filter_ratio),(1+el)),6);
+            gc = MULT16_16_P14(QCONST16(1.4142f,14),gc);
 
+         scale = SHL32(PDIV32(SHL32(MULT16_16(gc, el),3), filter_ratio),SIG_SHIFT-3);
          SUBMODE(innovation_unquant)(exc, SUBMODE(innovation_params), st->subframeSize, 
                                      bits, stack, &st->seed);
 
@@ -1207,8 +1066,7 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
                innov2[i]=0;
             SUBMODE(innovation_unquant)(innov2, SUBMODE(innovation_params), st->subframeSize, 
                                         bits, stack, &st->seed);
-            for (i=0;i<st->subframeSize;i++)
-               innov2[i]*=scale/(float)SIG_SCALING*(1/2.5);
+            signal_mul(innov2, innov2, MULT16_32_P15(QCONST16(0.4f,15),scale), st->subframeSize);
             for (i=0;i<st->subframeSize;i++)
                exc[i] = ADD32(exc[i],innov2[i]);
             stack = tmp_stack;
@@ -1219,27 +1077,25 @@ int sb_decode(void *state, SpeexBits *bits, void *vout)
       if (st->innov_save)
       {
          for (i=0;i<st->subframeSize;i++)
-            innov_save[2*i]=exc[i];
+            innov_save[2*i]=EXTRACT16(PSHR32(exc[i],SIG_SHIFT));
       }
       
       for (i=0;i<st->subframeSize;i++)
          sp[i]=st->excBuf[i];
-      iir_mem2(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
-               st->mem_sp);
+      iir_mem16(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
+               st->mem_sp, stack);
       for (i=0;i<st->subframeSize;i++)
-         st->excBuf[i]=exc[i];
+         st->excBuf[i]=EXTRACT16(PSHR32(exc[i],SIG_SHIFT));
       for (i=0;i<st->lpcSize;i++)
          st->interp_qlpc[i] = ak[i];
-
+      st->exc_rms[sub] = compute_rms16(st->excBuf, st->subframeSize);
+      exc_ener_sum = ADD32(exc_ener_sum, DIV32(MULT16_16(st->exc_rms[sub],st->exc_rms[sub]), st->nbSubframes));
    }
-
-   fir_mem_up(st->x0d, h0, st->y0, st->full_frame_size, QMF_ORDER, st->g0_mem, stack);
-   fir_mem_up(st->high, h1, st->y1, st->full_frame_size, QMF_ORDER, st->g1_mem, stack);
-
-   mix_and_saturate(st->y0, st->y1, out, st->full_frame_size);
-
+   st->last_ener = spx_sqrt(exc_ener_sum);
+   
+   qmf_synth(out, out+st->frame_size, h0, out, st->full_frame_size, QMF_ORDER, st->g0_mem, st->g1_mem, stack);
    for (i=0;i<st->lpcSize;i++)
-      st->old_qlsp[i] = st->qlsp[i];
+      st->old_qlsp[i] = qlsp[i];
 
    st->first=0;
 
@@ -1254,10 +1110,10 @@ int sb_encoder_ctl(void *state, int request, void *ptr)
    switch(request)
    {
    case SPEEX_GET_FRAME_SIZE:
-      (*(int*)ptr) = st->full_frame_size;
+      (*(spx_int32_t*)ptr) = st->full_frame_size;
       break;
    case SPEEX_SET_HIGH_MODE:
-      st->submodeSelect = st->submodeID = (*(int*)ptr);
+      st->submodeSelect = st->submodeID = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_SET_LOW_MODE:
       speex_encoder_ctl(st->st_low, SPEEX_SET_LOW_MODE, ptr);
@@ -1275,22 +1131,22 @@ int sb_encoder_ctl(void *state, int request, void *ptr)
       speex_encoder_ctl(st, SPEEX_SET_QUALITY, ptr);
       break;
    case SPEEX_SET_VBR:
-      st->vbr_enabled = (*(int*)ptr);
+      st->vbr_enabled = (*(spx_int32_t*)ptr);
       speex_encoder_ctl(st->st_low, SPEEX_SET_VBR, ptr);
       break;
    case SPEEX_GET_VBR:
-      (*(int*)ptr) = st->vbr_enabled;
+      (*(spx_int32_t*)ptr) = st->vbr_enabled;
       break;
    case SPEEX_SET_VAD:
-      st->vad_enabled = (*(int*)ptr);
+      st->vad_enabled = (*(spx_int32_t*)ptr);
       speex_encoder_ctl(st->st_low, SPEEX_SET_VAD, ptr);
       break;
    case SPEEX_GET_VAD:
-      (*(int*)ptr) = st->vad_enabled;
+      (*(spx_int32_t*)ptr) = st->vad_enabled;
       break;
    case SPEEX_SET_VBR_QUALITY:
       {
-         int q;
+         spx_int32_t q;
          float qual = (*(float*)ptr)+.6;
          st->vbr_quality = (*(float*)ptr);
          if (qual>10)
@@ -1311,7 +1167,7 @@ int sb_encoder_ctl(void *state, int request, void *ptr)
       speex_encoder_ctl(st->st_low, SPEEX_SET_VBR, &st->vbr_enabled);
       if (st->vbr_enabled) 
       {
-         int i=10, rate, target;
+         spx_int32_t i=10, rate, target;
          float vbr_qual;
          target = (*(spx_int32_t*)ptr);
          while (i>=0)
@@ -1337,8 +1193,8 @@ int sb_encoder_ctl(void *state, int request, void *ptr)
       break;
    case SPEEX_SET_QUALITY:
       {
-         int nb_qual;
-         int quality = (*(int*)ptr);
+         spx_int32_t nb_qual;
+         int quality = (*(spx_int32_t*)ptr);
          if (quality < 0)
             quality = 0;
          if (quality > 10)
@@ -1350,16 +1206,16 @@ int sb_encoder_ctl(void *state, int request, void *ptr)
       break;
    case SPEEX_SET_COMPLEXITY:
       speex_encoder_ctl(st->st_low, SPEEX_SET_COMPLEXITY, ptr);
-      st->complexity = (*(int*)ptr);
+      st->complexity = (*(spx_int32_t*)ptr);
       if (st->complexity<1)
          st->complexity=1;
       break;
    case SPEEX_GET_COMPLEXITY:
-      (*(int*)ptr) = st->complexity;
+      (*(spx_int32_t*)ptr) = st->complexity;
       break;
    case SPEEX_SET_BITRATE:
       {
-         int i=10;
+         spx_int32_t i=10;
          spx_int32_t rate, target;
          target = (*(spx_int32_t*)ptr);
          while (i>=0)
@@ -1397,25 +1253,23 @@ int sb_encoder_ctl(void *state, int request, void *ptr)
          int i;
          st->first = 1;
          for (i=0;i<st->lpcSize;i++)
-            st->lsp[i]=(M_PI*((float)(i+1)))/(st->lpcSize+1);
+            st->old_lsp[i]=(M_PI*((float)(i+1)))/(st->lpcSize+1);
          for (i=0;i<st->lpcSize;i++)
             st->mem_sw[i]=st->mem_sp[i]=st->mem_sp2[i]=0;
-         for (i=0;i<st->bufSize;i++)
-            st->excBuf[i]=0;
          for (i=0;i<QMF_ORDER;i++)
-            st->h0_mem[i]=st->h1_mem[i]=st->g0_mem[i]=st->g1_mem[i]=0;
+            st->h0_mem[i]=st->h1_mem[i]=0;
       }
       break;
    case SPEEX_SET_SUBMODE_ENCODING:
-      st->encode_submode = (*(int*)ptr);
-      speex_encoder_ctl(st->st_low, SPEEX_SET_SUBMODE_ENCODING, &ptr);
+      st->encode_submode = (*(spx_int32_t*)ptr);
+      speex_encoder_ctl(st->st_low, SPEEX_SET_SUBMODE_ENCODING, ptr);
       break;
    case SPEEX_GET_SUBMODE_ENCODING:
-      (*(int*)ptr) = st->encode_submode;
+      (*(spx_int32_t*)ptr) = st->encode_submode;
       break;
    case SPEEX_GET_LOOKAHEAD:
       speex_encoder_ctl(st->st_low, SPEEX_GET_LOOKAHEAD, ptr);
-      (*(int*)ptr) = 2*(*(int*)ptr) + QMF_ORDER - 1;
+      (*(spx_int32_t*)ptr) = 2*(*(spx_int32_t*)ptr) + QMF_ORDER - 1;
       break;
    case SPEEX_SET_PLC_TUNING:
       speex_encoder_ctl(st->st_low, SPEEX_SET_PLC_TUNING, ptr);
@@ -1474,33 +1328,22 @@ int sb_encoder_ctl(void *state, int request, void *ptr)
    case SPEEX_GET_EXC:
       {
          int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
-         for (i=0;i<st->full_frame_size;i++)
-            e[i]=0;
-         for (i=0;i<st->frame_size;i++)
-            e[2*i]=2*st->exc[i];
-      }
-      break;
-   case SPEEX_GET_INNOV:
-      {
-         int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
-         for (i=0;i<st->full_frame_size;i++)
-            e[i]=0;
-         for (i=0;i<st->frame_size;i++)
-            e[2*i]=2*st->exc[i];
+         for (i=0;i<st->nbSubframes;i++)
+            ((spx_word16_t*)ptr)[i] = st->exc_rms[i];
       }
       break;
    case SPEEX_GET_RELATIVE_QUALITY:
       (*(float*)ptr)=st->relative_quality;
       break;
    case SPEEX_SET_INNOVATION_SAVE:
-      st->innov_save = (spx_sig_t*)ptr;
+      st->innov_rms_save = (spx_word16_t*)ptr;
       break;
    case SPEEX_SET_WIDEBAND:
       speex_encoder_ctl(st->st_low, SPEEX_SET_WIDEBAND, ptr);
       break;
-
+   case SPEEX_GET_STACK:
+      *((char**)ptr) = st->stack;
+      break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
       return -1;
@@ -1515,7 +1358,7 @@ int sb_decoder_ctl(void *state, int request, void *ptr)
    switch(request)
    {
    case SPEEX_SET_HIGH_MODE:
-      st->submodeID = (*(int*)ptr);
+      st->submodeID = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_SET_LOW_MODE:
       speex_decoder_ctl(st->st_low, SPEEX_SET_LOW_MODE, ptr);
@@ -1524,20 +1367,20 @@ int sb_decoder_ctl(void *state, int request, void *ptr)
       speex_decoder_ctl(st->st_low, SPEEX_GET_LOW_MODE, ptr);
       break;
    case SPEEX_GET_FRAME_SIZE:
-      (*(int*)ptr) = st->full_frame_size;
+      (*(spx_int32_t*)ptr) = st->full_frame_size;
       break;
    case SPEEX_SET_ENH:
       speex_decoder_ctl(st->st_low, request, ptr);
-      st->lpc_enh_enabled = *((int*)ptr);
+      st->lpc_enh_enabled = *((spx_int32_t*)ptr);
       break;
    case SPEEX_GET_ENH:
-      *((int*)ptr) = st->lpc_enh_enabled;
+      *((spx_int32_t*)ptr) = st->lpc_enh_enabled;
       break;
    case SPEEX_SET_MODE:
    case SPEEX_SET_QUALITY:
       {
-         int nb_qual;
-         int quality = (*(int*)ptr);
+         spx_int32_t nb_qual;
+         int quality = (*(spx_int32_t*)ptr);
          if (quality < 0)
             quality = 0;
          if (quality > 10)
@@ -1578,18 +1421,19 @@ int sb_decoder_ctl(void *state, int request, void *ptr)
             st->mem_sp[i]=0;
          for (i=0;i<QMF_ORDER;i++)
             st->g0_mem[i]=st->g1_mem[i]=0;
+         st->last_ener=0;
       }
       break;
    case SPEEX_SET_SUBMODE_ENCODING:
-      st->encode_submode = (*(int*)ptr);
-      speex_decoder_ctl(st->st_low, SPEEX_SET_SUBMODE_ENCODING, &ptr);
+      st->encode_submode = (*(spx_int32_t*)ptr);
+      speex_decoder_ctl(st->st_low, SPEEX_SET_SUBMODE_ENCODING, ptr);
       break;
    case SPEEX_GET_SUBMODE_ENCODING:
-      (*(int*)ptr) = st->encode_submode;
+      (*(spx_int32_t*)ptr) = st->encode_submode;
       break;
    case SPEEX_GET_LOOKAHEAD:
       speex_decoder_ctl(st->st_low, SPEEX_GET_LOOKAHEAD, ptr);
-      (*(int*)ptr) = 2*(*(int*)ptr);
+      (*(spx_int32_t*)ptr) = 2*(*(spx_int32_t*)ptr);
       break;
    case SPEEX_SET_HIGHPASS:
       speex_decoder_ctl(st->st_low, SPEEX_SET_HIGHPASS, ptr);
@@ -1609,33 +1453,22 @@ int sb_decoder_ctl(void *state, int request, void *ptr)
    case SPEEX_GET_EXC:
       {
          int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
-         for (i=0;i<st->full_frame_size;i++)
-            e[i]=0;
-         for (i=0;i<st->frame_size;i++)
-            e[2*i]=2*st->exc[i];
-      }
-      break;
-   case SPEEX_GET_INNOV:
-      {
-         int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
-         for (i=0;i<st->full_frame_size;i++)
-            e[i]=0;
-         for (i=0;i<st->frame_size;i++)
-            e[2*i]=2*st->exc[i];
+         for (i=0;i<st->nbSubframes;i++)
+            ((spx_word16_t*)ptr)[i] = st->exc_rms[i];
       }
       break;
    case SPEEX_GET_DTX_STATUS:
       speex_decoder_ctl(st->st_low, SPEEX_GET_DTX_STATUS, ptr);
       break;
    case SPEEX_SET_INNOVATION_SAVE:
-      st->innov_save = (spx_sig_t*)ptr;
+      st->innov_save = (spx_word16_t*)ptr;
       break;
    case SPEEX_SET_WIDEBAND:
       speex_decoder_ctl(st->st_low, SPEEX_SET_WIDEBAND, ptr);
       break;
-
+   case SPEEX_GET_STACK:
+      *((char**)ptr) = st->stack;
+      break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
       return -1;
diff --git a/libspeex/sb_celp.h b/libspeex/sb_celp.h
index 4da03e4..a0dc3af 100644
--- a/libspeex/sb_celp.h
+++ b/libspeex/sb_celp.h
@@ -58,37 +58,21 @@ typedef struct SBEncState {
    spx_word16_t  gamma2;          /**< Perceptual weighting coef 2 */
 
    char  *stack;                  /**< Temporary allocation stack */
-   spx_sig_t *x0d, *x1d;          /**< QMF filter signals*/
-   spx_sig_t *high;               /**< High-band signal (buffer) */
-   spx_sig_t *y0, *y1;            /**< QMF synthesis signals */
+   spx_word16_t *high;               /**< High-band signal (buffer) */
    spx_word16_t *h0_mem, *h1_mem;
-   spx_word32_t *g0_mem, *g1_mem; /**< QMF memories */
 
-   spx_sig_t *excBuf;             /**< High-band excitation */
-   spx_sig_t *exc;                /**< High-band excitation (for QMF only)*/
-   spx_sig_t *res;                /**< Zero-input response (ringing) */
-   spx_sig_t *sw;                 /**< Perceptually weighted signal */
    const spx_word16_t *window;    /**< LPC analysis window */
    spx_word16_t *lagWindow;       /**< Auto-correlation window */
-   spx_word16_t *autocorr;        /**< Auto-correlation (for LPC analysis) */
-   spx_coef_t *lpc;               /**< LPC coefficients */
-   spx_lsp_t *lsp;                /**< LSP coefficients */
-   spx_lsp_t *qlsp;               /**< Quantized LSPs */
    spx_lsp_t *old_lsp;            /**< LSPs of previous frame */
    spx_lsp_t *old_qlsp;           /**< Quantized LSPs of previous frame */
-   spx_lsp_t *interp_lsp;         /**< Interpolated LSPs for current sub-frame */
-   spx_lsp_t *interp_qlsp;        /**< Interpolated quantized LSPs for current sub-frame */
-   spx_coef_t *interp_lpc;        /**< Interpolated LPCs for current sub-frame */
    spx_coef_t *interp_qlpc;       /**< Interpolated quantized LPCs for current sub-frame */
-   spx_coef_t *bw_lpc1;           /**< Bandwidth-expanded version of LPCs (#1) */
-   spx_coef_t *bw_lpc2;           /**< Bandwidth-expanded version of LPCs (#2) */
 
    spx_mem_t *mem_sp;             /**< Synthesis signal memory */
    spx_mem_t *mem_sp2;
    spx_mem_t *mem_sw;             /**< Perceptual signal memory */
    spx_word32_t *pi_gain;
-   spx_sig_t *innov_save;         /**< If non-NULL, innovation is copied here */
-   spx_sig_t *low_innov;          /**< Lower-band innovation is copied here magically */
+   spx_word16_t *exc_rms;
+   spx_word16_t *innov_rms_save;         /**< If non-NULL, innovation is copied here */
 
    float  vbr_quality;            /**< Quality setting for VBR encoding */
    int    vbr_enabled;            /**< 1 for enabling VBR, 0 otherwise */
@@ -125,23 +109,18 @@ typedef struct SBDecState {
    int    lpc_enh_enabled;
 
    char  *stack;
-   spx_sig_t *x0d, *x1d;
-   spx_sig_t *high;
-   spx_sig_t *y0, *y1;
    spx_word32_t *g0_mem, *g1_mem;
 
-   spx_sig_t *exc;
-   spx_sig_t *excBuf;
-   spx_lsp_t *qlsp;
+   spx_word16_t *excBuf;
    spx_lsp_t *old_qlsp;
-   spx_lsp_t *interp_qlsp;
    spx_coef_t *interp_qlpc;
 
    spx_mem_t *mem_sp;
    spx_word32_t *pi_gain;
-   spx_sig_t *innov_save;      /** If non-NULL, innovation is copied here */
-   spx_sig_t *low_innov;       /** Lower-band innovation is copied here magically */
+   spx_word16_t *exc_rms;
+   spx_word16_t *innov_save;      /** If non-NULL, innovation is copied here */
    
+   spx_word16_t last_ener;
    spx_int32_t seed;
 
    int    encode_submode;
diff --git a/libspeex/speex.c b/libspeex/speex.c
index 94829e6..846e021 100644
--- a/libspeex/speex.c
+++ b/libspeex/speex.c
@@ -86,7 +86,7 @@ int speex_decode_native(void *state, SpeexBits *bits, spx_word16_t *out)
 int speex_encode(void *state, float *in, SpeexBits *bits)
 {
    int i;
-   int N;
+   spx_int32_t N;
    spx_int16_t short_in[MAX_IN_SAMPLES];
    speex_encoder_ctl(state, SPEEX_GET_FRAME_SIZE, &N);
    for (i=0;i<N;i++)
@@ -111,7 +111,7 @@ int speex_encode_int(void *state, spx_int16_t *in, SpeexBits *bits)
 int speex_decode(void *state, SpeexBits *bits, float *out)
 {
    int i, ret;
-   int N;
+   spx_int32_t N;
    spx_int16_t short_out[MAX_IN_SAMPLES];
    speex_decoder_ctl(state, SPEEX_GET_FRAME_SIZE, &N);
    ret = (*((SpeexMode**)state))->dec(state, bits, short_out);
@@ -136,7 +136,7 @@ int speex_encode(void *state, float *in, SpeexBits *bits)
 int speex_encode_int(void *state, spx_int16_t *in, SpeexBits *bits)
 {
    int i;
-   int N;
+   spx_int32_t N;
    float float_in[MAX_IN_SAMPLES];
    speex_encoder_ctl(state, SPEEX_GET_FRAME_SIZE, &N);
    for (i=0;i<N;i++)
@@ -152,7 +152,7 @@ int speex_decode(void *state, SpeexBits *bits, float *out)
 int speex_decode_int(void *state, SpeexBits *bits, spx_int16_t *out)
 {
    int i;
-   int N;
+   spx_int32_t N;
    float float_out[MAX_IN_SAMPLES];
    int ret;
    speex_decoder_ctl(state, SPEEX_GET_FRAME_SIZE, &N);
diff --git a/libspeex/speex_callbacks.c b/libspeex/speex_callbacks.c
index 0b99188..682322e 100644
--- a/libspeex/speex_callbacks.c
+++ b/libspeex/speex_callbacks.c
@@ -73,7 +73,7 @@ int speex_inband_handler(SpeexBits *bits, SpeexCallback *callback_list, void *st
 
 int speex_std_mode_request_handler(SpeexBits *bits, void *state, void *data)
 {
-   int m;
+   spx_int32_t m;
    m = speex_bits_unpack_unsigned(bits, 4);
    speex_encoder_ctl(data, SPEEX_SET_MODE, &m);
    return 0;
@@ -81,7 +81,7 @@ int speex_std_mode_request_handler(SpeexBits *bits, void *state, void *data)
 
 int speex_std_low_mode_request_handler(SpeexBits *bits, void *state, void *data)
 {
-   int m;
+   spx_int32_t m;
    m = speex_bits_unpack_unsigned(bits, 4);
    speex_encoder_ctl(data, SPEEX_SET_LOW_MODE, &m);
    return 0;
@@ -89,7 +89,7 @@ int speex_std_low_mode_request_handler(SpeexBits *bits, void *state, void *data)
 
 int speex_std_high_mode_request_handler(SpeexBits *bits, void *state, void *data)
 {
-   int m;
+   spx_int32_t m;
    m = speex_bits_unpack_unsigned(bits, 4);
    speex_encoder_ctl(data, SPEEX_SET_HIGH_MODE, &m);
    return 0;
@@ -97,7 +97,7 @@ int speex_std_high_mode_request_handler(SpeexBits *bits, void *state, void *data
 
 int speex_std_vbr_request_handler(SpeexBits *bits, void *state, void *data)
 {
-   int vbr;
+   spx_int32_t vbr;
    vbr = speex_bits_unpack_unsigned(bits, 1);
    speex_encoder_ctl(data, SPEEX_SET_VBR, &vbr);
    return 0;
@@ -105,7 +105,7 @@ int speex_std_vbr_request_handler(SpeexBits *bits, void *state, void *data)
 
 int speex_std_enh_request_handler(SpeexBits *bits, void *state, void *data)
 {
-   int enh;
+   spx_int32_t enh;
    enh = speex_bits_unpack_unsigned(bits, 1);
    speex_decoder_ctl(data, SPEEX_SET_ENH, &enh);
    return 0;
@@ -113,7 +113,7 @@ int speex_std_enh_request_handler(SpeexBits *bits, void *state, void *data)
 
 int speex_std_vbr_quality_request_handler(SpeexBits *bits, void *state, void *data)
 {
-   int qual;
+   float qual;
    qual = speex_bits_unpack_unsigned(bits, 4);
    speex_encoder_ctl(data, SPEEX_SET_VBR_QUALITY, &qual);
    return 0;
diff --git a/libspeex/speex_header.c b/libspeex/speex_header.c
index 7fc2f5a..8e10851 100644
--- a/libspeex/speex_header.c
+++ b/libspeex/speex_header.c
@@ -133,14 +133,14 @@ SpeexHeader *speex_packet_to_header(char *packet, int size)
    for (i=0;i<8;i++)
       if (packet[i]!=h[i])
       {
-         speex_warning ("This doesn't look like a Speex file");
+         speex_notify("This doesn't look like a Speex file");
          return NULL;
       }
    
    /*FIXME: Do we allow larger headers?*/
    if (size < (int)sizeof(SpeexHeader))
    {
-      speex_warning("Speex header too small");
+      speex_notify("Speex header too small");
       return NULL;
    }
    
diff --git a/libspeex/testdenoise.c b/libspeex/testdenoise.c
index 177227d..42644cb 100644
--- a/libspeex/testdenoise.c
+++ b/libspeex/testdenoise.c
@@ -24,9 +24,9 @@ int main()
    speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_AGC_LEVEL, &f);
    i=0;
    speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_DEREVERB, &i);
-   f=.4;
+   f=.0;
    speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_DEREVERB_DECAY, &f);
-   f=.3;
+   f=.0;
    speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_DEREVERB_LEVEL, &f);
    while (1)
    {
@@ -34,7 +34,7 @@ int main()
       fread(in, sizeof(short), NN, stdin);
       if (feof(stdin))
          break;
-      vad = speex_preprocess(st, in, NULL);
+      vad = speex_preprocess_run(st, in);
       /*fprintf (stderr, "%d\n", vad);*/
       fwrite(in, sizeof(short), NN, stdout);
       count++;
diff --git a/libspeex/testecho.c b/libspeex/testecho.c
index fc5bf34..60d76d5 100644
--- a/libspeex/testecho.c
+++ b/libspeex/testecho.c
@@ -18,7 +18,6 @@
 int main(int argc, char **argv)
 {
    int echo_fd, ref_fd, e_fd;
-   spx_int32_t noise[NN+1];
    short echo_buf[NN], ref_buf[NN], e_buf[NN];
    SpeexEchoState *st;
    SpeexPreprocessState *den;
@@ -36,12 +35,13 @@ int main(int argc, char **argv)
    den = speex_preprocess_state_init(NN, 8000);
    int tmp = 8000;
    mc_echo_ctl(st, SPEEX_ECHO_SET_SAMPLING_RATE, &tmp);
+   speex_preprocess_ctl(den, SPEEX_PREPROCESS_SET_ECHO_STATE, st);
 
    while (read(ref_fd, ref_buf, NN*2))
    {
       read(echo_fd, echo_buf, NN*2);
-      mc_echo_cancel(st, ref_buf, echo_buf, e_buf, noise);
-      /*speex_preprocess(den, e_buf, noise);*/
+      mc_echo_cancellation(st, ref_buf, echo_buf, e_buf);
+      speex_preprocess_run(den, e_buf);
       write(e_fd, e_buf, NN*2);
    }
    mc_echo_state_destroy(st);
diff --git a/libspeex/testenc.c b/libspeex/testenc.c
index a7ad409..eabd02c 100644
--- a/libspeex/testenc.c
+++ b/libspeex/testenc.c
@@ -27,9 +27,9 @@ int main(int argc, char **argv)
    void *st;
    void *dec;
    SpeexBits bits;
-   int tmp;
+   spx_int32_t tmp;
    int bitCount=0;
-   int skip_group_delay;
+   spx_int32_t skip_group_delay;
    SpeexCallback callback;
 
    sigpow = 0;
diff --git a/libspeex/testenc_uwb.c b/libspeex/testenc_uwb.c
index 7512336..e9bf18a 100644
--- a/libspeex/testenc_uwb.c
+++ b/libspeex/testenc_uwb.c
@@ -28,9 +28,9 @@ int main(int argc, char **argv)
    void *st;
    void *dec;
    SpeexBits bits;
-   int tmp;
+   spx_int32_t tmp;
    int bitCount=0;
-   int skip_group_delay;
+   spx_int32_t skip_group_delay;
    SpeexCallback callback;
 
    sigpow = 0;
diff --git a/libspeex/testenc_wb.c b/libspeex/testenc_wb.c
index 7a19189..8e515cb 100644
--- a/libspeex/testenc_wb.c
+++ b/libspeex/testenc_wb.c
@@ -28,9 +28,9 @@ int main(int argc, char **argv)
    void *st;
    void *dec;
    SpeexBits bits;
-   int tmp;
+   spx_int32_t tmp;
    int bitCount=0;
-   int skip_group_delay;
+   spx_int32_t skip_group_delay;
    SpeexCallback callback;
 
    sigpow = 0;
diff --git a/libspeex/testresample.c b/libspeex/testresample.c
new file mode 100644
index 0000000..71392cc
--- /dev/null
+++ b/libspeex/testresample.c
@@ -0,0 +1,86 @@
+/* Copyright (C) 2007 Jean-Marc Valin
+      
+   File: testresample.c
+   Testing the resampling code
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include "speex/speex_resampler.h"
+#include <math.h>
+#include <stdlib.h>
+
+#define NN 256
+
+int main() /* Resampler smoke test: reads blocks of NN shorts from stdin, resamples them as floats, writes rounded shorts to stdout. */
+{
+   spx_uint32_t i;
+   short *in; /* raw input block read from stdin (NN shorts) */
+   short *out; /* rounded output block written to stdout (up to 2*NN shorts) */
+   float *fin, *fout; /* float copies of the input / output blocks handed to the resampler */
+   int count = 0; /* number of blocks processed; only referenced by the commented-out quality switch below */
+   SpeexResamplerState *st = speex_resampler_init(1, 8000, 12000, 10, NULL); /* presumably 1 channel, 8000 -> 12000 Hz, quality 10, no error output -- confirm against speex_resampler.h */
+   speex_resampler_set_rate(st, 96000, 44100); /* replaces the rates given at init; presumably in=96000, out=44100 -- confirm */
+   speex_resampler_skip_zeros(st);
+   
+   in = malloc(NN*sizeof(short)); /* NOTE(review): none of the four malloc results below are checked for NULL */
+   out = malloc(2*NN*sizeof(short)); /* output buffers hold 2*NN samples, matching out_len below */
+   fin = malloc(NN*sizeof(float));
+   fout = malloc(2*NN*sizeof(float));
+   while (1)
+   {
+      spx_uint32_t in_len;
+      spx_uint32_t out_len;
+      fread(in, sizeof(short), NN, stdin); /* NOTE(review): return value ignored; only feof() below ends the loop */
+      if (feof(stdin)) /* stop at end of input; a short final block also sets EOF and is therefore dropped */
+         break;
+      for (i=0;i<NN;i++)
+         fin[i]=in[i]; /* widen short samples to float */
+      in_len = NN; /* samples available in fin */
+      out_len = 2*NN; /* capacity of fout; read back after the call as the number of samples to write */
+      /*if (count==2)
+         speex_resampler_set_quality(st, 10);*/
+      speex_resampler_process_float(st, 0, fin, &in_len, fout, &out_len); /* second argument 0 is presumably the channel index -- confirm; in_len is not inspected afterwards */
+      for (i=0;i<out_len;i++)
+         out[i]=floor(.5+fout[i]); /* round to nearest; NOTE(review): no clamping to the range of short before the store */
+      /*speex_warning_int("writing", out_len);*/
+      fwrite(out, sizeof(short), out_len, stdout); /* emit exactly out_len samples for this block */
+      count++;
+   }
+   speex_resampler_destroy(st);
+   free(in);
+   free(out);
+   free(fin);
+   free(fout);
+   return 0;
+}
+
diff --git a/libspeex/vbr.c b/libspeex/vbr.c
index bfd1fa6..d24ec0f 100644
--- a/libspeex/vbr.c
+++ b/libspeex/vbr.c
@@ -47,29 +47,29 @@
 
 
 const float vbr_nb_thresh[9][11]={
-   {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0}, /*   CNG   */
-   { 3.5,  2.5,  2.0,  1.2,  0.5,  0.0, -0.5, -0.7, -0.8, -0.9, -1.0}, /*  2 kbps */
-   {10.0,  6.5,  5.2,  4.5,  3.9,  3.5,  3.0,  2.5,  2.3,  1.8,  1.0}, /*  6 kbps */
-   {11.0,  8.8,  7.5,  6.5,  5.0,  3.9,  3.9,  3.9,  3.5,  3.0,  1.0}, /*  8 kbps */
-   {11.0, 11.0,  9.9,  9.0,  8.0,  7.0,  6.5,  6.0,  5.0,  4.0,  2.0}, /* 11 kbps */
-   {11.0, 11.0, 11.0, 11.0,  9.5,  9.0,  8.0,  7.0,  6.5,  5.0,  3.0}, /* 15 kbps */
-   {11.0, 11.0, 11.0, 11.0, 11.0, 11.0,  9.5,  8.5,  8.0,  6.5,  4.0}, /* 18 kbps */
-   {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0,  9.8,  7.5,  5.5}, /* 24 kbps */ 
-   { 8.0,  5.0,  3.7,  3.0,  2.5,  2.0,  1.8,  1.5,  1.0,  0.0,  0.0}  /*  4 kbps */
+   {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}, /*   CNG   */
+   { 3.5f,  2.5f,  2.0f,  1.2f,  0.5f,  0.0f, -0.5f, -0.7f, -0.8f, -0.9f, -1.0f}, /*  2 kbps */
+   {10.0f,  6.5f,  5.2f,  4.5f,  3.9f,  3.5f,  3.0f,  2.5f,  2.3f,  1.8f,  1.0f}, /*  6 kbps */
+   {11.0f,  8.8f,  7.5f,  6.5f,  5.0f,  3.9f,  3.9f,  3.9f,  3.5f,  3.0f,  1.0f}, /*  8 kbps */
+   {11.0f, 11.0f,  9.9f,  9.0f,  8.0f,  7.0f,  6.5f,  6.0f,  5.0f,  4.0f,  2.0f}, /* 11 kbps */
+   {11.0f, 11.0f, 11.0f, 11.0f,  9.5f,  9.0f,  8.0f,  7.0f,  6.5f,  5.0f,  3.0f}, /* 15 kbps */
+   {11.0f, 11.0f, 11.0f, 11.0f, 11.0f, 11.0f,  9.5f,  8.5f,  8.0f,  6.5f,  4.0f}, /* 18 kbps */
+   {11.0f, 11.0f, 11.0f, 11.0f, 11.0f, 11.0f, 11.0f, 11.0f,  9.8f,  7.5f,  5.5f}, /* 24 kbps */ 
+   { 8.0f,  5.0f,  3.7f,  3.0f,  2.5f,  2.0f,  1.8f,  1.5f,  1.0f,  0.0f,  0.0f}  /*  4 kbps */
 };
 
 
 const float vbr_hb_thresh[5][11]={
-   {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0}, /* silence */
-   {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0}, /*  2 kbps */
-   {11.0, 11.0,  9.5,  8.5,  7.5,  6.0,  5.0,  3.9,  3.0,  2.0,  1.0}, /*  6 kbps */
-   {11.0, 11.0, 11.0, 11.0, 11.0,  9.5,  8.7,  7.8,  7.0,  6.5,  4.0}, /* 10 kbps */
-   {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0,  9.8,  7.5,  5.5}  /* 18 kbps */ 
+   {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}, /* silence */
+   {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}, /*  2 kbps */
+   {11.0f, 11.0f,  9.5f,  8.5f,  7.5f,  6.0f,  5.0f,  3.9f,  3.0f,  2.0f,  1.0f}, /*  6 kbps */
+   {11.0f, 11.0f, 11.0f, 11.0f, 11.0f,  9.5f,  8.7f,  7.8f,  7.0f,  6.5f,  4.0f}, /* 10 kbps */
+   {11.0f, 11.0f, 11.0f, 11.0f, 11.0f, 11.0f, 11.0f, 11.0f,  9.8f,  7.5f,  5.5f}  /* 18 kbps */ 
 };
 
 const float vbr_uhb_thresh[2][11]={
-   {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0}, /* silence */
-   { 3.9,  2.5,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0, -1.0}  /*  2 kbps */
+   {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}, /* silence */
+   { 3.9f,  2.5f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f, -1.0f}  /*  2 kbps */
 };
 
 void vbr_init(VBRState *vbr)
diff --git a/libspeex/window.c b/libspeex/window.c
index 3748f65..65b1917 100644
--- a/libspeex/window.c
+++ b/libspeex/window.c
@@ -65,30 +65,30 @@ const spx_word16_t lpc_window[200] = {
 };
 #else
 const spx_word16_t lpc_window[200] = {
-   0.080000, 0.080158, 0.080630, 0.081418, 0.082520, 0.083935, 0.085663, 0.087703,
-   0.090052, 0.092710, 0.095674, 0.098943, 0.102514, 0.106385, 0.110553, 0.115015,
-   0.119769, 0.124811, 0.130137, 0.135744, 0.141628, 0.147786, 0.154212, 0.160902,
-   0.167852, 0.175057, 0.182513, 0.190213, 0.198153, 0.206328, 0.214731, 0.223357,
-   0.232200, 0.241254, 0.250513, 0.259970, 0.269619, 0.279453, 0.289466, 0.299651,
-   0.310000, 0.320507, 0.331164, 0.341965, 0.352901, 0.363966, 0.375151, 0.386449,
-   0.397852, 0.409353, 0.420943, 0.432615, 0.444361, 0.456172, 0.468040, 0.479958,
-   0.491917, 0.503909, 0.515925, 0.527959, 0.540000, 0.552041, 0.564075, 0.576091,
-   0.588083, 0.600042, 0.611960, 0.623828, 0.635639, 0.647385, 0.659057, 0.670647,
-   0.682148, 0.693551, 0.704849, 0.716034, 0.727099, 0.738035, 0.748836, 0.759493,
-   0.770000, 0.780349, 0.790534, 0.800547, 0.810381, 0.820030, 0.829487, 0.838746,
-   0.847800, 0.856643, 0.865269, 0.873672, 0.881847, 0.889787, 0.897487, 0.904943,
-   0.912148, 0.919098, 0.925788, 0.932214, 0.938372, 0.944256, 0.949863, 0.955189,
-   0.960231, 0.964985, 0.969447, 0.973615, 0.977486, 0.981057, 0.984326, 0.987290,
-   0.989948, 0.992297, 0.994337, 0.996065, 0.997480, 0.998582, 0.999370, 0.999842,
-   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-   1.000000, 1.000000, 1.000000, 0.998640, 0.994566, 0.987787, 0.978324, 0.966203,
-   0.951458, 0.934131, 0.914270, 0.891931, 0.867179, 0.840084, 0.810723, 0.779182,
-   0.745551, 0.709930, 0.672424, 0.633148, 0.592223, 0.549781, 0.505964, 0.460932,
-   0.414863, 0.367968, 0.320511, 0.272858, 0.225569, 0.179655, 0.137254, 0.103524
+   0.080000f, 0.080158f, 0.080630f, 0.081418f, 0.082520f, 0.083935f, 0.085663f, 0.087703f,
+   0.090052f, 0.092710f, 0.095674f, 0.098943f, 0.102514f, 0.106385f, 0.110553f, 0.115015f,
+   0.119769f, 0.124811f, 0.130137f, 0.135744f, 0.141628f, 0.147786f, 0.154212f, 0.160902f,
+   0.167852f, 0.175057f, 0.182513f, 0.190213f, 0.198153f, 0.206328f, 0.214731f, 0.223357f,
+   0.232200f, 0.241254f, 0.250513f, 0.259970f, 0.269619f, 0.279453f, 0.289466f, 0.299651f,
+   0.310000f, 0.320507f, 0.331164f, 0.341965f, 0.352901f, 0.363966f, 0.375151f, 0.386449f,
+   0.397852f, 0.409353f, 0.420943f, 0.432615f, 0.444361f, 0.456172f, 0.468040f, 0.479958f,
+   0.491917f, 0.503909f, 0.515925f, 0.527959f, 0.540000f, 0.552041f, 0.564075f, 0.576091f,
+   0.588083f, 0.600042f, 0.611960f, 0.623828f, 0.635639f, 0.647385f, 0.659057f, 0.670647f,
+   0.682148f, 0.693551f, 0.704849f, 0.716034f, 0.727099f, 0.738035f, 0.748836f, 0.759493f,
+   0.770000f, 0.780349f, 0.790534f, 0.800547f, 0.810381f, 0.820030f, 0.829487f, 0.838746f,
+   0.847800f, 0.856643f, 0.865269f, 0.873672f, 0.881847f, 0.889787f, 0.897487f, 0.904943f,
+   0.912148f, 0.919098f, 0.925788f, 0.932214f, 0.938372f, 0.944256f, 0.949863f, 0.955189f,
+   0.960231f, 0.964985f, 0.969447f, 0.973615f, 0.977486f, 0.981057f, 0.984326f, 0.987290f,
+   0.989948f, 0.992297f, 0.994337f, 0.996065f, 0.997480f, 0.998582f, 0.999370f, 0.999842f,
+   1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+   1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+   1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+   1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+   1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+   1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+   1.000000f, 1.000000f, 1.000000f, 0.998640f, 0.994566f, 0.987787f, 0.978324f, 0.966203f,
+   0.951458f, 0.934131f, 0.914270f, 0.891931f, 0.867179f, 0.840084f, 0.810723f, 0.779182f,
+   0.745551f, 0.709930f, 0.672424f, 0.633148f, 0.592223f, 0.549781f, 0.505964f, 0.460932f,
+   0.414863f, 0.367968f, 0.320511f, 0.272858f, 0.225569f, 0.179655f, 0.137254f, 0.103524f
 };
 #endif
author	Jean-Marc Valin <Jean-Marc.Valin@csiro.au>	2007-05-04 09:11:18 +0400
committer	Jean-Marc Valin <Jean-Marc.Valin@csiro.au>	2008-05-19 08:53:14 +0400
commit	3d7a6f0bd0a60145d8ac3a2f4037da623f407fba (patch)
tree	f90d32540ec3269ef8a405a2e97daaf2f83ffcab /libspeex
parent	6bd022014a21ecca9c27d6041397009a5933ac39 (diff)
parent	d2cddf7e2f3c1a75265c43cabaa391037c830745 (diff)