
github.com/xiph/speex.git
author    jm <jm@0101bb08-14d6-0310-b084-bc0e0c8e3800>  2007-03-15 16:13:25 +0300
committer jm <jm@0101bb08-14d6-0310-b084-bc0e0c8e3800>  2007-03-15 16:13:25 +0300
commit    4c7f0a5ffd7ea5e52968aad1e89108e124be012b
tree      0c5858c0c5b1f04b9d20173f254ce47a1f098d72 /tmv
parent    57656c5f88e699a8a6b6af442c736b0ef95cbf3d
Merging Trimedia support contributed by Hong Zhiqian (incomplete for now)
git-svn-id: http://svn.xiph.org/trunk/speex@12764 0101bb08-14d6-0310-b084-bc0e0c8e3800
Diffstat (limited to 'tmv')
-rw-r--r--   tmv/_kiss_fft_guts_tm.h     188
-rw-r--r--   tmv/cb_search_tm.h          149
-rw-r--r--   tmv/config.h                 98
-rw-r--r--   tmv/fftwrap_tm.h            233
-rw-r--r--   tmv/filterbank_tm.h         289
-rw-r--r--   tmv/filters_tm.h           1521
-rw-r--r--   tmv/fixed_tm.h              100
-rw-r--r--   tmv/kiss_fft_tm.h           599
-rw-r--r--   tmv/kiss_fftr_tm.h          235
-rw-r--r--   tmv/lpc_tm.h                150
-rw-r--r--   tmv/lsp_tm.h                310
-rw-r--r--   tmv/ltp_tm.h                479
-rw-r--r--   tmv/mdf_tm.h               1192
-rw-r--r--   tmv/misc_tm.h                92
-rw-r--r--   tmv/preprocess_tm.h        1135
-rw-r--r--   tmv/profile_tm.h            407
-rw-r--r--   tmv/quant_lsp_tm.h          448
-rw-r--r--   tmv/speex_config_types.h     31
-rw-r--r--   tmv/vq_tm.h                 494
19 files changed, 8150 insertions, 0 deletions
diff --git a/tmv/_kiss_fft_guts_tm.h b/tmv/_kiss_fft_guts_tm.h
new file mode 100644
index 0000000..9cf2864
--- /dev/null
+++ b/tmv/_kiss_fft_guts_tm.h
@@ -0,0 +1,188 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file _kiss_fft_guts_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _KISS_FFT_GUTS_TM_
+#define _KISS_FFT_GUTS_TM_
+
+#ifdef TM_ASM
+
+#include <ops/custom_defs.h>
+
+#ifdef FIXED_POINT
+
+#undef sround
+#define sround(x) sex16(((x) + (1<<(FRACBITS-1)) ) >> FRACBITS)
+
+#undef MIN
+#undef MAX
+#define MIN(a,b) imin(a,b)
+#define MAX(a,b) imax(a,b)
+
+#define TM_MUL(res,a,b) \
+ { register int a0, a1, b0, b1; \
+ \
+ a0 = sex16((a)); \
+ a1 = asri(16,(a)); \
+ b0 = sex16((b)); \
+ b1 = asri(16,(b)); \
+ (res)= pack16lsb( \
+ sround(ifir16((a),funshift2((b),(b)))), \
+ sround(a0*b0-a1*b1)); \
+ } \
+
+#define TM_ADD(res,a,b) \
+ { (res)=dspidualadd((a),(b)); \
+ } \
+
+#define TM_SUB(res,a,b) \
+ { (res)=dspidualsub((a),(b)); \
+ } \
+
+#define TM_SHR(res,a,shift) \
+ { (res)=dualasr((a),(shift)); \
+ } \
+
+#define TM_DIV(res,c,frac) \
+ { register int c1, c0; \
+ \
+ c1 = asri(16,(c)); \
+ c0 = sex16((c)); \
+ (res) = pack16lsb(sround(c1 * (32767/(frac))), sround(c0 * (32767/(frac))));\
+ } \
+
+#define TM_NEGMSB(res, a) \
+ { (res) = pack16lsb((ineg(asri(16,(a)))), (a)); \
+ } \
+
+#else
+
+#undef MIN
+#undef MAX
+#define MIN(a,b) fmin(a,b)
+#define MAX(a,b) fmax(a,b)
+
+#endif
+#endif
+
+#undef CHECKBUF
+#define CHECKBUF(buf,nbuf,n) \
+ { \
+ if ( nbuf < (size_t)(n) ) { \
+ speex_free(buf); \
+ buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(sizeof(kiss_fft_cpx)*(n)); \
+ nbuf = (size_t)(n); \
+ } \
+ } \
+
+#undef C_ADD
+#define C_ADD( res, a,b) \
+ { \
+ CHECK_OVERFLOW_OP((a).r,+,(b).r) \
+ CHECK_OVERFLOW_OP((a).i,+,(b).i) \
+ (res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \
+ } \
+
+
+#undef C_SUB
+#define C_SUB( res, a,b) \
+ { \
+ CHECK_OVERFLOW_OP((a).r,-,(b).r) \
+ CHECK_OVERFLOW_OP((a).i,-,(b).i) \
+ (res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \
+ } \
+
+#undef C_ADDTO
+#define C_ADDTO( res , a) \
+ { \
+ CHECK_OVERFLOW_OP((res).r,+,(a).r) \
+ CHECK_OVERFLOW_OP((res).i,+,(a).i) \
+ (res).r += (a).r; (res).i += (a).i; \
+ } \
+
+#undef C_SUBFROM
+#define C_SUBFROM( res, a) \
+ { \
+ CHECK_OVERFLOW_OP((res).r,-,(a).r) \
+ CHECK_OVERFLOW_OP((res).i,-,(a).i) \
+ (res).r -= (a).r; (res).i -= (a).i; \
+ } \
+
+#undef kf_cexp
+#define kf_cexp(x,phase) \
+ { (x)->r = KISS_FFT_COS(phase); \
+ (x)->i = KISS_FFT_SIN(phase); } \
+
+#undef kf_cexp2
+#define kf_cexp2(x,phase) \
+ { (x)->r = spx_cos_norm((phase)); \
+ (x)->i = spx_cos_norm((phase)-32768); } \
+
+
+#ifdef FIXED_POINT
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+ { (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
+ (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); } \
+
+#undef C_FIXDIV
+#define C_FIXDIV(c,div) \
+ { DIVSCALAR( (c).r , div); \
+ DIVSCALAR( (c).i , div); } \
+
+#undef C_MULBYSCALAR
+#define C_MULBYSCALAR( c, s ) \
+ { (c).r = sround( smul( (c).r , s ) ) ; \
+ (c).i = sround( smul( (c).i , s ) ) ; } \
+
+#else
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+ { (m).r = (a).r*(b).r - (a).i*(b).i; \
+ (m).i = (a).r*(b).i + (a).i*(b).r; } \
+
+
+#undef C_MULBYSCALAR
+#define C_MULBYSCALAR( c, s ) \
+ { (c).r *= (s); \
+ (c).i *= (s); } \
+
+
+
+#endif
+
+#endif
+
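The fixed-point C_MUL and sround macros above amount to an ordinary rounded Q15 complex multiply. The sketch below shows the same arithmetic in portable C as a point of reference; it assumes 16-bit components and FRACBITS == 15 (as the macros suggest) and is an illustration only, not part of the patch.

/* Illustrative sketch (not part of the patch): portable form of the
 * fixed-point C_MUL above, assuming 16-bit Q15 components and FRACBITS == 15. */
#include <stdint.h>

typedef struct { int16_t r, i; } cpx16;

static int16_t sround_ref(int32_t x)       /* matches sround(): round, then shift */
{
    return (int16_t)((x + (1 << 14)) >> 15);
}

static cpx16 c_mul_ref(cpx16 a, cpx16 b)   /* what C_MUL stores into (m) */
{
    cpx16 m;
    m.r = sround_ref((int32_t)a.r * b.r - (int32_t)a.i * b.i);
    m.i = sround_ref((int32_t)a.r * b.i + (int32_t)a.i * b.r);
    return m;
}
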
diff --git a/tmv/cb_search_tm.h b/tmv/cb_search_tm.h
new file mode 100644
index 0000000..15ca1d5
--- /dev/null
+++ b/tmv/cb_search_tm.h
@@ -0,0 +1,149 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file cb_search_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define OVERRIDE_COMPUTE_WEIGHTED_CODEBOOK
+static void compute_weighted_codebook(
+ const signed char * restrict shape_cb,
+ const Int16 * restrict r,
+ Int16 * restrict resp,
+ Int16 * restrict resp2,
+ Int32 * restrict E,
+ int shape_cb_size, int subvect_size, char *stack)
+{
+ register int i, j;
+ register int quadsize;
+
+ TMDEBUG_ALIGNMEM(r);
+ TMDEBUG_ALIGNMEM(resp);
+ TMDEBUG_ALIGNMEM(resp2);
+
+ COMPUTEWEIGHTEDCODEBOOK_START();
+
+ quadsize = subvect_size << 2;
+
+ for ( i=0 ; i<shape_cb_size ; i+=4 )
+ { register int e0, e1, e2, e3;
+
+ e0 = e1 = e2 = e3 = 0;
+ for( j=0 ; j<subvect_size ; ++j )
+ {
+ register int k, rr;
+ register int resj0, resj1, resj2, resj3;
+
+ resj0 = resj1 = resj2 = resj3 = 0;
+
+ for ( k=0 ; k<=j ; ++k )
+ { rr = r[j-k];
+
+ resj0 += shape_cb[k] * rr;
+ resj1 += shape_cb[k+subvect_size] * rr;
+ resj2 += shape_cb[k+2*subvect_size] * rr;
+ resj3 += shape_cb[k+3*subvect_size] * rr;
+ }
+
+ resj0 >>= 13;
+ resj1 >>= 13;
+ resj2 >>= 13;
+ resj3 >>= 13;
+
+ e0 += resj0 * resj0;
+ e1 += resj1 * resj1;
+ e2 += resj2 * resj2;
+ e3 += resj3 * resj3;
+
+ resp[j] = resj0;
+ resp[j+subvect_size] = resj1;
+ resp[j+2*subvect_size] = resj2;
+ resp[j+3*subvect_size] = resj3;
+ }
+
+ E[i] = e0;
+ E[i+1] = e1;
+ E[i+2] = e2;
+ E[i+3] = e3;
+
+ resp += quadsize;
+ shape_cb += quadsize;
+ }
+
+#ifndef REMARK_ON
+ (void)resp2;
+ (void)stack;
+#endif
+
+ COMPUTEWEIGHTEDCODEBOOK_STOP();
+}
+
+#define OVERRIDE_TARGET_UPDATE
+static inline void target_update(Int16 * restrict t, Int16 g, Int16 * restrict r, int len)
+{
+ register int n;
+ register int gr1, gr2, t1, t2, r1, r2;
+ register int quadsize;
+
+ TARGETUPDATE_START();
+
+ quadsize = len & 0xFFFFFFFC;
+
+ for ( n=0; n<quadsize ; n+=4 )
+ { gr1 = pack16lsb(PSHR32((g * r[n]),13) , PSHR32((g * r[n+1]),13));
+ gr2 = pack16lsb(PSHR32((g * r[n+2]),13), PSHR32((g * r[n+3]),13));
+
+ t1 = pack16lsb(t[n], t[n+1]);
+ t2 = pack16lsb(t[n+2],t[n+3]);
+
+ r1 = dspidualsub(t1, gr1);
+ r2 = dspidualsub(t2, gr2);
+
+ t[n] = asri(16,r1);
+ t[n+1] = sex16(r1);
+ t[n+2] = asri(16,r2);
+ t[n+3] = sex16(r2);
+ }
+
+ for ( n=quadsize ; n<len ; ++n )
+ { t[n] = SUB16(t[n],PSHR32(MULT16_16(g,r[n]),13));
+ }
+
+ TARGETUPDATE_STOP();
+}
+
+#endif
+
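For reference, the four-way unrolled loop in compute_weighted_codebook() above evaluates, for each codebook shape, resp[j] = (sum over k<=j of shape_cb[k]*r[j-k]) >> 13 together with the energy of that response. A single-shape scalar sketch of the same computation follows; it is an illustration only, with the >>13 scaling taken directly from the code above.

/* Illustrative sketch (not part of the patch): scalar form of the per-shape
 * response/energy computed by compute_weighted_codebook() above; the TriMedia
 * version processes four shapes per outer iteration. */
static void weighted_shape_ref(const signed char *shape, const short *r,
                               short *resp, int *E, int subvect_size)
{
    int j, k, e = 0;
    for (j = 0; j < subvect_size; j++) {
        int acc = 0;
        for (k = 0; k <= j; k++)
            acc += shape[k] * r[j - k];   /* causal convolution with r */
        acc >>= 13;                       /* same scaling as above */
        resp[j] = (short)acc;
        e += acc * acc;                   /* accumulate response energy */
    }
    *E = e;
}
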
diff --git a/tmv/config.h b/tmv/config.h
new file mode 100644
index 0000000..0b68fb8
--- /dev/null
+++ b/tmv/config.h
@@ -0,0 +1,98 @@
+#ifndef _CONFIG_H_
+#define _CONFIG_H_
+
+#define USE_COMPACT_KISS_FFT
+//#define USE_KISS_FFT
+
+#ifdef WIN32
+
+//#define FIXED_POINT
+
+#define inline __inline
+#define restrict
+
+#elif defined (__TCS__)
+
+#define FIXED_POINT
+#define PREPROCESS_MDF_FLOAT
+#define TM_ASM
+
+#define TM_DEBUGMEM_ALIGNNMENT 1
+
+#define TM_PROFILE 1
+#define TM_PROFILE_FIRMEM16 0
+#define TM_PROFILE_IIRMEM16 0
+#define TM_PROFILE_FILTERMEM16 0
+#define TM_PROFILE_VQNBEST 0
+#define TM_PROFILE_VQNBESTSIGN 0
+#define TM_PROFILE_COMPUTEQUANTWEIGHTS 0
+#define TM_PROFILE_LSPQUANT 0
+#define TM_PROFILE_LSPWEIGHTQUANT 0
+#define TM_PROFILE_LSPENFORCEMARGIN 0
+#define TM_PROFILE_LSPTOLPC 0
+#define TM_PROFILE_INNERPROD 0
+#define TM_PROFILE_PITCHXCORR 0
+#define TM_PROFILE_LSP_INTERPOLATE 0
+#define TM_PROFILE_CHEBPOLYEVA 0
+#define TM_PROFILE_COMPUTEWEIGHTEDCODEBOOK 0
+#define TM_PROFILE_TARGETUPDATE 0
+#define TM_PROFILE_SPXAUTOCORR 0
+#define TM_PROFILE_COMPUTEPITCHERROR 0
+#define TM_PROFILE_COMPUTERMS16 0
+#define TM_PROFILE_NORMALIZE16 0
+#define TM_PROFILE_BWLPC 0
+#define TM_PROFILE_HIGHPASS 0
+#define TM_PROFILE_SIGNALMUL 0
+#define TM_PROFILE_SIGNALDIV 0
+#define TM_PROFILE_COMPUTEIMPULSERESPONSE 0
+#define TM_PROFILE_PITCHGAINSEARCH3TAPVQ 0
+#define TM_PROFILE_OPENLOOPNBESTPITCH 0
+#define TM_PROFILE_PREPROCESSANALYSIS 0
+#define TM_PROFILE_UPDATENOISEPROB 0
+#define TM_PROFILE_COMPUTEGAINFLOOR 0
+#define TM_PROFILE_FILTERDCNOTCH16 0
+#define TM_PROFILE_MDFINNERPROD 0
+#define TM_PROFILE_SPECTRALMULACCUM 0
+#define TM_PROFILE_WEIGHTEDSPECTRALMULCONJ 0
+#define TM_PROFILE_MDFADJUSTPROP 0
+#define TM_PROFILE_SPEEXECHOGETRESIDUAL 0
+#define TM_PROFILE_MAXIMIZERANGE 0
+#define TM_PROFILE_RENORMRANGE 0
+#define TM_PROFILE_POWERSPECTRUM 0
+#define TM_PROFILE_QMFSYNTH 0
+#define TM_PROFILE_QMFDECOMP 0
+#define TM_PROFILE_FILTERBANKCOMPUTEBANK32 0
+#define TM_PROFILE_FILTERBANKCOMPUTEPSD16 0
+
+#define TM_UNROLL 1
+#define TM_UNROLL_FILTER 1
+#define TM_UNROLL_IIR 1
+#define TM_UNROLL_FIR 1
+#define TM_UNROLL_HIGHPASS 1
+#define TM_UNROLL_SIGNALMUL 1
+#define TM_UNROLL_SIGNALDIV 1
+#define TM_UNROLL_VQNBEST 1
+#define TM_UNROLL_VQSIGNNBEST 1
+#define TM_UNROLL__SPXAUTOCORR 1
+#define TM_UNROLL_COMPUTERMS16 1
+#define TM_UNROLL_COMPUTEIMPULSERESPONSE 1
+#define TM_UNROLL_QMFSYNTH 1
+#define TM_UNROLL_PITCHGAINSEARCH3TAPVQ 1
+#define TM_UNROLL_OPENLOOPNBESTPITCH 1
+#define TM_UNROLL_FILTERBANKCOMPUTEBANK32 1
+#define TM_UNROLL_FILTERBANKCOMPUTEPSD16 1
+#define TM_UNROLL_SPEEXPREPROCESSRUN 1
+#define TM_UNROLL_PREPROCESSANALYSIS 1
+#define TM_UNROLL_UPDATENOISEPROB 1
+#define TM_UNROLL_COMPUTEGAINFLOOR 1
+#define TM_UNROLL_SPEEXECHOGETRESIDUAL 1
+#define TM_UNROLL_SPEEXECHOCANCELLATION 1
+#define TM_UNROLL_FILTERDCNOTCH16 1
+#define TM_UNROLL_MDFINNERPRODUCT 1
+#define TM_UNROLL_SPECTRALMULACCUM 1
+#define TM_UNROLL_MDFADJUSTPROP 1
+
+#endif
+
+#endif
+
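The TM_UNROLL_* switches above are consumed in the optimized sources by wrapping loops in TriMedia unrolling pragmas, as in the pattern below. The function and its body are placeholders for illustration; only the #if guard and pragma usage are taken from the patch (see filters_tm.h further down).

/* Illustrative sketch (not part of the patch): how the config.h unrolling
 * switches gate the TriMedia pragmas. my_copy() is a placeholder function. */
#include "config.h"

void my_copy(short *dst, const short *src, int n)
{
    int i;
#if (TM_UNROLL && TM_UNROLL_FIR)
#pragma TCS_unroll=4
#pragma TCS_unrollexact=1
#endif
    for (i = 0; i < n; i++)
        dst[i] = src[i];
#if (TM_UNROLL && TM_UNROLL_FIR)
#pragma TCS_unrollexact=0
#pragma TCS_unroll=0
#endif
}
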
diff --git a/tmv/fftwrap_tm.h b/tmv/fftwrap_tm.h
new file mode 100644
index 0000000..39d924e
--- /dev/null
+++ b/tmv/fftwrap_tm.h
@@ -0,0 +1,233 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file fftwrap_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define OVERRIDE_MAXIMIZE_RANGE
+static int maximize_range(Int16 *in, Int16 *out, int bound, int len)
+{
+ register int max_val=0;
+ register int shift=0;
+ register int i, j;
+
+ TMDEBUG_ALIGNMEM(in);
+ TMDEBUG_ALIGNMEM(out);
+
+ MAXIMIZERANGE_START();
+
+ len >>= 1;
+
+ for ( i=0 ; i<len ; i+=4 )
+ {
+ register int x10, x32, x54, x76;
+
+ x10 = ld32x(in,i);
+ x32 = ld32x(in,i+1);
+ x54 = ld32x(in,i+2);
+ x76 = ld32x(in,i+3);
+
+ x10 = dspidualabs(x10);
+ x32 = dspidualabs(x32);
+ x54 = dspidualabs(x54);
+ x76 = dspidualabs(x76);
+
+ x10 = imax(sex16(x10), asri(16,x10));
+ x32 = imax(sex16(x32), asri(16,x32));
+ x54 = imax(sex16(x54), asri(16,x54));
+ x76 = imax(sex16(x76), asri(16,x76));
+
+ max_val = imax(max_val,x10);
+ max_val = imax(max_val,x32);
+ max_val = imax(max_val,x54);
+ max_val = imax(max_val,x76);
+ }
+
+ while ( max_val <= (bound>>1) && max_val != 0 )
+ { max_val <<= 1;
+ shift++;
+ }
+
+ if ( shift != 0 )
+ {
+ for ( i=0,j=0 ; i<len ; i+=4,j+=16 )
+ {
+ register int x10, x32, x54, x76;
+
+ x10 = ld32x(in,i);
+ x32 = ld32x(in,i+1);
+ x54 = ld32x(in,i+2);
+ x76 = ld32x(in,i+3);
+
+ x10 = dualasl(x10, shift);
+ x32 = dualasl(x32, shift);
+ x54 = dualasl(x54, shift);
+ x76 = dualasl(x76, shift);
+
+ st32d(j,out,x10);
+ st32d(j+4,out,x32);
+ st32d(j+8,out,x54);
+ st32d(j+12,out,x76);
+ }
+ }
+
+ MAXIMIZERANGE_STOP();
+
+ return shift;
+}
+
+#define OVERRIDE_RENORM_RANGE
+static void renorm_range(Int16 *in, Int16 *out, int shift, int len)
+{
+ register int i, j, s, l;
+
+ TMDEBUG_ALIGNMEM(in);
+ TMDEBUG_ALIGNMEM(out);
+
+ RENORMRANGE_START();
+
+ s = (1<<((shift))>>1);
+ s = pack16lsb(s,s);
+
+ len >>= 1;
+ l = len & (int)0xFFFFFFFE;
+
+ for ( i=0,j=0 ; i<l; i+=2,j+=8 )
+ {
+ register int x10, x32;
+
+ x10 = ld32x(in,i);
+ x32 = ld32x(in,i+1);
+
+ x10 = dspidualadd(x10, s);
+ x32 = dspidualadd(x32, s);
+
+ x10 = dualasr(x10, shift);
+ x32 = dualasr(x32, shift);
+
+ st32d(j,out,x10);
+ st32d(j+4,out,x32);
+ }
+
+ if ( len & (int)0x01 )
+ {
+ register int x10;
+
+ x10 = ld32x(in,i);
+ x10 = dspidualadd(x10, s);
+ x10 = dualasr(x10, shift);
+ st32d(j,out,x10);
+ }
+
+ RENORMRANGE_STOP();
+}
+
+#endif
+
+#ifdef USE_COMPACT_KISS_FFT
+#ifdef FIXED_POINT
+
+#define OVERRIDE_POWER_SPECTRUM
+void power_spectrum(const spx_word16_t *X, spx_word32_t *ps, int N)
+{
+ register int x10, x32, x54, x76, *x;
+ register int i;
+
+ x = (int*)(X-1);
+
+ TMDEBUG_ALIGNMEM(x);
+
+ POWERSPECTRUM_START();
+
+ x76 = 0;
+ ps[0] = MULT16_16(X[0],X[0]);
+ N >>= 1;
+
+ for( i=1 ; i<N ; i+=4 )
+ {
+ x10 = ld32x(x, i);
+ x32 = ld32x(x, i+1);
+ x54 = ld32x(x, i+2);
+ x76 = ld32x(x, i+3);
+
+ ps[i] = ifir16(x10,x10);
+ ps[i+1] = ifir16(x32,x32);
+ ps[i+2] = ifir16(x54,x54);
+ ps[i+3] = ifir16(x76,x76);
+ }
+
+ x76 = sex16(x76);
+ ps[N] = x76 * x76;
+
+ POWERSPECTRUM_STOP();
+}
+
+#else
+
+#define OVERRIDE_POWER_SPECTRUM
+void power_spectrum(const float * restrict X, float * restrict ps, int N)
+{
+ register int i, j;
+ register float xx;
+
+ POWERSPECTRUM_START();
+
+ xx = X[0];
+
+ ps[0]=MULT16_16(xx,xx);
+
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+ for (i=1,j=1;i<N-1;i+=2,j++)
+ { register float xi, xii;
+
+ xi = X[i];
+ xii = X[i+1];
+
+ ps[j] = MULT16_16(xi,xi) + MULT16_16(xii,xii);
+ }
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+
+ xx = X[i];
+ ps[j]=MULT16_16(xx,xx);
+
+ POWERSPECTRUM_STOP();
+}
+
+#endif
+#endif
+
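maximize_range() and renorm_range() above implement the usual block-scaling scheme around a fixed-point FFT: find the peak magnitude, shift the signal up until the peak approaches bound, remember the shift, then undo it with rounding afterwards. A scalar sketch of that scheme follows, as an illustration only; the TriMedia version above additionally skips the copy when no shift is needed, i.e. it assumes in-place operation.

/* Illustrative sketch (not part of the patch): scalar form of the block
 * scaling performed by maximize_range()/renorm_range() above. */
static int maximize_range_ref(const short *in, short *out, int bound, int len)
{
    int i, shift = 0, max_val = 0;
    for (i = 0; i < len; i++) {
        int v = in[i] < 0 ? -in[i] : in[i];
        if (v > max_val)
            max_val = v;
    }
    while (max_val <= (bound >> 1) && max_val != 0) {  /* same loop as above */
        max_val <<= 1;
        shift++;
    }
    for (i = 0; i < len; i++)
        out[i] = (short)(in[i] << shift);
    return shift;
}

static void renorm_range_ref(const short *in, short *out, int shift, int len)
{
    int i, s = (1 << shift) >> 1;                      /* rounding constant */
    for (i = 0; i < len; i++)
        out[i] = (short)((in[i] + s) >> shift);
}
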
diff --git a/tmv/filterbank_tm.h b/tmv/filterbank_tm.h
new file mode 100644
index 0000000..39d2332
--- /dev/null
+++ b/tmv/filterbank_tm.h
@@ -0,0 +1,289 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file filterbank_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define OVERRIDE_FILTERBANK_COMPUTE_BANK32
+void filterbank_compute_bank32(FilterBank * restrict bank, spx_word32_t * restrict ps, spx_word32_t * restrict mel)
+{
+ register int i, j, k, banks, len, zero, s;
+ register int * restrict left;
+ register int * restrict right;
+ register int * restrict bleft;
+ register int * restrict bright;
+
+ left = (int*)bank->filter_left;
+ right = (int*)bank->filter_right;
+ bleft = (int*)bank->bank_left;
+ bright = (int*)bank->bank_right;
+
+ TMDEBUG_ALIGNMEM(ps);
+ TMDEBUG_ALIGNMEM(mel);
+ TMDEBUG_ALIGNMEM(left);
+ TMDEBUG_ALIGNMEM(right);
+ TMDEBUG_ALIGNMEM(bleft);
+ TMDEBUG_ALIGNMEM(bright);
+
+ FILTERBANKCOMPUTEBANK32_START();
+
+ banks = bank->nb_banks << 2;
+ zero = 0;
+ len = bank->len;
+ s = (1<<((15))>>1);
+
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEBANK32)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<banks ; i+=4 )
+ { st32d(i, mel, zero);
+ }
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEBANK32)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEBANK32)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=1,k=0 ; i<len ; i+=2,j+=2,++k )
+ { register int ps1, ps0, _mel, ps0_msb, ps0_lsb, ps1_msb, ps1_lsb;
+ register int left10, right10, left1, left0, right1, right0;
+ register int il1, ir1, il0, ir0;
+
+ ps0 = ld32x(ps,i);
+ il0 = ld32x(bleft,i);
+ _mel = ld32x(mel,il0);
+ left10 = ld32x(left,k);
+ ir0 = ld32x(bright,i);
+ right10 = ld32x(right,k);
+
+ ps0_msb = ps0 >> 15;
+ ps0_lsb = ps0 & 0x00007fff;
+ left0 = sex16(left10);
+ right0 = sex16(right10);
+
+ _mel += left0 * ps0_msb + ((left0 * ps0_lsb + s ) >> 15);
+ mel[il0]= _mel;
+ _mel = ld32x(mel,ir0);
+ _mel += right0 * ps0_msb + ((right0 * ps0_lsb + s ) >> 15);
+ mel[ir0]= _mel;
+
+ ps1 = ld32x(ps,j);
+ il1 = ld32x(bleft,j);
+ _mel = ld32x(mel,il1);
+ ir1 = ld32x(bright,j);
+
+ left1 = asri(16,left10);
+ right1 = asri(16,right10);
+ ps1_msb = ps1 >> 15;
+ ps1_lsb = ps1 & 0x00007fff;
+
+ _mel += left1 * ps1_msb + ((left1 * ps1_lsb + s ) >> 15);
+ mel[il1]= _mel;
+ _mel = ld32x(mel,ir1);
+ _mel += right1 * ps1_msb + ((right1 * ps1_lsb + s ) >> 15);
+ mel[ir1]= _mel;
+ }
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEBANK32)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ FILTERBANKCOMPUTEBANK32_STOP();
+}
+
+#define OVERRIDE_FILTERBANK_COMPUTE_PSD16
+void filterbank_compute_psd16(FilterBank * restrict bank, spx_word16_t * restrict mel, spx_word16_t * restrict ps)
+{
+ register int i, j, k, len, s;
+ register int * restrict left;
+ register int * restrict right;
+ register int * restrict bleft;
+ register int * restrict bright;
+
+ left = (int*)bank->filter_left;
+ right = (int*)bank->filter_right;
+ bleft = (int*)bank->bank_left;
+ bright = (int*)bank->bank_right;
+
+ TMDEBUG_ALIGNMEM(ps);
+ TMDEBUG_ALIGNMEM(mel);
+ TMDEBUG_ALIGNMEM(left);
+ TMDEBUG_ALIGNMEM(right);
+ TMDEBUG_ALIGNMEM(bleft);
+ TMDEBUG_ALIGNMEM(bright);
+
+ FILTERBANKCOMPUTEPSD16_START();
+
+ len = bank->len;
+ s = (1<<((15))>>1);
+
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEPSD16)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0,k=0 ; i<len ; i+=2,j+=4,++k )
+ {
+ register int mell0, melr0, mel1, mel0, mell1, melr1;
+ register int il1, ir1, il0, ir0;
+ register int left10, right10, lr1, lr0;
+ register int acc0, acc1, ps10;
+
+ acc0 = acc1 = s;
+
+ il0 = ld32x(bleft, i);
+ ir0 = ld32x(bright,i);
+ mell0 = mel[il0];
+ melr0 = mel[ir0];
+ left10 = ld32x(left, k);
+ right10 = ld32x(right, k);
+ mel0 = pack16lsb(mell0, melr0);
+ lr0 = pack16lsb(left10, right10);
+
+ acc0 += ifir16(mel0, lr0);
+ acc0 >>= 15;
+
+ il1 = ld32x(bleft, i+1);
+ ir1 = ld32x(bright,i+1);
+ mell1 = mel[il1];
+ melr1 = mel[ir1];
+ mel1 = pack16lsb(mell1, melr1);
+ lr1 = pack16msb(left10, right10);
+
+ acc1 += ifir16(mel1, lr1);
+ acc1 >>= 15;
+
+ ps10 = pack16lsb(acc1, acc0);
+
+ st32d(j, ps, ps10);
+ }
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEPSD16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ FILTERBANKCOMPUTEPSD16_STOP();
+}
+
+#else
+
+#define OVERRIDE_FILTERBANK_COMPUTE_BANK32
+void filterbank_compute_bank32(FilterBank * restrict bank, float * restrict ps, float * restrict mel)
+{
+ register int i, banks, len;
+ register int * restrict bleft, * restrict bright;
+ register float * restrict left, * restrict right;
+
+ banks = bank->nb_banks;
+ len = bank->len;
+ bleft = bank->bank_left;
+ bright= bank->bank_right;
+ left = bank->filter_left;
+ right = bank->filter_right;
+
+ FILTERBANKCOMPUTEBANK32_START();
+
+ memset(mel, 0, banks * sizeof(float));
+
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEBANK32)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; ++i)
+ {
+ register int id1, id2;
+ register float psi;
+
+ id1 = bleft[i];
+ id2 = bright[i];
+ psi = ps[i];
+
+ mel[id1] += left[i] * psi;
+ mel[id2] += right[i] * psi;
+ }
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEBANK32)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ FILTERBANKCOMPUTEBANK32_STOP();
+}
+
+#define OVERRIDE_FILTERBANK_COMPUTE_PSD16
+void filterbank_compute_psd16(FilterBank * restrict bank, float * restrict mel, float * restrict ps)
+{
+ register int i, len;
+ register int * restrict bleft, * restrict bright;
+ register float * restrict left, * restrict right;
+
+ len = bank->len;
+ bleft = bank->bank_left;
+ bright= bank->bank_right;
+ left = bank->filter_left;
+ right = bank->filter_right;
+
+ FILTERBANKCOMPUTEPSD16_START();
+
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEPSD16)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; ++i )
+ {
+ register float acc;
+ register int id1, id2;
+
+ id1 = bleft[i];
+ id2 = bright[i];
+
+ acc = mel[id1] * left[i];
+ acc += mel[id2] * right[i];
+
+ ps[i] = acc;
+ }
+#if (TM_UNROLL && TM_UNROLL_FILTERBANKCOMPUTEPSD16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ FILTERBANKCOMPUTEPSD16_STOP();
+}
+
+
+#endif
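In the fixed-point filterbank above, each 32-bit spectrum value is split into a high part (ps >> 15) and a low part (ps & 0x7fff) so that a 16-bit Q15 filter weight can be applied as w*hi + ((w*lo + s) >> 15), a rounded 16x32 multiply (essentially what Speex's MULT16_32_P15 does). The sketch below writes that identity out in plain C as an illustration only.

/* Illustrative sketch (not part of the patch): the rounded Q15 16x32 multiply
 * used by filterbank_compute_bank32() above (w is a Q15 weight, ps a 32-bit
 * power-spectrum value). */
#include <stdint.h>

static int32_t mult16_32_p15_ref(int16_t w, int32_t ps)
{
    int32_t hi = ps >> 15;        /* ps0_msb in the code above */
    int32_t lo = ps & 0x7fff;     /* ps0_lsb in the code above */
    int32_t s  = 1 << 14;         /* (1<<15)>>1, the rounding constant */
    return w * hi + ((w * lo + s) >> 15);
}
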
diff --git a/tmv/filters_tm.h b/tmv/filters_tm.h
new file mode 100644
index 0000000..8f28102
--- /dev/null
+++ b/tmv/filters_tm.h
@@ -0,0 +1,1521 @@
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define iadd(a,b) ((a) + (b))
+
+#define OVERRIDE_BW_LPC
+void bw_lpc(Int16 gamma, const Int16 *lpc_in, Int16 *lpc_out, int order)
+{
+ register int tmp, g, i;
+
+ TMDEBUG_ALIGNMEM(lpc_in);
+ TMDEBUG_ALIGNMEM(lpc_out);
+
+ BWLPC_START();
+
+ tmp = g = gamma;
+ for ( i=0 ; i<4 ; i+=2,lpc_out+=4 )
+ { register int in10, y1, y0, y10;
+ register int in32, y3, y2, y32;
+
+ in10 = ld32x(lpc_in,i);
+ y0 = ((tmp * sex16(in10)) + 16384) >> 15;
+ tmp = ((tmp * g) + 16384) >> 15;
+ y1 = ((tmp * asri(16,in10)) + 16384) >> 15;
+ tmp = ((tmp * g) + 16384) >> 15;
+ y10 = pack16lsb(y1,y0);
+ st32(lpc_out,y10);
+
+ in32 = ld32x(lpc_in,i+1);
+ y2 = ((tmp * sex16(in32)) + 16384) >> 15;
+ tmp = ((tmp * g) + 16384) >> 15;
+ y3 = ((tmp * asri(16,in32)) + 16384) >> 15;
+ tmp = ((tmp * g) + 16384) >> 15;
+ y32 = pack16lsb(y3,y2);
+ st32d(4,lpc_out,y32);
+ }
+
+ if ( order == 10 )
+ { register int in10, y1, y0, y10;
+
+ in10 = ld32x(lpc_in,i);
+ y0 = ((tmp * sex16(in10)) + 16384) >> 15;
+ tmp = ((tmp * g) + 16384) >> 15;
+ y1 = ((tmp * asri(16,in10)) + 16384) >> 15;
+ tmp = ((tmp * g) + 16384) >> 15;
+ y10 = pack16lsb(y1,y0);
+ st32(lpc_out,y10);
+ }
+
+ BWLPC_STOP();
+}
+
+
+#define OVERRIDE_HIGHPASS
+void highpass(const Int16 *x, Int16 *y, int len, int filtID, Int32 *mem)
+{
+ const Int16 Pcoef[5][3] = {{16384, -31313, 14991}, {16384, -31569, 15249}, {16384, -31677, 15328}, {16384, -32313, 15947}, {16384, -22446, 6537}};
+ const Int16 Zcoef[5][3] = {{15672, -31344, 15672}, {15802, -31601, 15802}, {15847, -31694, 15847}, {16162, -32322, 16162}, {14418, -28836, 14418}};
+ register int i;
+ register int den1, den2, num0, num1, num2, den11, den22, mem0, mem1;
+
+ TMDEBUG_ALIGNMEM(mem);
+
+ HIGHPASS_START();
+
+ filtID = imin(4, filtID);
+
+ den1 = -Pcoef[filtID][1];
+ den2 = -Pcoef[filtID][2];
+ num0 = Zcoef[filtID][0];
+ num1 = Zcoef[filtID][1];
+ num2 = Zcoef[filtID][2];
+ den11 = den1 << 1;
+ den22 = den2 << 1;
+ mem0 = mem[0];
+ mem1 = mem[1];
+
+#if (TM_UNROLL && TM_UNROLL_HIGHPASS)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; ++i )
+ {
+ register int yi;
+ register int vout, xi, vout_i, vout_d;
+
+ xi = x[i];
+
+ vout = num0 * xi + mem0;
+ vout_i = vout >> 15;
+ vout_d = vout & 0x7FFF;
+ yi = iclipi(PSHR32(vout,14),32767);
+ mem0 = (mem1 + num1 * xi) + (den11 * vout_i) + (((den1 * vout_d) >> 15) << 1);
+ mem1 = (num2 * xi) + (den22 * vout_i) + (((den2 * vout_d) >> 15) << 1);
+
+ y[i] = yi;
+ }
+#if (TM_UNROLL && TM_UNROLL_HIGHPASS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ mem[0] = mem0;
+ mem[1] = mem1;
+
+ HIGHPASS_STOP();
+}
+
+
+#define OVERRIDE_SIGNALMUL
+void signal_mul(const Int32 *x, Int32 *y, Int32 scale, int len)
+{
+ register int i, scale_i, scale_d;
+
+ SIGNALMUL_START();
+
+ scale_i = scale >> 14;
+ scale_d = scale & 0x3FFF;
+
+#if (TM_UNROLL && TM_UNROLL_SIGNALMUL)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; ++i)
+ {
+ register int xi;
+
+ xi = x[i] >> 7;
+
+ y[i] = ((xi * scale_i + ((xi * scale_d) >> 14)) << 7);
+
+ }
+#if (TM_UNROLL && TM_UNROLL_SIGNALMUL)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ SIGNALMUL_STOP();
+}
+
+#define OVERRIDE_SIGNALDIV
+void signal_div(const Int16 *x, Int16 *y, Int32 scale, int len)
+{
+ register int i;
+
+ SIGNALDIV_START();
+
+ if (scale > SHL32(EXTEND32(SIG_SCALING), 8))
+ {
+ register int scale_1;
+ scale = PSHR32(scale, SIG_SHIFT);
+ scale_1 = DIV32_16(SHL32(EXTEND32(SIG_SCALING),7),scale);
+#if (TM_UNROLL && TM_UNROLL_SIGNALDIV)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; ++i)
+ {
+ y[i] = MULT16_16_P15(scale_1, x[i]);
+ }
+#if (TM_UNROLL && TM_UNROLL_SIGNALDIV)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ } else if (scale > SHR32(EXTEND32(SIG_SCALING), 2)) {
+
+ register int scale_1;
+ scale = PSHR32(scale, SIG_SHIFT-5);
+ scale_1 = DIV32_16(SHL32(EXTEND32(SIG_SCALING),3),scale);
+#if (TM_UNROLL && TM_UNROLL_SIGNALDIV)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<len;i++)
+ {
+ y[i] = PSHR32(MULT16_16(scale_1, SHL16(x[i],2)),8);
+ }
+#if (TM_UNROLL && TM_UNROLL_SIGNALDIV)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ } else {
+
+ register int scale_1;
+ scale = PSHR32(scale, SIG_SHIFT-7);
+ scale = imax(5,scale);
+ scale_1 = DIV32_16(SHL32(EXTEND32(SIG_SCALING),3),scale);
+
+#if (TM_UNROLL && TM_UNROLL_SIGNALDIV)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<len;i++)
+ {
+ y[i] = PSHR32(MULT16_16(scale_1, SHL16(x[i],2)),6);
+ }
+#if (TM_UNROLL && TM_UNROLL_SIGNALDIV)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+
+ SIGNALDIV_STOP();
+}
+
+
+#define OVERRIDE_COMPUTE_RMS
+Int16 compute_rms(const Int32 *x, int len)
+{
+ register int i;
+ register int sum=0;
+ register int max_val=1;
+ register int sig_shift;
+
+ TMDEBUG_ALIGNMEM(x);
+
+ for ( i=0 ; i<len ; i+=4 )
+ {
+ register int tmp0, tmp1, tmp2, tmp3;
+
+ tmp0 = ld32x(x,i);
+ tmp1 = ld32x(x,i+1);
+
+ tmp0 = iabs(tmp0);
+ tmp1 = iabs(tmp1);
+
+ tmp2 = ld32x(x,i+2);
+ tmp3 = ld32x(x,i+3);
+
+ tmp2 = iabs(tmp2);
+ tmp3 = iabs(tmp3);
+
+ tmp0 = imax(tmp0, tmp1);
+ max_val = imax(max_val, tmp0);
+ tmp2 = imax(tmp2, tmp3);
+ max_val = imax(max_val, tmp2);
+ }
+
+ sig_shift = 0;
+ while ( max_val>16383 )
+ { sig_shift++;
+ max_val >>= 1;
+ }
+
+
+ for ( i=0 ; i<len ; i+=4 )
+ {
+ register int acc0, acc1, acc2;
+
+ acc0 = pack16lsb(ld32x(x,i) >> sig_shift, ld32x(x,i+1) >> sig_shift);
+ acc1 = pack16lsb(ld32x(x,i+2) >> sig_shift, ld32x(x,i+3) >> sig_shift);
+ acc2 = ifir16(acc0,acc0) + ifir16(acc1, acc1);
+ sum += acc2 >> 6;
+ }
+
+ return EXTRACT16(PSHR32(SHL32(EXTEND32(spx_sqrt(DIV32(sum,len))),(sig_shift+3)),SIG_SHIFT));
+}
+
+#define OVERRIDE_COMPUTE_RMS16
+Int16 compute_rms16(const Int16 *x, int len)
+{
+ register int max_val, i;
+
+ COMPUTERMS16_START();
+
+ max_val = 10;
+
+#if 0
+
+ {
+ register int len2 = len >> 1;
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len2 ; i+=2 )
+ {
+ register int x10, x32;
+
+ x10 = ld32x(x,i);
+ x32 = ld32x(x,i+1);
+
+ x10 = dspidualabs(x10);
+ x32 = dspidualabs(x32);
+
+ x10 = imax(sex16(x10), asri(16,x10));
+ x32 = imax(sex16(x32), asri(16,x32));
+
+ max_val = imax(max_val,x10);
+ max_val = imax(max_val,x32);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ if (max_val>16383)
+ {
+ register int sum = 0;
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len2; i+=2 )
+ {
+ register int x10, x32;
+ register int acc0, acc1;
+
+ x10 = ld32x(x,i);
+ x32 = ld32x(x,i+1);
+
+ x10 = dualasr(x10,1);
+ x32 = dualasr(x32,1);
+
+ acc0 = ifir16(x10,x10);
+ acc1 = ifir16(x32,x32);
+ sum += (acc0 + acc1) >> 6;
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ COMPUTERMS16_STOP();
+ return spx_sqrt(sum/len) << 4;
+ } else
+ {
+ register int sig_shift;
+ register int sum=0;
+
+ sig_shift = mux(max_val < 8192, 1, 0);
+ sig_shift = mux(max_val < 4096, 2, sig_shift);
+ sig_shift = mux(max_val < 2048, 3, sig_shift);
+
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len2 ; i+=2 )
+ {
+ register int x10, x32;
+ register int acc0, acc1;
+
+ x10 = ld32x(x,i);
+ x32 = ld32x(x,i+1);
+
+ x10 = dualasl(x10,sig_shift);
+ x32 = dualasl(x32,sig_shift);
+
+ acc0 = ifir16(x10,x10);
+ acc1 = ifir16(x32,x32);
+ sum += (acc0 + acc1) >> 6;
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ COMPUTERMS16_STOP();
+ return spx_sqrt(sum/len) << (3 - sig_shift);
+ }
+ }
+
+#else
+ {
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; ++i )
+ {
+ register int xi;
+
+ xi = x[i];
+ xi = iabs(xi);
+ max_val = imax(xi,max_val);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ if (max_val>16383)
+ {
+ register int sum = 0;
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; i+=4 )
+ {
+ register int x10, x32;
+ register int acc0, acc1;
+
+ x10 = pack16lsb(x[i+1],x[i]);
+ x32 = pack16lsb(x[i+3],x[i+2]);
+
+ x10 = dualasr(x10,1);
+ x32 = dualasr(x32,1);
+
+ acc0 = ifir16(x10,x10);
+ acc1 = ifir16(x32,x32);
+ sum += (acc0 + acc1) >> 6;
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ COMPUTERMS16_STOP();
+ return spx_sqrt(sum/len) << 4;
+ } else {
+ register int sig_shift;
+ register int sum=0;
+
+ sig_shift = mux(max_val < 8192, 1, 0);
+ sig_shift = mux(max_val < 4096, 2, sig_shift);
+ sig_shift = mux(max_val < 2048, 3, sig_shift);
+
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; i+=4 )
+ {
+ register int x10, x32;
+ register int acc0, acc1;
+
+ x10 = pack16lsb(x[i+1],x[i]);
+ x32 = pack16lsb(x[i+3],x[i+2]);
+
+ x10 = dualasl(x10,sig_shift);
+ x32 = dualasl(x32,sig_shift);
+
+ acc0 = ifir16(x10,x10);
+ acc1 = ifir16(x32,x32);
+ sum += (acc0 + acc1) >> 6;
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTERMS16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ COMPUTERMS16_STOP();
+ return spx_sqrt(sum/len) << (3 - sig_shift);
+ }
+ }
+#endif
+}
+
+int normalize16_9(const Int32* restrict x, Int16 * restrict y, Int32 max_scale)
+{
+ register int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+ register int max_val, m0, m1, m2, m3, m4;
+ register int sig_shift;
+ register int y10, y32, y54, y76;
+
+ TMDEBUG_ALIGNMEM(x);
+
+ x0 = ld32(x);
+ x1 = ld32x(x,1); x2 = ld32x(x,2); x3 = ld32x(x,3); x4 = ld32x(x,4);
+ x5 = ld32x(x,5); x6 = ld32x(x,6); x7 = ld32x(x,7); x8 = ld32x(x,8);
+
+ m0 = imax(iabs(x0), iabs(x1));
+ m1 = imax(iabs(x2), iabs(x3));
+ m2 = imax(iabs(x4), iabs(x5));
+ m3 = imax(iabs(x6), iabs(x7));
+ m4 = imax(m0, iabs(x8));
+ m1 = imax(m1, m2);
+ m3 = imax(m3, m4);
+ max_val = imax(1,imax(m1,m3));
+
+ sig_shift=0;
+ while (max_val>max_scale)
+ { sig_shift++;
+ max_val >>= 1;
+ }
+
+ if ( sig_shift != 0 )
+ {
+ y10 = pack16lsb(x1 >> sig_shift, x0 >> sig_shift);
+ y32 = pack16lsb(x3 >> sig_shift, x2 >> sig_shift);
+ y54 = pack16lsb(x5 >> sig_shift, x4 >> sig_shift);
+ y76 = pack16lsb(x7 >> sig_shift, x6 >> sig_shift);
+
+ y[8] = x8 >> sig_shift;
+ st32(y,y10);
+ st32d(4,y,y32);
+ st32d(8,y,y54);
+ st32d(12,y,y76);
+ }
+ return sig_shift;
+}
+
+int normalize16_mod8(const Int32 * restrict x, Int16 * restrict y, Int32 max_scale,int len)
+{
+ register int i, max_val, sig_shift;
+
+ TMDEBUG_ALIGNMEM(x);
+
+ max_val = 1;
+
+ for ( i=0 ; i<len ; i+=4 )
+ {
+ register int tmp0, tmp1, tmp2, tmp3;
+
+ tmp0 = ld32x(x,i);
+ tmp1 = ld32x(x,i+1);
+
+ tmp0 = iabs(tmp0);
+ tmp1 = iabs(tmp1);
+
+ tmp2 = ld32x(x,i+2);
+ tmp3 = ld32x(x,i+3);
+
+ tmp2 = iabs(tmp2);
+ tmp3 = iabs(tmp3);
+
+ tmp0 = imax(tmp0, tmp1);
+ max_val = imax(max_val, tmp0);
+ tmp2 = imax(tmp2, tmp3);
+ max_val = imax(max_val, tmp2);
+ }
+
+ sig_shift=0;
+ while (max_val>max_scale)
+ { sig_shift++;
+ max_val >>= 1;
+ }
+
+ if ( sig_shift != 0 )
+ {
+ for ( i=0 ; i<len ; i+=8, y+=8 )
+ {
+ register int x0, x1, x2, x3, x4, x5, x6, x7;
+ register int y10, y32, y54, y76;
+
+ x0 = ld32x(x,i); x1 = ld32x(x,i+1); x2 = ld32x(x,i+2); x3 = ld32x(x,i+3);
+ x4 = ld32x(x,i+4); x5 = ld32x(x,i+5); x6 = ld32x(x,i+6); x7 = ld32x(x,i+7);
+
+ y10 = pack16lsb(x1 >> sig_shift, x0 >> sig_shift);
+ y32 = pack16lsb(x3 >> sig_shift, x2 >> sig_shift);
+ y54 = pack16lsb(x5 >> sig_shift, x4 >> sig_shift);
+ y76 = pack16lsb(x7 >> sig_shift, x6 >> sig_shift);
+
+ st32(y,y10);
+ st32d(4,y,y32);
+ st32d(8,y,y54);
+ st32d(12,y,y76);
+ }
+ }
+ return sig_shift;
+}
+
+
+#define OVERRIDE_NORMALIZE16
+int normalize16(const Int32 *x, Int16 *y, Int32 max_scale, int len)
+{
+ TMDEBUG_ALIGNMEM(x);
+ TMDEBUG_ALIGNMEM(y);
+
+ NORMALIZE16_START();
+
+ if ( len == 9 )
+ { NORMALIZE16_STOP();
+ return normalize16_9(x,y,max_scale);
+ } else
+ { NORMALIZE16_STOP();
+ return normalize16_mod8(x,y,max_scale,len);
+ }
+}
+
+
+void filter_mem16_10(const Int16 *x, const Int16 *num, const Int16 *den, Int16 *y, int N, Int32 *mem)
+{
+ register int i;
+ register int c9, c8, c7, c6, c5;
+ register int c4, c3, c2, c1, c0;
+ register int input;
+ register int output_0, output_1, output_2, output_3, output_4;
+ register int output_5, output_6, output_7, output_8, output_9;
+ register Int16 xi, yi;
+
+ c9 = pack16lsb(-den[9],num[9]);
+ c8 = pack16lsb(-den[8],num[8]);
+ c7 = pack16lsb(-den[7],num[7]);
+ c6 = pack16lsb(-den[6],num[6]);
+ c5 = pack16lsb(-den[5],num[5]);
+ c4 = pack16lsb(-den[4],num[4]);
+ c3 = pack16lsb(-den[3],num[3]);
+ c2 = pack16lsb(-den[2],num[2]);
+ c1 = pack16lsb(-den[1],num[1]);
+ c0 = pack16lsb(-den[0],num[0]);
+
+ output_0 = mem[0];
+ output_1 = mem[1];
+ output_2 = mem[2];
+ output_3 = mem[3];
+ output_4 = mem[4];
+ output_5 = mem[5];
+ output_6 = mem[6];
+ output_7 = mem[7];
+ output_8 = mem[8];
+ output_9 = mem[9];
+
+#if (TM_UNROLL && TM_UNROLL_FILTER)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=0 ; i<N ; i++ )
+ {
+ xi = (int)(x[i]);
+ yi = iclipi(iadd(xi,PSHR32(output_0,LPC_SHIFT)),32767);
+
+ input = pack16lsb(yi,xi);
+ output_0= iadd(ifir16(c0,input),output_1);
+ output_1= iadd(ifir16(c1,input),output_2);
+ output_2= iadd(ifir16(c2,input),output_3);
+ output_3= iadd(ifir16(c3,input),output_4);
+ output_4= iadd(ifir16(c4,input),output_5);
+ output_5= iadd(ifir16(c5,input),output_6);
+ output_6= iadd(ifir16(c6,input),output_7);
+ output_7= iadd(ifir16(c7,input),output_8);
+ output_8= iadd(ifir16(c8,input),output_9);
+ output_9= ifir16(c9,input);
+
+ y[i] = yi;
+ }
+
+#if (TM_UNROLL && TM_UNROLL_FILTER)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ mem[0] = output_0;
+ mem[1] = output_1;
+ mem[2] = output_2;
+ mem[3] = output_3;
+ mem[4] = output_4;
+ mem[5] = output_5;
+ mem[6] = output_6;
+ mem[7] = output_7;
+ mem[8] = output_8;
+ mem[9] = output_9;
+}
+
+void filter_mem16_8(const Int16 *x, const Int16 *num, const Int16 *den, Int16 *y, int N, Int32 *mem)
+{
+ register int i;
+ register int c7, c6, c5, c4, c3, c2, c1, c0;
+ register int output_0, output_1, output_2, output_3, output_4, output_5, output_6, output_7;
+ register int input;
+ register Int16 xi, yi;
+
+ c7 = pack16lsb(-den[7],num[7]);
+ c6 = pack16lsb(-den[6],num[6]);
+ c5 = pack16lsb(-den[5],num[5]);
+ c4 = pack16lsb(-den[4],num[4]);
+ c3 = pack16lsb(-den[3],num[3]);
+ c2 = pack16lsb(-den[2],num[2]);
+ c1 = pack16lsb(-den[1],num[1]);
+ c0 = pack16lsb(-den[0],num[0]);
+
+ output_0 = mem[0];
+ output_1 = mem[1];
+ output_2 = mem[2];
+ output_3 = mem[3];
+ output_4 = mem[4];
+ output_5 = mem[5];
+ output_6 = mem[6];
+ output_7 = mem[7];
+
+#if (TM_UNROLL && TM_UNROLL_FILTER)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=0 ; i<N ; i++ )
+ {
+ xi = x[i];
+ yi = iclipi(iadd((int)(xi),PSHR32(output_0,LPC_SHIFT)),32767);
+
+ input = pack16lsb(yi,xi);
+ output_0= iadd(ifir16(c0,input),output_1);
+ output_1= iadd(ifir16(c1,input),output_2);
+ output_2= iadd(ifir16(c2,input),output_3);
+ output_3= iadd(ifir16(c3,input),output_4);
+ output_4= iadd(ifir16(c4,input),output_5);
+ output_5= iadd(ifir16(c5,input),output_6);
+ output_6= iadd(ifir16(c6,input),output_7);
+ output_7= ifir16(c7,input);
+
+ y[i] = yi;
+ }
+
+#if (TM_UNROLL && TM_UNROLL_FILTER)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+
+ mem[0] = output_0;
+ mem[1] = output_1;
+ mem[2] = output_2;
+ mem[3] = output_3;
+ mem[4] = output_4;
+ mem[5] = output_5;
+ mem[6] = output_6;
+ mem[7] = output_7;
+}
+
+#define OVERRIDE_FILTER_MEM16
+void filter_mem16(const Int16 *x, const Int16 *num, const Int16 *den, Int16 *y, int N, int ord, Int32 *mem, char *stack)
+{
+ TMDEBUG_ALIGNMEM(x);
+ TMDEBUG_ALIGNMEM(y);
+ TMDEBUG_ALIGNMEM(num);
+ TMDEBUG_ALIGNMEM(den);
+
+ FILTERMEM16_START();
+
+ if(ord==10)
+ filter_mem16_10(x, num, den, y, N, mem);
+ else if (ord==8)
+ filter_mem16_8(x, num, den, y, N, mem);
+
+#ifndef REMARK_ON
+ (void)stack;
+#endif
+
+ FILTERMEM16_STOP();
+}
+
+void iir_mem16_8(const Int16 *x, const Int16 *den, Int16 *y, int N, Int32 *mem)
+{
+ register int i;
+ register int c67, c45, c23, c01;
+ register int r1, r2, r3;
+ register int y10, y32, y54, y76, yi;
+
+ c67 = pack16lsb(-den[6],-den[7]);
+ c45 = pack16lsb(-den[4],-den[5]);
+ c23 = pack16lsb(-den[2],-den[3]);
+ c01 = pack16lsb(-den[0],-den[1]);
+
+ y10 = mem[0];
+ y32 = mem[1];
+ y54 = mem[2];
+ y76 = mem[3];
+
+#if (TM_UNROLL && TM_UNROLL_IIR)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=0 ; i < N ; ++i )
+ {
+ r2 = iadd(ifir16(y10,c67),ifir16(y32,c45));
+ r3 = iadd(ifir16(y54,c23),ifir16(y76,c01));
+ r1 = iadd(r2,r3);
+
+ y10 = funshift2(y32, y10);
+ y32 = funshift2(y54, y32);
+ y54 = funshift2(y76, y54);
+
+ yi = iclipi(iadd((int)(x[i]),PSHR32(r1,LPC_SHIFT)),32767);
+ y[i]= yi;
+ y76 = funshift2(yi, y76);
+ }
+
+#if (TM_UNROLL && TM_UNROLL_IIR)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ mem[0] = y10;
+ mem[1] = y32;
+ mem[2] = y54;
+ mem[3] = y76;
+
+}
+
+void iir_mem16_10(const Int16 *x, const Int16 *den, Int16 *y, int N, Int32 *mem)
+{
+ register int i;
+ register int c89, c67, c45, c23, c01;
+ register int r1, r2, r3, r4, r5;
+ register int y10, y32, y54, y76, y98, yi;
+
+ c89 = pack16lsb(-den[8],-den[9]);
+ c67 = pack16lsb(-den[6],-den[7]);
+ c45 = pack16lsb(-den[4],-den[5]);
+ c23 = pack16lsb(-den[2],-den[3]);
+ c01 = pack16lsb(-den[0],-den[1]);
+
+ y10 = mem[0];
+ y32 = mem[1];
+ y54 = mem[2];
+ y76 = mem[3];
+ y98 = mem[4];
+
+#if (TM_UNROLL && TM_UNROLL_IIR)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=0 ; i < N ; ++i )
+ {
+ r2 = iadd(ifir16(y10,c89),ifir16(y32,c67));
+ r3 = iadd(ifir16(y54,c45),ifir16(y76,c23));
+ r4 = ifir16(y98,c01);
+ r5 = iadd(r2,r3);
+ r1 = iadd(r4,r5);
+
+ y10 = funshift2(y32, y10);
+ y32 = funshift2(y54, y32);
+ y54 = funshift2(y76, y54);
+ y76 = funshift2(y98, y76);
+
+ yi = iclipi(iadd((int)(x[i]),PSHR32(r1,LPC_SHIFT)),32767);
+ y[i]= yi;
+ y98 = funshift2(yi, y98);
+ }
+
+#if (TM_UNROLL && TM_UNROLL_IIR)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ mem[0] = y10;
+ mem[1] = y32;
+ mem[2] = y54;
+ mem[3] = y76;
+ mem[4] = y98;
+}
+
+#define OVERRIDE_IIR_MEM16
+void iir_mem16(const Int16 *x, const Int16 *den, Int16 *y, int N, int ord, Int32 *mem, char *stack)
+{
+ TMDEBUG_ALIGNMEM(den);
+
+ IIRMEM16_START();
+
+ if(ord==10)
+ iir_mem16_10(x, den, y, N, mem);
+ else if (ord==8)
+ iir_mem16_8(x, den, y, N, mem);
+
+#ifndef REMARK_ON
+ (void)stack;
+#endif
+
+ IIRMEM16_STOP();
+}
+
+void fir_mem16_8(const Int16 *x, const Int16 *num, Int16 *y, int N, Int32 *mem)
+{
+ register int i, N_2;
+ register int c67, c45, c23, c01;
+ register int b0, b1, b2, b3;
+ register int r1, r2, r3;
+ register int x10, x32, x54, x76, xi;
+ register Int16 *a;
+
+ N_2 = N >> 1;
+
+ c67 = ld32x(num,3);
+ c45 = ld32x(num,2);
+ c23 = ld32x(num,1);
+ c01 = ld32x(num,0);
+
+ c67 = funshift2(c67,c67);
+ c45 = funshift2(c45,c45);
+ c23 = funshift2(c23,c23);
+ c01 = funshift2(c01,c01);
+
+ b3 = x76 = ld32x(x,N_2-1);
+ b2 = x54 = ld32x(x,N_2-2);
+ b1 = x32 = ld32x(x,N_2-3);
+ b0 = x10 = ld32x(x,N_2-4);
+
+#if (TM_UNROLL && TM_UNROLL_FILTER > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=N-1 ; i >= 8 ; --i )
+ {
+ xi = asri(16,x76);
+ x76 = funshift2(x76, x54);
+ x54 = funshift2(x54, x32);
+ x32 = funshift2(x32, x10);
+ x10 = pack16lsb(x10, (int)x[i-8]);
+
+ r2 = iadd(ifir16(x10,c67),ifir16(x32,c45));
+ r3 = iadd(ifir16(x54,c23),ifir16(x76,c01));
+ r1 = iadd(r2,r3);
+
+ y[i] = iclipi(iadd(xi,PSHR32(r1,LPC_SHIFT)),32767);
+ }
+ for ( i=7, a=(Int16*)mem ; i>=0 ; --i )
+ {
+ xi = asri(16,x76);
+ x76 = funshift2(x76, x54);
+ x54 = funshift2(x54, x32);
+ x32 = funshift2(x32, x10);
+ x10 = pack16lsb(x10, (int)a[i]);
+
+ r2 = iadd(ifir16(x10,c67),ifir16(x32,c45));
+ r3 = iadd(ifir16(x54,c23),ifir16(x76,c01));
+ r1 = iadd(r2,r3);
+
+ y[i] = iclipi(iadd(xi,PSHR32(r1,LPC_SHIFT)),32767);
+ }
+
+#if (TM_UNROLL && TM_UNROLL_FILTER > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ mem[0] = b0;
+ mem[1] = b1;
+ mem[2] = b2;
+ mem[3] = b3;
+}
+
+void fir_mem16_10(const Int16 *x, const Int16 *num, Int16 *y, int N, Int32 *mem)
+{
+ register int N_2, i;
+ register int c89, c67, c45, c23, c01;
+ register int b0, b1, b2, b3, b4;
+ register int r1, r2, r3, r4, r5;
+ register int x10, x32, x54, x76, x98, xi;
+ register Int16 *a;
+
+ N_2 = N >> 1;
+
+ c89 = ld32x(num,4);
+ c67 = ld32x(num,3);
+ c45 = ld32x(num,2);
+ c23 = ld32x(num,1);
+ c01 = ld32x(num,0);
+
+ c89 = funshift2(c89,c89);
+ c67 = funshift2(c67,c67);
+ c45 = funshift2(c45,c45);
+ c23 = funshift2(c23,c23);
+ c01 = funshift2(c01,c01);
+
+ b4 = x98 = ld32x(x,N_2-1);
+ b3 = x76 = ld32x(x,N_2-2);
+ b2 = x54 = ld32x(x,N_2-3);
+ b1 = x32 = ld32x(x,N_2-4);
+ b0 = x10 = ld32x(x,N_2-5);
+
+#if (TM_UNROLL && TM_UNROLL_FIR > 0)
+#pragma TCS_unroll=5
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=N-1 ; i >= 10 ; --i )
+ {
+ xi = asri(16,x98);
+ x98 = funshift2(x98, x76);
+ x76 = funshift2(x76, x54);
+ x54 = funshift2(x54, x32);
+ x32 = funshift2(x32, x10);
+ x10 = pack16lsb(x10, (int)(x[i-10]));
+
+ r2 = iadd(ifir16(x10,c89),ifir16(x32,c67));
+ r3 = iadd(ifir16(x54,c45),ifir16(x76,c23));
+ r4 = ifir16(x98,c01);
+ r5 = iadd(r2,r3);
+ r1 = iadd(r4,r5);
+
+ y[i] = iclipi(iadd(xi,PSHR32(r1,LPC_SHIFT)),32767);
+ }
+
+ for ( i=9,a =(Int16*)mem ; i>=0 ; --i )
+ {
+ xi = asri(16,x98);
+ x98 = funshift2(x98, x76);
+ x76 = funshift2(x76, x54);
+ x54 = funshift2(x54, x32);
+ x32 = funshift2(x32, x10);
+ x10 = pack16lsb(x10, (int)(a[i]));
+
+ r2 = iadd(ifir16(x10,c89),ifir16(x32,c67));
+ r3 = iadd(ifir16(x54,c45),ifir16(x76,c23));
+ r4 = ifir16(x98,c01);
+ r5 = iadd(r2,r3);
+ r1 = iadd(r4,r5);
+
+ y[i] = iclipi(iadd(xi,PSHR32(r1,LPC_SHIFT)),32767);
+ }
+
+#if (TM_UNROLL && TM_UNROLL_FIR > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ mem[0] = b0;
+ mem[1] = b1;
+ mem[2] = b2;
+ mem[3] = b3;
+ mem[4] = b4;
+
+
+}
+
+#define OVERRIDE_FIR_MEM16
+void fir_mem16(const spx_word16_t *x, const Int16 *num, spx_word16_t *y, int N, int ord, Int32 *mem, char *stack)
+{
+ TMDEBUG_ALIGNMEM(x);
+ TMDEBUG_ALIGNMEM(y);
+ TMDEBUG_ALIGNMEM(num);
+
+ FIRMEM16_START();
+
+ if(ord==10)
+ fir_mem16_10(x, num, y, N, mem);
+ else if (ord==8)
+ fir_mem16_8(x, num, y, N, mem);
+
+#ifndef REMARK_ON
+ (void)stack;
+#endif
+
+ FIRMEM16_STOP();
+}
+
+
+
+#define OVERRIDE_SYN_PERCEP_ZERO16
+void syn_percep_zero16(const Int16 *xx, const Int16 *ak, const Int16 *awk1, const Int16 *awk2, Int16 *y, int N, int ord, char *stack)
+{
+ register int i,j;
+ VARDECL(Int32 *mem);
+ ALLOC(mem, ord, Int32);
+
+ TMDEBUG_ALIGNMEM(mem);
+
+ for ( i=0,j=0 ; i<ord ; ++i,j+=4 )
+ st32d(j,mem,0);
+ iir_mem16(xx, ak, y, N, ord, mem, stack);
+ for ( i=0,j=0 ; i<ord ; ++i,j+=4 )
+ st32d(j,mem,0);
+ filter_mem16(y, awk1, awk2, y, N, ord, mem, stack);
+}
+
+
+#define OVERRIDE_RESIDUE_PERCEP_ZERO16
+void residue_percep_zero16(const Int16 *xx, const Int16 *ak, const Int16 *awk1, const Int16 *awk2, Int16 *y, int N, int ord, char *stack)
+{
+ register int i,j;
+ VARDECL(Int32 *mem);
+ ALLOC(mem, ord, Int32);
+
+ TMDEBUG_ALIGNMEM(mem);
+
+ for ( i=0,j=0 ; i<ord ; ++i,j+=4 )
+ st32d(j,mem,0);
+ filter_mem16(xx, ak, awk1, y, N, ord, mem, stack);
+ for ( i=0,j=0 ; i<ord ; ++i,j+=4 )
+ st32d(j,mem,0);
+ fir_mem16(y, awk2, y, N, ord, mem, stack);
+}
+
+
+
+void compute_impulse_response_10(const Int16 *ak, const Int16 *awk1, const Int16 *awk2, Int16 *y, int N)
+{
+ register int awk_01, awk_23, awk_45, awk_67, awk_89;
+ register int y10, y32, y54, y76, y98, yi;
+ register int i, acc0, acc1, N_2;
+
+ N_2 = N << 1;
+
+ awk_01 = ld32(awk1);
+ awk_23 = ld32x(awk1,1);
+ awk_45 = ld32x(awk1,2);
+ awk_67 = ld32x(awk1,3);
+ awk_89 = ld32x(awk1,4);
+
+ y10 = funshift2(awk_01, LPC_SCALING << 16);
+ st32d(0, y, y10);
+ y32 = funshift2(awk_23, awk_01);
+ st32d(4, y, y32);
+ y54 = funshift2(awk_45, awk_23);
+ st32d(8, y, y54);
+ y76 = funshift2(awk_67, awk_45);
+ st32d(12, y, y76);
+ y98 = funshift2(awk_89, awk_67);
+ st32d(16, y, y98);
+ y10 = funshift2(0, awk_89);
+ st32d(20, y, y10);
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=24 ; i<N_2 ; i+=4 )
+ { st32d(i, y, 0);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ y10 = y32 = y54 = y76 = y98 = 0;
+ awk_01 = ld32(awk2);
+ awk_23 = ld32x(awk2,1);
+ awk_45 = ld32x(awk2,2);
+ awk_67 = ld32x(awk2,3);
+ awk_89 = ld32x(awk2,4);
+
+ awk_01 = funshift2(awk_01, awk_01);
+ awk_23 = funshift2(awk_23, awk_23);
+ awk_45 = funshift2(awk_45, awk_45);
+ awk_67 = funshift2(awk_67, awk_67);
+ awk_89 = funshift2(awk_89, awk_89);
+
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ {
+ yi = y[i];
+
+ acc0 = ifir16(y10, awk_89) + ifir16(y32, awk_67);
+ acc1 = ifir16(y54, awk_45) + ifir16(y76, awk_23);
+ yi += PSHR32(acc0 + acc1 + ifir16(y98, awk_01),LPC_SHIFT);
+ y[i] = yi;
+
+ y10 = funshift2(y32, y10);
+ y32 = funshift2(y54, y32);
+ y54 = funshift2(y76, y54);
+ y76 = funshift2(y98, y76);
+ y98 = funshift2(ineg(yi), y98);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ y10 = y32 = y54 = y76 = y98 = 0;
+ awk_01 = ld32(ak);
+ awk_23 = ld32x(ak,1);
+ awk_45 = ld32x(ak,2);
+ awk_67 = ld32x(ak,3);
+ awk_89 = ld32x(ak,4);
+ awk_01 = funshift2(awk_01, awk_01);
+ awk_23 = funshift2(awk_23, awk_23);
+ awk_45 = funshift2(awk_45, awk_45);
+ awk_67 = funshift2(awk_67, awk_67);
+ awk_89 = funshift2(awk_89, awk_89);
+
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ {
+ yi = y[i];
+
+ acc0 = ifir16(y10, awk_89) + ifir16(y32, awk_67);
+ acc1 = ifir16(y54, awk_45) + ifir16(y76, awk_23);
+ yi = PSHR32(SHL32(yi,LPC_SHIFT+1) + acc0 + acc1 + ifir16(y98, awk_01),LPC_SHIFT);
+ y[i] = yi;
+
+ y10 = funshift2(y32, y10);
+ y32 = funshift2(y54, y32);
+ y54 = funshift2(y76, y54);
+ y76 = funshift2(y98, y76);
+ y98 = funshift2(ineg(yi), y98);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+void compute_impulse_response_8(const Int16 *ak, const Int16 *awk1, const Int16 *awk2, Int16 *y, int N)
+{
+ register int awk_01, awk_23, awk_45, awk_67;
+ register int y10, y32, y54, y76, yi;
+ register int i, acc0, acc1, N_2;
+
+ N_2 = N << 1;
+
+ awk_01 = ld32(awk1);
+ awk_23 = ld32x(awk1,1);
+ awk_45 = ld32x(awk1,2);
+ awk_67 = ld32x(awk1,3);
+
+ y10 = funshift2(awk_01, LPC_SCALING << 16);
+ st32d(0, y, y10);
+ y32 = funshift2(awk_23, awk_01);
+ st32d(4, y, y32);
+ y54 = funshift2(awk_45, awk_23);
+ st32d(8, y, y54);
+ y76 = funshift2(awk_67, awk_45);
+ st32d(12, y, y76);
+ y10 = funshift2(0, awk_67);
+ st32d(16, y, y10);
+ st32d(20, y, 0);
+
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=24 ; i<N_2 ; i+=4 )
+ { st32d(i, y, 0);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ y10 = y32 = y54 = y76 = 0;
+ awk_01 = ld32(awk2);
+ awk_23 = ld32x(awk2,1);
+ awk_45 = ld32x(awk2,2);
+ awk_67 = ld32x(awk2,3);
+
+ awk_01 = funshift2(awk_01, awk_01);
+ awk_23 = funshift2(awk_23, awk_23);
+ awk_45 = funshift2(awk_45, awk_45);
+ awk_67 = funshift2(awk_67, awk_67);
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ {
+ yi = y[i];
+
+ acc0 = ifir16(y10, awk_67) + ifir16(y32, awk_45);
+ acc1 = ifir16(y54, awk_23) + ifir16(y76, awk_01);
+ yi += PSHR32(acc0 + acc1,LPC_SHIFT);
+ y[i] = yi;
+
+ y10 = funshift2(y32, y10);
+ y32 = funshift2(y54, y32);
+ y54 = funshift2(y76, y54);
+ y76 = funshift2(ineg(yi), y76);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ y10 = y32 = y54 = y76 = 0;
+ awk_01 = ld32(ak);
+ awk_23 = ld32x(ak,1);
+ awk_45 = ld32x(ak,2);
+ awk_67 = ld32x(ak,3);
+ awk_01 = funshift2(awk_01, awk_01);
+ awk_23 = funshift2(awk_23, awk_23);
+ awk_45 = funshift2(awk_45, awk_45);
+ awk_67 = funshift2(awk_67, awk_67);
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ {
+ yi = y[i];
+
+ acc0 = ifir16(y10, awk_67) + ifir16(y32, awk_45);
+ acc1 = ifir16(y54, awk_23) + ifir16(y76, awk_01);
+ yi = PSHR32(SHL32(yi,LPC_SHIFT+1) + acc0 + acc1,LPC_SHIFT);
+ y[i] = yi;
+
+ y10 = funshift2(y32, y10);
+ y32 = funshift2(y54, y32);
+ y54 = funshift2(y76, y54);
+ y76 = funshift2(ineg(yi), y76);
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTEIMPULSERESPONSE > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+
+#define OVERRIDE_COMPUTE_IMPULSE_RESPONSE
+void compute_impulse_response(const Int16 *ak, const Int16 *awk1, const Int16 *awk2, Int16 *y, int N, int ord, char *stack)
+{
+ TMDEBUG_ALIGNMEM(ak);
+ TMDEBUG_ALIGNMEM(awk1);
+ TMDEBUG_ALIGNMEM(awk2);
+ TMDEBUG_ALIGNMEM(y);
+
+ COMPUTEIMPULSERESPONSE_START();
+ if ( ord == 10 )
+ compute_impulse_response_10(ak,awk1,awk2,y,N);
+ else
+ compute_impulse_response_8(ak,awk1,awk2,y,N);
+
+ (void)stack;
+
+ COMPUTEIMPULSERESPONSE_STOP();
+}
+
+
+#define OVERRIDE_QMFSYNTH
+void qmf_synth(const Int16 *x1, const Int16 *x2, const Int16 *a, Int16 *y, int N, int M, Int32 *mem1, Int32 *mem2, char *stack)
+ /* assumptions:
+ all odd x[i] are zero -- well, actually they are left out of the array now
+ N and M are multiples of 4 */
+{
+ register int i, j;
+ register int M2, N2;
+ VARDECL(int *x12);
+ M2 = M>>1;
+ N2 = N>>1;
+ ALLOC(x12, M2+N2, int);
+
+
+ TMDEBUG_ALIGNMEM(a);
+ TMDEBUG_ALIGNMEM(x12);
+ TMDEBUG_ALIGNMEM(mem1);
+ TMDEBUG_ALIGNMEM(mem2);
+
+ QMFSYNTH_START();
+
+#if (TM_UNROLL && TM_UNROLL_QMFSYNTH > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N2 ; ++i )
+ { register int index = N2-1-i;
+ x12[i] = pack16lsb(x1[index],x2[index]);
+ }
+
+ for ( j= 0; j<M2 ; ++j)
+ { register int index = (j << 1) + 1;
+ x12[N2+j] = pack16lsb(mem1[index],mem2[index]);
+ }
+#if (TM_UNROLL && TM_UNROLL_QMFSYNTH > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ for (i = 0; i < N2; i += 2)
+ {
+ register int y0, y1, y2, y3;
+ register int x12_0;
+
+ y0 = y1 = y2 = y3 = 0;
+ x12_0 = x12[N2-2-i];
+
+ for (j = 0; j < M2; j += 2)
+ {
+ register int x12_1;
+ register int a10, a11, a0_0;
+ register int _a10, _a11, _a0_0;
+
+ a10 = ld32x(a,j);
+ a11 = pack16msb(a10,a10);
+ a0_0= pack16lsb(a10,ineg(sex16(a10)));
+ x12_1 = x12[N2-1+j-i];
+
+ y0 += ifir16(a0_0,x12_1);
+ y1 += ifir16(a11, x12_1);
+ y2 += ifir16(a0_0,x12_0);
+ y3 += ifir16(a11 ,x12_0);
+
+
+ _a10 = ld32x(a,j+1);
+ _a11 = pack16msb(_a10,_a10);
+ _a0_0 = pack16lsb(_a10,ineg(sex16(_a10)));
+ x12_0 = x12[N2+j-i];
+
+ y0 += ifir16(_a0_0,x12_0);
+ y1 += ifir16(_a11, x12_0);
+ y2 += ifir16(_a0_0,x12_1);
+ y3 += ifir16(_a11 ,x12_1);
+
+ }
+ y[2*i] = EXTRACT16(SATURATE32(PSHR32(y0,15),32767));
+ y[2*i+1] = EXTRACT16(SATURATE32(PSHR32(y1,15),32767));
+ y[2*i+2] = EXTRACT16(SATURATE32(PSHR32(y2,15),32767));
+ y[2*i+3] = EXTRACT16(SATURATE32(PSHR32(y3,15),32767));
+ }
+
+#if (TM_UNROLL && TM_UNROLL_QMFSYNTH > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i = 0; i < M2; ++i)
+ { mem1[2*i+1] = asri(16,x12[i]);
+ mem2[2*i+1] = sex16(x12[i]);
+ }
+#if (TM_UNROLL && TM_UNROLL_QMFSYNTH > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ QMFSYNTH_STOP();
+}
+
+
+#define OVERRIDE_QMFDECOMP
+void qmf_decomp(const Int16 *xx, const Int16 *aa, Int16 *y1, Int16 *y2, int N, int M, Int16 *mem, char *stack)
+{
+ VARDECL(int *_a);
+ VARDECL(int *_x);
+ register int i, j, k, MM, M2, N2;
+ register int _xx10, _mm10;
+ register int *_x2;
+
+ M2=M>>1;
+ N2=N>>1;
+ MM=(M-2)<<1;
+
+ ALLOC(_a, M2, int);
+ ALLOC(_x, N2+M2, int);
+ _x2 = _x + M2 - 1;
+
+ TMDEBUG_ALIGNMEM(xx);
+ TMDEBUG_ALIGNMEM(aa);
+ TMDEBUG_ALIGNMEM(y1);
+ TMDEBUG_ALIGNMEM(y2);
+ TMDEBUG_ALIGNMEM(mem);
+ TMDEBUG_ALIGNMEM(_a);
+ TMDEBUG_ALIGNMEM(_x);
+
+ QMFDECOMP_START();
+
+ _xx10 = ld32(xx);
+ _xx10 = dualasr(_xx10,1);
+ _mm10 = ld32(mem);
+ _x2[0] = pack16lsb(_xx10,_mm10);
+
+#if (TM_UNROLL && TM_UNROLL_QMFSYNTH > 0)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<M2 ; ++i )
+ { register int a10;
+
+ a10 = ld32x(aa,i);
+ a10 = funshift2(a10, a10);
+ _a[M2-i-1] = a10;
+ }
+
+ for ( j=1 ; j<N2 ; ++j )
+ { register int _xx32;
+
+ _xx32 = ld32x(xx,j);
+ _xx32 = dualasr(_xx32,1);
+ _x2[j] = funshift2(_xx32, _xx10);
+ _xx10 = _xx32;
+ }
+
+ for ( k=1 ; k<M2; ++k )
+ { register int _mm32;
+
+ _mm32 = ld32x(mem,k);
+ _mm10 = funshift2(_mm10,_mm10);
+ _x2[-k] = pack16lsb(_mm10,_mm32);
+ _mm10 = _mm32;
+ }
+
+
+ for ( i=N2-1,j=0 ; j<MM ; --i,j+=4 )
+ { register int _xx;
+
+ _xx = ld32x(xx,i);
+ _xx = dualasr(_xx,1);
+ _xx = funshift2(_xx,_xx);
+ st32d(j, mem, _xx);
+ }
+#if (TM_UNROLL && TM_UNROLL_QMFSYNTH > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ mem[M-2] = xx[N-M+1] >> 1;
+
+
+ M2 >>= 1;
+ for ( i=0 ; i<N2 ; ++i )
+ { register int y1k, y2k;
+
+ y1k = y2k = 0;
+
+ for ( j=0 ; j<M2 ; j+=2 )
+ { register int _aa, _acc0, _acc1;
+ register int __xx10, __mm10, __acc0, __acc1, __aa;
+ register int _tmp0, _tmp1, _tmp2, _tmp3;
+
+ _xx10 = ld32x(_x, i+j);
+ _mm10 = ld32x(_x2,i-j);
+ _aa = ld32x(_a, j);
+ _mm10 = funshift2(_mm10,_mm10);
+ _acc0 = dspidualadd(_xx10, _mm10);
+ _acc1 = dspidualsub(_xx10, _mm10);
+
+ __xx10 = ld32x(_x, i+j+1);
+ __mm10 = ld32x(_x2,i-j-1);
+ __aa = ld32x(_a, j+1);
+ __mm10 = funshift2(__mm10,__mm10);
+ __acc0 = dspidualadd(__xx10, __mm10);
+ __acc1 = dspidualsub(__xx10, __mm10);
+
+ y1k += ifir16(_aa, _acc0);
+ y1k += ifir16(__aa, __acc0);
+
+ _tmp0 = pack16lsb(_aa,__aa);
+ _tmp1 = pack16msb(_aa,__aa);
+ _tmp2 = pack16lsb(_acc1, __acc1);
+ _tmp3 = pack16msb(_acc1, __acc1);
+
+ y2k -= ifir16(_tmp0, _tmp2);
+ y2k += ifir16(_tmp1, _tmp3);
+
+ }
+
+ y1[i] = iclipi(PSHR32(y1k,15),32767);
+ y2[i] = iclipi(PSHR32(y2k,15),32767);
+ }
+
+ QMFDECOMP_STOP();
+}
+
+#endif
+
diff --git a/tmv/fixed_tm.h b/tmv/fixed_tm.h
new file mode 100644
index 0000000..c8fa968
--- /dev/null
+++ b/tmv/fixed_tm.h
@@ -0,0 +1,100 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file fixed_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef FIXED_TM_H
+#define FIXED_TM_H
+
+#include <ops/custom_defs.h>
+
+
+#undef SATURATE
+#undef SATURATE16
+#undef SATURATE32
+#define SATURATE(x,a) iclipi(x,a)
+#define SATURATE16(x,a) iclipi(x,a)
+#define SATURATE32(x,a) iclipi(x,a)
+
+#undef EXTEND32
+#define EXTEND32(x) sex16(x)
+
+#undef NEG16
+#undef NEG32
+#define NEG16(x) ineg((int)(x))
+#define NEG32(x) ineg(x)
+
+#undef ABS
+#undef ABS16
+#undef ABS32
+#define ABS(x) iabs(x)
+#define ABS32(x) iabs(x)
+#define ABS16(x) iabs((int)(x))
+
+#undef MIN16
+#undef MIN32
+#define MIN16(a,b) imin((int)(a),(int)(b))
+#define MIN32(a,b) imin(a,b)
+
+#undef MAX16
+#undef MAX32
+#define MAX16(a,b) imax((int)(a),(int)(b))
+#define MAX32(a,b) imax(a,b)
+
+#undef ADD16
+#undef SUB16
+#undef ADD32
+#undef SUB32
+#undef MULT16_16
+#undef MULT16_16_16
+
+#define ADD16(a,b) ((int)(a) + (int)(b))
+#define SUB16(a,b) ((int)(a) - (int)(b))
+#define ADD32(a,b) ((int)(a) + (int)(b))
+#define SUB32(a,b) ((int)(a) - (int)(b))
+#define MULT16_16_16(a,b) ((int)(a) * (int)(b))
+#define MULT16_16(a,b) ((int)(a) * (int)(b))
+
+#if TM_DEBUGMEM_ALIGNNMENT
+#include <stdio.h>
+#define TMDEBUG_ALIGNMEM(x) \
+ { if( ((int)(x) & (int)(0x00000003)) != 0 ) \
+ { printf("memory not align. file: %s, line: %d\n", __FILE__, __LINE__); \
+ } \
+ }
+
+#else
+#define TMDEBUG_ALIGNMEM(x)
+#endif
+
+#endif
+
diff --git a/tmv/kiss_fft_tm.h b/tmv/kiss_fft_tm.h
new file mode 100644
index 0000000..dce2072
--- /dev/null
+++ b/tmv/kiss_fft_tm.h
@@ -0,0 +1,599 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file kiss_fft_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "_kiss_fft_guts_tm.h"
+
+#ifdef TM_ASM
+
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
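+/* In the fixed-point butterflies below, each kiss_fft_cpx is handled as one
+   32-bit word holding the packed (imag, real) halfword pair, and the TM_MUL,
+   TM_ADD, TM_SUB and TM_SHR macros from _kiss_fft_guts_tm.h perform the
+   complex arithmetic directly on these packed values. */
+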
+#define OVERRIDE_KFBFLY2
+static void kf_bfly2(
+ kiss_fft_cpx *Fout,
+ const int fstride,
+ const kiss_fft_cfg st,
+ int m
+ )
+{
+ register int * restrict Fout2;
+ register int * restrict tw1 = (int*)st->twiddles;
+ register int i, j;
+ register int _inv = !st->inverse;
+
+ Fout2 = (int*)Fout + m;
+
+ for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride )
+ { register int tw_10, ff_10, f2_10;
+
+ ff_10 = ld32x(Fout, i);
+ f2_10 = ld32x(Fout2, i);
+ tw_10 = ld32(tw1);
+
+ if ( _inv )
+ { TM_SHR(f2_10, f2_10, 1);
+ TM_SHR(ff_10, ff_10, 1);
+ }
+
+ TM_MUL(tw_10, tw_10, f2_10);
+ TM_SUB(f2_10, ff_10, tw_10);
+ TM_ADD(ff_10, ff_10, tw_10);
+
+ st32d(j, Fout2, f2_10);
+ st32d(j, Fout, ff_10);
+ }
+}
+
+#define OVERRIDE_KFBFLY4
+static void kf_bfly4(
+ kiss_fft_cpx *Fout,
+ const int fstride,
+ const kiss_fft_cfg st,
+ const int m
+ )
+{
+ register int * restrict tw1;
+ register int * restrict tw2;
+ register int * restrict tw3;
+ register int * restrict Fout1;
+ register int * restrict Fout2;
+ register int * restrict Fout3;
+ register int i, j;
+ register int fstride2, fstride3;
+ register int _inv = !st->inverse;
+
+ tw3 = tw2 = tw1 = (int*)st->twiddles;
+ fstride2 = fstride << 1;
+ fstride3 = fstride * 3;
+
+ Fout1 = (int*)Fout + m;
+ Fout2 = (int*)Fout + (m << 1);
+ Fout3 = (int*)Fout + (m * 3);
+
+
+ for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3 )
+ { register int sc0, sc1, sc2, sc3, sc4, sc5;
+ register int ff0;
+
+ sc0 = ld32x(Fout1,i);
+ sc3 = ld32(tw1);
+ sc1 = ld32x(Fout2, i);
+ sc4 = ld32(tw2);
+ sc2 = ld32x(Fout3, i);
+ sc5 = ld32(tw3);
+ ff0 = ld32x(Fout,i);
+
+ if ( _inv )
+ {
+ TM_ADD(sc0, sc0, 0x00020002);
+ TM_ADD(sc1, sc1, 0x00020002);
+ TM_ADD(sc2, sc2, 0x00020002);
+ TM_ADD(ff0, ff0, 0x00020002);
+ TM_SHR(sc0, sc0, 2);
+ TM_SHR(sc1, sc1, 2);
+ TM_SHR(sc2, sc2, 2);
+ TM_SHR(ff0, ff0, 2);
+ }
+
+ TM_MUL(sc0, sc0, sc3);
+ TM_MUL(sc1, sc1, sc4);
+ TM_MUL(sc2, sc2, sc5);
+ TM_SUB(sc5, ff0, sc1);
+ TM_ADD(ff0, ff0, sc1);
+ TM_ADD(sc3, sc0, sc2);
+ TM_SUB(sc4, sc0, sc2);
+ TM_SUB(sc1, ff0, sc3);
+ TM_ADD(ff0, ff0, sc3);
+
+ st32d(j, Fout2, sc1);
+ st32d(j, Fout, ff0);
+
+ sc5 = funshift2(sc5, sc5);
+
+ if ( _inv )
+ { TM_ADD(ff0, sc5, sc4);
+ TM_SUB(sc1, sc5, sc4);
+ } else
+ { TM_ADD(sc1, sc5, sc4);
+ TM_SUB(ff0, sc5, sc4);
+ }
+
+ sc0 = funshift2(sc1, ff0);
+ sc2 = funshift2(ff0, sc1);
+
+ st32d(j, Fout1, sc0);
+ st32d(j, Fout3, sc2);
+ }
+}
+
+
+#define OVERRIDE_KFBFLY3
+static void kf_bfly3(
+ kiss_fft_cpx *Fout,
+ const int fstride,
+ const kiss_fft_cfg st,
+ int m
+ )
+{
+ register int * restrict tw1;
+ register int * restrict tw2;
+ register int * restrict Fout1;
+ register int * restrict Fout2;
+ register int epi;
+ register int i, j;
+ register int fstride2;
+ register int _inv = !st->inverse;
+
+ tw1 = tw2 = (int*)st->twiddles;
+ Fout1 = (int*)Fout + m;
+ Fout2 = (int*)Fout + (m << 1);
+ epi = tw1[fstride*m];
+ epi = pack16lsb(epi,epi);
+ fstride2 = fstride << 1;
+
+ for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2 )
+ { register int sc0, sc1, sc2, sc3, sc4, sc5;
+ register int ff0;
+
+ sc1 = ld32x(Fout1,i);
+ sc2 = ld32x(Fout2,i);
+ sc3 = ld32(tw1);
+ sc4 = ld32(tw2);
+ ff0 = ld32x(Fout,i);
+
+ if ( _inv )
+ {
+ TM_DIV(sc1, sc1, 3);
+ TM_DIV(sc2, sc2, 3);
+ TM_DIV(ff0, ff0, 3);
+ }
+
+ TM_MUL(sc1, sc1, sc3);
+ TM_MUL(sc2, sc2, sc4);
+ TM_ADD(sc3, sc1, sc2);
+ TM_SUB(sc0, sc1, sc2);
+ TM_SHR(sc4, sc3, 1);
+ TM_SUB(sc1, ff0, sc4);
+
+ sc0 = dspidualmul(sc0, epi);
+ sc0 = funshift2(sc0, sc0);
+
+ TM_ADD(ff0, ff0, sc3);
+ TM_ADD(sc4, sc1, sc0);
+ TM_SUB(sc5, sc1, sc0);
+
+ sc1 = funshift2(sc4, sc5);
+ sc2 = funshift2(sc5, sc4);
+ sc2 = funshift2(sc2, sc2);
+
+ st32d(j, Fout1, sc1);
+ st32d(j, Fout, ff0);
+ st32d(j, Fout2, sc2);
+ }
+}
+
+
+#define OVERRIDE_KFBFLY5
+static void kf_bfly5(
+ kiss_fft_cpx *Fout,
+ const int fstride,
+ const kiss_fft_cfg st,
+ int m
+ )
+{
+ register int * restrict tw1;
+ register int * restrict tw2;
+ register int * restrict tw3;
+ register int * restrict tw4;
+ register int * restrict Fout1;
+ register int * restrict Fout2;
+ register int * restrict Fout3;
+ register int * restrict Fout4;
+ register int fstride2, fstride3, fstride4;
+ register int i, j;
+ register int yab_msb, yab_lsb, yba_msb, yba_lsb;
+ register int _inv = !st->inverse;
+
+
+ Fout1=(int*)Fout+m;
+ Fout2=(int*)Fout+(m<<1);
+ Fout3=(int*)Fout+(3 *m);
+ Fout4=(int*)Fout+(m<<2);
+
+ tw1 = tw2 = tw3 = tw4 = (int*)st->twiddles;
+
+ i = tw1[fstride*m];
+ yab_lsb = tw1[fstride*(m<<1)];
+ yab_msb = pack16msb(i, yab_lsb);
+ yab_lsb = pack16lsb(i, yab_lsb);
+ yba_msb = funshift2(-sex16(yab_msb), yab_msb);
+ yba_lsb = funshift2(yab_lsb, yab_lsb);
+
+ fstride2 = fstride << 1;
+ fstride3 = fstride * 3;
+ fstride4 = fstride << 2;
+
+ for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3,tw4+=fstride4 )
+ { register int sc0, sc1, sc2, sc3, sc4, sc5, sc6;
+ register int sc7, sc8, sc9, sc10, sc11, sc12;
+ register int ff0, sc78_msb, sc78_lsb, sc90_msb, sc90_lsb;
+
+ sc0 = ld32x(Fout,i);
+ sc1 = ld32x(Fout1,i);
+ sc2 = ld32x(Fout2,i);
+ sc3 = ld32x(Fout3,i);
+ sc4 = ld32x(Fout4,i);
+ sc5 = ld32(tw1);
+ sc6 = ld32(tw2);
+ sc7 = ld32(tw3);
+ sc8 = ld32(tw4);
+
+ if ( _inv )
+ {
+ TM_DIV(sc0, sc0, 5);
+ TM_DIV(sc1, sc1, 5);
+ TM_DIV(sc2, sc2, 5);
+ TM_DIV(sc3, sc3, 5);
+ TM_DIV(sc4, sc4, 5);
+ }
+
+ ff0 = sc0;
+
+ TM_MUL(sc1, sc1, sc5);
+ TM_MUL(sc2, sc2, sc6);
+ TM_MUL(sc3, sc3, sc7);
+ TM_MUL(sc4, sc4, sc8);
+ TM_ADD(sc7, sc1, sc4);
+ TM_SUB(sc10,sc1, sc4);
+ TM_ADD(sc8, sc2, sc3);
+ TM_SUB(sc9, sc2, sc3);
+
+ TM_ADD(ff0, ff0, sc7);
+ TM_ADD(ff0, ff0, sc8);
+ st32d(j, Fout, ff0);
+
+ sc78_msb = pack16msb(sc7,sc8);
+ sc78_lsb = pack16lsb(sc7,sc8);
+ sc90_msb = pack16msb(sc10,sc9);
+ sc90_lsb = pack16lsb(sc10,sc9);
+
+ sc5 = pack16lsb( sround(ifir16(sc78_msb,yab_lsb)), sround(ifir16(sc78_lsb,yab_lsb)));
+ sc6 = pack16lsb(-sround(ifir16(sc90_lsb,yab_msb)), sround(ifir16(sc90_msb,yab_msb)));
+
+ TM_ADD(sc5, sc5, sc0);
+ TM_SUB(sc1, sc5, sc6);
+ TM_ADD(sc4, sc5, sc6);
+ st32d(j, Fout1, sc1);
+ st32d(j, Fout4, sc4);
+
+ sc11 = pack16lsb( sround(ifir16(sc78_msb,yba_lsb)), sround(ifir16(sc78_lsb,yba_lsb)));
+ sc12 = pack16lsb(-sround(ifir16(sc90_lsb,yba_msb)), sround(ifir16(sc90_msb,yba_msb)));
+
+ TM_ADD(sc11, sc11, sc0);
+ TM_ADD(sc2, sc11, sc12);
+ TM_SUB(sc3, sc11, sc12);
+ st32d(j, Fout2, sc2);
+ st32d(j, Fout3, sc3);
+
+ }
+}
+
+
+#define OVERRIDE_KF_BFLY_GENERIC
+static void kf_bfly_generic(
+ kiss_fft_cpx * restrict Fout,
+ const size_t fstride,
+ const kiss_fft_cfg st,
+ int m,
+ int p
+ )
+{
+ register int _inv = !st->inverse;
+ register int i, j, k, l;
+ register int * restrict twiddles = (int*)st->twiddles;
+ register int Norig = st->nfft;
+
+ CHECKBUF(scratchbuf,nscratchbuf,p);
+
+ for ( i=0; i<m; ++i )
+ { register int sc10;
+
+ for ( j=0,k=i ; j<p ; ++j,k+=m )
+ { register int f10;
+
+ f10 = ld32x(Fout,k);
+
+ if ( _inv )
+ { TM_DIV(f10, f10, p);
+ }
+
+ st32d(j<<2, scratchbuf, f10);
+ }
+
+ for ( j=0,k=i,sc10=ld32(scratchbuf) ; j<p ; ++j,k+=m )
+ {
+ register int twidx = 0;
+ register int f10;
+
+ for ( l=1,f10 = sc10 ; l<p ; ++l )
+ { register int tw, sc;
+
+ twidx += fstride * k;
+ if ( twidx>=Norig )
+ { twidx -= Norig;
+ }
+
+ sc = ld32x(scratchbuf,l);
+ tw = ld32x(twiddles,twidx);
+
+ TM_MUL(sc, sc, tw);
+ TM_ADD(f10, f10, sc);
+ }
+ st32d(k<<2, Fout, f10);
+ }
+ }
+}
+
+#else
+
+#define OVERRIDE_KFBFLY2
+static void kf_bfly2(
+ kiss_fft_cpx * Fout,
+ const size_t fstride,
+ const kiss_fft_cfg st,
+ int m
+ )
+{
+ register kiss_fft_cpx * restrict Fout2;
+ register kiss_fft_cpx * restrict tw1 = st->twiddles;
+
+ Fout2 = Fout + m;
+
+ do
+ {
+ register kiss_fft_cpx _fout2, _fout, t;
+
+ _fout2 = *Fout2;
+ _fout = *Fout;
+
+ C_MUL ( t, _fout2, *tw1);
+ C_SUB (_fout2, _fout, t);
+ C_ADD (_fout, _fout, t);
+
+ *Fout2 = _fout2;
+ *Fout = _fout;
+
+ tw1 += fstride;
+ ++Fout2;
+ ++Fout;
+
+ } while ( --m );
+}
+
+#define OVERRIDE_KFBFLY4
+static void kf_bfly4(
+ kiss_fft_cpx * Fout,
+ const int fstride,
+ const kiss_fft_cfg st,
+ int m
+ )
+{
+ register kiss_fft_cpx * restrict tw1,* restrict tw2,* restrict tw3;
+ register kiss_fft_cpx * restrict Fout1, * restrict Fout2, * restrict Fout3;
+ register int _inv = !st->inverse;
+
+ tw3 = tw2 = tw1 = st->twiddles;
+
+ Fout1 = Fout + m;
+ Fout2 = Fout + (m << 1);
+ Fout3 = Fout + (m * 3);
+
+ do {
+
+ register kiss_fft_cpx _fout;
+ register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5;
+
+ _fout = *Fout;
+
+ C_MUL( sc0,*Fout1, *tw1);
+ C_MUL( sc1,*Fout2, *tw2);
+ C_MUL( sc2,*Fout3, *tw3);
+ C_SUB( sc5, _fout, sc1);
+ C_ADD( _fout, _fout, sc1);
+ C_ADD( sc3, sc0, sc2);
+ C_SUB( sc4, sc0, sc2);
+ C_SUB(*Fout2, _fout, sc3);
+ C_ADD( *Fout, _fout, sc3);
+
+ tw1 += fstride;
+ tw2 += (fstride << 1);
+ tw3 += (fstride * 3);
+
+ if ( _inv )
+ {
+ Fout1->r = sc5.r + sc4.i;
+ Fout1->i = sc5.i - sc4.r;
+ Fout3->r = sc5.r - sc4.i;
+ Fout3->i = sc5.i + sc4.r;
+ }
+ else
+ { Fout1->r = sc5.r - sc4.i;
+ Fout1->i = sc5.i + sc4.r;
+ Fout3->r = sc5.r + sc4.i;
+ Fout3->i = sc5.i - sc4.r;
+ }
+
+
+ ++Fout; ++Fout1; ++Fout2; ++Fout3;
+
+ } while(--m);
+}
+
+#define OVERRIDE_KFBFLY3
+static void kf_bfly3(
+ kiss_fft_cpx * Fout,
+ const int fstride,
+ const kiss_fft_cfg st,
+ int m
+ )
+{
+ register kiss_fft_cpx * restrict Fout1, * restrict Fout2;
+ register kiss_fft_cpx * restrict tw1,* restrict tw2;
+ register float epi;
+
+ tw1 = tw2 = st->twiddles;
+ epi = st->twiddles[fstride*m].i;
+ Fout1 = Fout + m;
+ Fout2 = Fout + (m << 1);
+
+ do {
+
+ register kiss_fft_cpx _fout;
+ register kiss_fft_cpx sc0, sc1, sc2, sc3;
+
+ _fout = *Fout;
+
+ C_MUL( sc1, *Fout1, *tw1);
+ C_MUL( sc2, *Fout2, *tw2);
+ C_ADD( sc3, sc1, sc2);
+ C_SUB( sc0, sc1, sc2);
+ tw1 += fstride;
+ tw2 += (fstride << 1);
+
+ sc1.r = _fout.r - HALF_OF(sc3.r);
+ sc1.i = _fout.i - HALF_OF(sc3.i);
+
+ C_MULBYSCALAR(sc0, epi);
+ C_ADD(*Fout, _fout, sc3);
+
+ Fout2->r = sc1.r + sc0.i;
+ Fout2->i = sc1.i - sc0.r;
+
+		Fout1->r = sc1.r - sc0.i;
+		Fout1->i = sc1.i + sc0.r;
+
+ ++Fout; ++Fout1; ++Fout2;
+
+ } while(--m);
+}
+
+#define OVERRIDE_KFBFLY5
+static void kf_bfly5(
+ kiss_fft_cpx * Fout,
+ const size_t fstride,
+ const kiss_fft_cfg st,
+ int m
+ )
+{
+ register kiss_fft_cpx * restrict Fout1,* restrict Fout2,* restrict Fout3,* restrict Fout4;
+ register int u;
+ register kiss_fft_cpx *tw;
+ register float yar, yai, ybr, ybi;
+
+ Fout1=Fout+m;
+ Fout2=Fout+(m<<1);
+ Fout3=Fout+(m*3);
+ Fout4=Fout+(m<<2);
+
+ tw = st->twiddles;
+ yar = tw[fstride*m].r;
+ yai = tw[fstride*m].i;
+ ybr = tw[fstride*2*m].r;
+ ybi = tw[fstride*2*m].i;
+
+ for ( u=0; u<m; ++u )
+ {
+ register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc10, sc11, sc12;
+
+ sc0 = *Fout;
+
+ C_MUL( sc1,*Fout1, tw[u*fstride]);
+ C_MUL( sc2,*Fout2, tw[2*u*fstride]);
+ C_MUL( sc3,*Fout3, tw[3*u*fstride]);
+ C_MUL( sc4,*Fout4, tw[4*u*fstride]);
+
+ C_ADD( sc7, sc1, sc4);
+ C_SUB( sc10, sc1, sc4);
+ C_ADD( sc8, sc2, sc3);
+ C_SUB( sc9, sc2, sc3);
+
+ Fout->r = sc0.r + sc7.r + sc8.r;
+ Fout->i = sc0.i + sc7.i + sc8.i;
+
+ sc5.r = sc0.r + S_MUL(sc7.r,yar) + S_MUL(sc8.r,ybr);
+ sc5.i = sc0.i + S_MUL(sc7.i,yar) + S_MUL(sc8.i,ybr);
+
+ sc6.r = S_MUL(sc10.i,yai) + S_MUL(sc9.i,ybi);
+ sc6.i = -S_MUL(sc10.r,yai) - S_MUL(sc9.r,ybi);
+
+ C_SUB(*Fout1,sc5,sc6);
+ C_ADD(*Fout4,sc5,sc6);
+
+ sc11.r = sc0.r + S_MUL(sc7.r,ybr) + S_MUL(sc8.r,yar);
+ sc11.i = sc0.i + S_MUL(sc7.i,ybr) + S_MUL(sc8.i,yar);
+ sc12.r = - S_MUL(sc10.i,ybi) + S_MUL(sc9.i,yai);
+ sc12.i = S_MUL(sc10.r,ybi) - S_MUL(sc9.r,yai);
+ C_ADD(*Fout2,sc11,sc12);
+ C_SUB(*Fout3,sc11,sc12);
+
+ ++Fout1; ++Fout2; ++Fout3; ++Fout4;
+ }
+}
+
+
+#endif
+
+#endif
diff --git a/tmv/kiss_fftr_tm.h b/tmv/kiss_fftr_tm.h
new file mode 100644
index 0000000..5b2b2cf
--- /dev/null
+++ b/tmv/kiss_fftr_tm.h
@@ -0,0 +1,235 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file kiss_fftr_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "_kiss_fft_guts_tm.h"
+
+#ifdef TM_ASM
+
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define TM_NDIV(res,c,frac) \
+ { register int c1, c0; \
+ \
+ c1 = -asri(16,(c)); \
+ c0 = sex16((c)); \
+ (res) = pack16lsb(sround(c1 * (32767/(frac))), sround(c0 * (32767/(frac))));\
+ }
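+/* TM_NDIV conjugates the packed complex value c (negating the imaginary high
+   halfword) while scaling both halves by 1/frac, mirroring the float code's
+   fpnk.r = ...r, fpnk.i = -...i followed by HALF_OF. */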
+
+
+#define OVERRIDE_KISS_FFTR
+void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar * restrict timedata, kiss_fft_cpx * restrict freqdata)
+{
+ register int ncfft, ncfft2, k;
+ register int * restrict tmpbuf;
+ register int * restrict twiddles;
+
+ ncfft = st->substate->nfft;
+ ncfft2 = ncfft >> 1;
+ tmpbuf = (int*)st->tmpbuf;
+ twiddles = (int*)st->super_twiddles;
+
+ TMDEBUG_ALIGNMEM(timedata);
+ TMDEBUG_ALIGNMEM(freqdata);
+ TMDEBUG_ALIGNMEM(tmpbuf);
+ TMDEBUG_ALIGNMEM(twiddles);
+
+ kiss_fft(st->substate , (const kiss_fft_cpx*)timedata, st->tmpbuf);
+
+ {
+ register int tdcr, tdci;
+ tdcr = sround(st->tmpbuf[0].r * (32767/2));
+ tdci = sround(st->tmpbuf[0].i * (32767/2));
+
+ freqdata[0].r = tdcr + tdci;
+ freqdata[ncfft].r = tdcr - tdci;
+ freqdata[ncfft].i = freqdata[0].i = 0;
+ }
+
+ for ( k=1 ; k <= ncfft2 ; ++k )
+ {
+ register int fpk, fpnk, i, tw, f1k, f2k;
+ register int fq1, fq2;
+
+ i = ncfft-k;
+
+ fpk = ld32x(tmpbuf,k);
+ tw = ld32x(twiddles,k);
+ fpnk = ld32x(tmpbuf,i);
+
+ TM_DIV(fpk, fpk, 2);
+ TM_NDIV(fpnk,fpnk,2);
+
+ TM_ADD( f1k, fpk , fpnk );
+ TM_SUB( f2k, fpk , fpnk );
+ TM_MUL( tw , f2k, tw );
+ TM_ADD( fq1, f1k, tw );
+ TM_SHR( fq1, fq1, 1 );
+ TM_SUB( fq2, f1k, tw );
+ TM_NEGMSB( fq2, fq2 );
+ TM_SHR( fq2, fq2, 1 );
+
+
+ st32d( k<<2, freqdata, fq1 );
+ st32d( i<<2, freqdata, fq2 );
+ }
+}
+
+#define OVERRIDE_KISS_FFTRI
+void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx * restrict freqdata,kiss_fft_scalar * restrict timedata)
+{
+ register int k, ncfft, ncfft2;
+ register int * restrict tmpbuf;
+ register int * restrict twiddles;
+
+ ncfft = st->substate->nfft;
+ ncfft2 = ncfft >> 1;
+ tmpbuf = (int*)st->tmpbuf;
+ twiddles = (int*)st->super_twiddles;
+
+ TMDEBUG_ALIGNMEM(freqdata);
+ TMDEBUG_ALIGNMEM(timedata);
+ TMDEBUG_ALIGNMEM(tmpbuf);
+ TMDEBUG_ALIGNMEM(twiddles);
+
+ {
+ register int fqr, fqnr;
+
+ fqr = freqdata[0].r;
+ fqnr = freqdata[ncfft].r;
+
+ st->tmpbuf[0].r = fqr + fqnr;
+ st->tmpbuf[0].i = fqr - fqnr;
+ }
+
+ for ( k=1 ; k <= ncfft2 ; ++k )
+ {
+ register int fk, fnkc, i, tw, fek, fok, tmp;
+ register int tbk, tbn;
+
+ i = ncfft-k;
+
+ fk = ld32x(freqdata,k);
+ tw = ld32x(twiddles,k);
+ fnkc = pack16lsb(-freqdata[i].i, freqdata[i].r);
+
+ TM_ADD (fek, fk, fnkc);
+ TM_SUB (tmp, fk, fnkc);
+ TM_MUL (fok, tmp, tw );
+ TM_ADD (tbk, fek, fok);
+ TM_SUB (tbn, fek, fok);
+ TM_NEGMSB(tbn, tbn);
+
+ st32d(k<<2, tmpbuf, tbk);
+ st32d(i<<2, tmpbuf, tbn);
+ }
+ kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
+}
+
+#else
+
+#define OVERRIDE_KISS_FFTR
+void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar * restrict timedata,kiss_fft_cpx * restrict freqdata)
+{
+ register kiss_fft_cpx fpnk, fpk, f1k, f2k, twk;
+ register int k, ncfft;
+ register kiss_fft_cpx * restrict tmpbuf, * restrict tw;
+ register float tdcr, tdci;
+
+ ncfft = st->substate->nfft;
+ tmpbuf= st->tmpbuf;
+ tw = st->super_twiddles;
+
+ kiss_fft( st->substate , (const kiss_fft_cpx*)timedata, tmpbuf );
+
+ tdcr = tmpbuf[0].r;
+ tdci = tmpbuf[0].i;
+
+ freqdata[0].r = tdcr + tdci;
+ freqdata[ncfft].r = tdcr - tdci;
+ freqdata[ncfft].i = freqdata[0].i = 0;
+
+ for ( k=1;k <= ncfft/2 ; ++k )
+ {
+ fpk = tmpbuf[k];
+ fpnk.r = tmpbuf[ncfft-k].r;
+ fpnk.i = -tmpbuf[ncfft-k].i;
+
+ C_ADD( f1k, fpk , fpnk );
+ C_SUB( f2k, fpk , fpnk );
+ C_MUL( twk, f2k , tw[k]);
+
+ freqdata[k].r = HALF_OF(f1k.r + twk.r);
+ freqdata[k].i = HALF_OF(f1k.i + twk.i);
+ freqdata[ncfft-k].r = HALF_OF(f1k.r - twk.r);
+ freqdata[ncfft-k].i = HALF_OF(twk.i - f1k.i);
+ }
+}
+
+#define OVERRIDE_KISS_FFTRI
+void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx * restrict freqdata,kiss_fft_scalar * restrict timedata)
+{
+ register int k, ncfft;
+ register kiss_fft_cpx * restrict tmpbuf, * restrict tw;
+
+
+ ncfft = st->substate->nfft;
+ tmpbuf= st->tmpbuf;
+ tw = st->super_twiddles;
+
+ tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r;
+ tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r;
+
+ for (k = 1; k <= ncfft / 2; ++k)
+ {
+ register kiss_fft_cpx fk, fnkc, fek, fok, tmp;
+ fk = freqdata[k];
+ fnkc.r = freqdata[ncfft - k].r;
+ fnkc.i = -freqdata[ncfft - k].i;
+
+ C_ADD (fek, fk, fnkc);
+ C_SUB (tmp, fk, fnkc);
+ C_MUL (fok,tmp,tw[k]);
+ C_ADD (tmpbuf[k],fek, fok);
+ C_SUB (tmp, fek, fok);
+ tmpbuf[ncfft - k].r = tmp.r;
+ tmpbuf[ncfft - k].i = -tmp.i;
+ }
+ kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
+}
+
+#endif
+#endif
+
diff --git a/tmv/lpc_tm.h b/tmv/lpc_tm.h
new file mode 100644
index 0000000..f5e9bcd
--- /dev/null
+++ b/tmv/lpc_tm.h
@@ -0,0 +1,150 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file lpc_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define OVERRIDE_SPEEX_AUTOCORR
+void _spx_autocorr(const Int16 *x, Int16 *ac, int lag, int n)
+{
+ register int i, j;
+ register int shift, ac_shift;
+ register int n_2;
+ register int ac0;
+
+ TMDEBUG_ALIGNMEM(x);
+ TMDEBUG_ALIGNMEM(ac);
+
+ _SPX_AUTOCORR_START();
+
+ n_2 = n >> 1;
+ ac0 = n + 1;
+
+#if (TM_UNROLL && TM_UNROLL__SPXAUTOCORR)
+#pragma TCS_unroll=5
+#pragma TCS_unrollexact=1
+#endif
+ for ( j=0 ; j<n_2 ; j+=4 )
+ { register int x10, x32, x54, x76;
+
+ x10 = ld32x(x,j);
+ x32 = ld32x(x,j+1);
+ x54 = ld32x(x,j+2);
+ x76 = ld32x(x,j+3);
+
+ ac0 += ifir16(x10, x10) >> 8;
+ ac0 += ifir16(x32, x32) >> 8;
+ ac0 += ifir16(x54, x54) >> 8;
+ ac0 += ifir16(x76, x76) >> 8;
+ }
+#if (TM_UNROLL && TM_UNROLL__SPXAUTOCORR)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ shift = 8;
+ while (shift && ac0<0x40000000)
+ { shift--;
+ ac0 <<= 1;
+ }
+
+ ac_shift = 18;
+ while (ac_shift && ac0<0x40000000)
+ { ac_shift--;
+ ac0 <<= 1;
+ }
+
+ if ( shift == 0 )
+ {
+ for ( i=0 ; i<lag ; ++i )
+ {
+ register int acc0, acc1, acc2;
+ register int k, l, m;
+ register int x10, x32, y10, y32;
+
+ acc2 = acc1 = acc0 = 0;
+
+ for ( j=i ; j<16 ; ++j )
+ { acc0 += (x[j] * x[j-i]);
+ }
+
+ for ( k=16,l=8,m=16-i ; k<n ; k+=4,l+=2,m+=4 )
+ {
+ x10 = ld32x(x,l);
+ y10 = pack16lsb(x[m+1],x[m]);
+ x32 = ld32x(x,l+1);
+ y32 = pack16lsb(x[m+3],x[m+2]);
+
+ acc1 += ifir16(x10,y10);
+ acc2 += ifir16(x32,y32);
+ }
+
+ ac[i] = (acc0 + acc1 + acc2) >> ac_shift;
+ }
+ } else
+ {
+ for ( i=0 ; i<lag ; ++i )
+ {
+ register int acc0, acc1, acc2;
+ register int k, l, m;
+ register int x10, x32, y10, y32;
+
+ acc2 = acc1 = acc0 = 0;
+
+ for ( j=i ; j<16 ; ++j )
+ { acc0 += (x[j] * x[j-i]) >> shift;
+ }
+
+ for ( k=16,l=8,m=16-i ; k<n ; k+=4,l+=2,m+=4 )
+ {
+ x10 = ld32x(x,l);
+ y10 = pack16lsb(x[m+1],x[m]);
+ x32 = ld32x(x,l+1);
+ y32 = pack16lsb(x[m+3],x[m+2]);
+
+ acc1 += ifir16(x10,y10) >> shift;
+ acc2 += ifir16(x32,y32) >> shift;
+ }
+
+ ac[i] = (acc0 + acc1 + acc2) >> ac_shift;
+ }
+ }
+
+ _SPX_AUTOCORR_STOP();
+}
+
+#endif
diff --git a/tmv/lsp_tm.h b/tmv/lsp_tm.h
new file mode 100644
index 0000000..acb0d91
--- /dev/null
+++ b/tmv/lsp_tm.h
@@ -0,0 +1,310 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file lsp_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+
+#ifdef FIXED_POINT
+
+#define OVERRIDE_LSP_INTERPOLATE
+void lsp_interpolate(Int16 *old_lsp, Int16 *new_lsp, Int16 *interp_lsp, int len, int subframe, int nb_subframes)
+{
+ register int tmp = DIV32_16(SHL32(EXTEND32(1 + subframe),14),nb_subframes);
+ register int tmp2 = 16384-tmp;
+ register int in_0, in_1, factor, out_1, out_2, olsp, nlsp, ilsp;
+ register int i;
+
+ TMDEBUG_ALIGNMEM(old_lsp);
+ TMDEBUG_ALIGNMEM(new_lsp);
+ TMDEBUG_ALIGNMEM(interp_lsp);
+
+ LSPINTERPOLATE_START();
+
+ factor = pack16lsb(tmp,tmp2);
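+	/* factor packs (tmp, 16384-tmp) so that one ifir16 per LSP computes
+	   tmp*new_lsp[i] + (16384-tmp)*old_lsp[i] in a single step */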
+
+ len >>= 1;
+ for ( i=0 ; i<len ; ++i )
+ {
+ olsp = ld32x(old_lsp,i); // olsp[i+1],olsp[i]
+ nlsp = ld32x(new_lsp,i); // nlsp[i+1],nlsp[i]
+
+ in_0 = pack16lsb(nlsp,olsp);
+ in_1 = pack16msb(nlsp,olsp);
+
+ out_1 = 8192 + ifir16(in_0,factor);
+ out_2 = 8192 + ifir16(in_1,factor);
+
+ ilsp = pack16lsb(out_2 >> 14, out_1 >> 14);
+ st32d(i << 2, interp_lsp, ilsp);
+
+ }
+
+ LSPINTERPOLATE_STOP();
+}
+
+
+#define OVERRIDE_CHEB_POLY_EVA
+static inline Int32 cheb_poly_eva(Int16 *coef, Int16 x, int m, char *stack)
+{
+ register int c10, c32, c54;
+ register int sum, b0, f0, f1, f2, f3;
+ register int xx, f32, f10;
+
+ CHEBPOLYEVA_START();
+
+ xx = sex16(x);
+ b0 = iclipi(xx,16383);
+
+#if 0
+ c10 = ld32(coef);
+ c32 = ld32x(coef,1);
+ c54 = ld32x(coef,2);
+#else
+ c10 = pack16lsb(coef[1],coef[0]);
+ c32 = pack16lsb(coef[3],coef[2]);
+ c54 = pack16lsb(coef[5],coef[4]);
+#endif
+
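+	/* Chebyshev recurrence T_{n+1}(x) = 2x*T_n(x) - T_{n-1}(x) in Q14:
+	   b0 ~ T_1(x), f0 = T_2(x), f1 = T_3(x), f2 = T_4(x) (and f3 = T_5(x)
+	   in the m != 4 branch) */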
+ f0 = ((xx * b0) >> 13) - 16384;
+ f1 = ((xx * f0) >> 13) - b0;
+ f2 = ((xx * f1) >> 13) - f0;
+
+ if ( m == 4 )
+ { sum = sex16(c54);
+ f32 = pack16lsb(xx,f0);
+ f10 = pack16lsb(f1,f2);
+
+ } else
+ { sum = asri(16,c54);
+ sum += ((sex16(c54) * xx) + 8192) >> 14;
+
+ f3 = ((xx * f2) >> 13) - f1;
+ f32 = pack16lsb(f0,f1);
+ f10 = pack16lsb(f2,f3);
+ }
+
+ sum += (ifir16(c32,f32) + 8192) >> 14;
+ sum += (ifir16(c10,f10) + 8192) >> 14;
+
+#ifndef REMARK_ON
+ (void)stack;
+#endif
+
+ CHEBPOLYEVA_STOP();
+ return sum;
+}
+
+
+#define OVERRIDE_LSP_ENFORCE_MARGIN
+void lsp_enforce_margin(Int16 *lsp, int len, Int16 margin)
+{
+ register int i;
+ register int m = margin;
+ register int m2 = 25736-margin;
+ register int lsp0, lsp1, lsp2;
+
+ TMDEBUG_ALIGNMEM(lsp);
+
+ LSPENFORCEMARGIN_START();
+
+ lsp0 = ld32(lsp);
+ lsp1 = asri(16,lsp0);
+ lsp0 = sex16(lsp0);
+ lsp2 = lsp[len-1];
+
+ if ( lsp0 < m )
+ { lsp0 = m;
+ lsp[0] = m;
+ }
+
+ if ( lsp2 > m2 )
+ { lsp2 = m2;
+ lsp[len-1] = m2;
+ }
+
+ for ( i=1 ; i<len-1 ; ++i )
+ {
+ lsp0 += m;
+ lsp2 = lsp[i+1];
+ m2 = lsp2 - m;
+
+ if ( lsp1 < lsp0 )
+ { lsp1 = lsp0;
+ lsp[i] = lsp0;
+ }
+
+ if ( lsp1 > m2 )
+ { lsp1 = (lsp1 >> 1) + (m2 >> 1);
+ lsp[i] = lsp1;
+ }
+
+ lsp0 = lsp1;
+ lsp1 = lsp2;
+ }
+
+ LSPENFORCEMARGIN_STOP();
+}
+
+
+#define OVERRIDE_LSP_TO_LPC
+void lsp_to_lpc(Int16 *freq, Int16 *ak,int lpcrdr, char *stack)
+{
+ VARDECL(Int16 *freqn);
+ VARDECL(int **xp);
+ VARDECL(int *xpmem);
+ VARDECL(int **xq);
+ VARDECL(int *xqmem);
+
+ register int i, j, k;
+ register int xout1,xout2,xin;
+ register int m;
+
+ LSPTOLPC_START();
+
+ m = lpcrdr>>1;
+
+ /*
+
+ Reconstruct P(z) and Q(z) by cascading second order polynomials
+	of the form 1 - 2cos(w)z^(-1) + z^(-2), where w is the LSP frequency.
+ In the time domain this is:
+
+ y(n) = x(n) - 2cos(w)x(n-1) + x(n-2)
+
+ This is what the ALLOCS below are trying to do:
+
+ int xp[m+1][lpcrdr+1+2]; // P matrix in QIMP
+ int xq[m+1][lpcrdr+1+2]; // Q matrix in QIMP
+
+ These matrices store the output of each stage on each row. The
+ final (m-th) row has the output of the final (m-th) cascaded
+ 2nd order filter. The first row is the impulse input to the
+ system (not written as it is known).
+
+ The version below takes advantage of the fact that a lot of the
+	outputs are zero or known, for example if we put an impulse
+	into the first section and then "clock" it 10 times, only the first 3
+	output samples are non-zero (it's an FIR filter).
+ */
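+ /* The direct-form coefficients are then recovered from A(z) = (P(z)+Q(z))/2,
+    with the 1/2 folded into the 0.5 impulse (xin); the final loop applies the
+    (1 + z^-1) factor to the P outputs and (1 - z^-1) to the Q outputs as it
+    combines them. */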
+
+ ALLOC(xp, (m+1), int*);
+ ALLOC(xpmem, (m+1)*(lpcrdr+1+2), int);
+
+ ALLOC(xq, (m+1), int*);
+ ALLOC(xqmem, (m+1)*(lpcrdr+1+2), int);
+
+ for ( i=0; i<=m; i++ )
+ { xp[i] = xpmem + i*(lpcrdr+1+2);
+ xq[i] = xqmem + i*(lpcrdr+1+2);
+ }
+
+ /* work out 2cos terms in Q14 */
+
+ ALLOC(freqn, lpcrdr, Int16);
+ for ( j=0,k=0 ; j<m ; ++j,k+=2 )
+ { register int f;
+
+ f = ld32x(freq,j);
+
+ freqn[k] = ANGLE2X(sex16(f));
+ freqn[k+1] = ANGLE2X(asri(16,f));
+ }
+
+ #define QIMP 21 /* scaling for impulse */
+ xin = SHL32(EXTEND32(1), (QIMP-1)); /* 0.5 in QIMP format */
+
+ /* first col and last non-zero values of each row are trivial */
+
+ for(i=0;i<=m;i++)
+ { xp[i][1] = 0;
+ xp[i][2] = xin;
+ xp[i][2+2*i] = xin;
+ xq[i][1] = 0;
+ xq[i][2] = xin;
+ xq[i][2+2*i] = xin;
+ }
+
+ /* 2nd row (first output row) is trivial */
+
+ xp[1][3] = -MULT16_32_Q14(freqn[0],xp[0][2]);
+ xq[1][3] = -MULT16_32_Q14(freqn[1],xq[0][2]);
+
+ xout1 = xout2 = 0;
+
+ for( i=1 ; i<m ; ++i)
+ { register int f, f0, f1, m0, m1;
+
+ k = 2*(i+1)-1;
+ f = ld32x(freqn,i);
+ f0 = sex16(f);
+ f1 = asri(16,f);
+
+ for( j=1 ; j<k ; ++j)
+ { register int _m0, _m1;
+
+ _m0 = MULT16_32_Q14(f0,xp[i][j+1]);
+ xp[i+1][j+2] = ADD32(SUB32(xp[i][j+2], _m0), xp[i][j]);
+
+ _m1 = MULT16_32_Q14(f1,xq[i][j+1]);
+ xq[i+1][j+2] = ADD32(SUB32(xq[i][j+2], _m1), xq[i][j]);
+ }
+
+
+ m0 = MULT16_32_Q14(f0,xp[i][j+1]);
+ xp[i+1][j+2] = SUB32(xp[i][j], m0);
+ m1 = MULT16_32_Q14(f1,xq[i][j+1]);
+ xq[i+1][j+2] = SUB32(xq[i][j], m1);
+ }
+
+
+ for( i=0,j=3 ; i<lpcrdr ; ++j,++i )
+ { register int _a0, _xp0, _xq0;
+
+ _xp0 = xp[m][j];
+ _xq0 = xq[m][j];
+
+ _a0 = PSHR32(_xp0 + xout1 + _xq0 - xout2, QIMP-13);
+ xout1 = _xp0;
+ xout2 = _xq0;
+
+ ak[i] = iclipi(_a0,32767);
+ }
+
+ LSPTOLPC_STOP();
+}
+
+
+#endif
diff --git a/tmv/ltp_tm.h b/tmv/ltp_tm.h
new file mode 100644
index 0000000..eccd701
--- /dev/null
+++ b/tmv/ltp_tm.h
@@ -0,0 +1,479 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file ltp_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define OVERRIDE_INNER_PROD
+Int32 inner_prod(const Int16 * restrict x, const Int16 * restrict y, int len)
+{
+ register int sum = 0;
+
+ INNERPROD_START();
+
+	/* word-aligned fast path: 32-bit loads, eight samples per iteration */
+	if ( (((int)x & 0x03) == 0) && (((int)y & 0x03) == 0) )
+ {
+ register int i;
+
+ len >>= 1;
+ for ( i=0 ; i<len ; i+=4 )
+ {
+ register int x0, x1, y0, y1, x2, x3, y2, y3;
+
+			x0 = ld32x(x,i);
+			y0 = ld32x(y,i);
+			x1 = ld32x(x,i+1);
+			y1 = ld32x(y,i+1);
+			sum += (ifir16(x0,y0) + ifir16(x1,y1)) >> 6;
+
+			x2 = ld32x(x,i+2);
+			y2 = ld32x(y,i+2);
+			x3 = ld32x(x,i+3);
+			y3 = ld32x(y,i+3);
+			sum += (ifir16(x2,y2) + ifir16(x3,y3)) >> 6;
+
+ }
+ } else
+ {
+ len >>= 3;
+ while( len-- )
+ {
+ register int x0, x1, x2, x3, y0, y1, y2, y3;
+
+ x0 = pack16lsb(x[0],x[1]);
+ y0 = pack16lsb(y[0],y[1]);
+ x1 = pack16lsb(x[2],x[3]);
+ y1 = pack16lsb(y[2],y[3]);
+ sum += (ifir16(x0,y0) + ifir16(x1,y1)) >> 6;
+
+ x2 = pack16lsb(x[4],x[5]);
+ y2 = pack16lsb(y[4],y[5]);
+ x3 = pack16lsb(x[6],x[7]);
+ y3 = pack16lsb(y[6],y[7]);
+ sum += (ifir16(x2,y2) + ifir16(x3,y3)) >> 6;
+
+ x += 8;
+ y += 8;
+ }
+ }
+
+ INNERPROD_STOP();
+ return sum;
+}
+
+#define OVERRIDE_PITCH_XCORR
+void pitch_xcorr(const Int16 *_x, const Int16 *_y, Int32 *corr, int len, int nb_pitch, char *stack)
+{
+ register int sum_1, sum_2, sum_3, sum_4;
+ register int y10, y32, y54, y76, y21, y43, y65;
+ register int x10, x32;
+ register int i, j, k, limit;
+
+ TMDEBUG_ALIGNMEM(_x);
+ TMDEBUG_ALIGNMEM(_y);
+
+ PITCHXCORR_START();
+
+ limit = nb_pitch >> 1;
+ len >>= 1;
+
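+	/* Each outer iteration produces four correlation lags; the funshift2 calls
+	   below build the halfword-shifted views of y needed for the two odd lags. */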
+ for (i=0 ; i<limit ; i+=2 )
+ {
+ sum_1 = sum_2 = sum_3 = sum_4 = 0;
+
+ y10 = ld32x(_y,i);
+ y32 = ld32x(_y,i+1);
+
+ for ( j=0 ; j<len ; j+=2 )
+ {
+ x10 = ld32x(_x,j);
+ x32 = ld32x(_x,j+1);
+ y54 = ld32x(_y,i+j+2);
+ y76 = ld32x(_y,i+j+3);
+
+ sum_1 += (ifir16(x10,y10) + ifir16(x32,y32)) >> 6;
+ sum_3 += (ifir16(x10,y32) + ifir16(x32,y54)) >> 6;
+
+ y21 = funshift2(y32,y10);
+ y43 = funshift2(y54,y32);
+ y65 = funshift2(y76,y54);
+
+ sum_2 += (ifir16(x10,y21) + ifir16(x32,y43)) >> 6;
+ sum_4 += (ifir16(x10,y43) + ifir16(x32,y65)) >> 6;
+
+ y10 = y54;
+ y32 = y76;
+
+ }
+
+ k = i << 1;
+ corr[nb_pitch-1-k]=sum_1;
+ corr[nb_pitch-2-k]=sum_2;
+ corr[nb_pitch-3-k]=sum_3;
+ corr[nb_pitch-4-k]=sum_4;
+ }
+
+#ifndef REMARK_ON
+ (void)stack;
+#endif
+
+ PITCHXCORR_STOP();
+}
+
+#ifndef ttisim
+#define OVERRIDE_PITCH_GAIN_SEARCH_3TAP_VQ
+static int pitch_gain_search_3tap_vq
+(
+ const signed char *gain_cdbk,
+ int gain_cdbk_size,
+ Int16 *C16,
+ Int16 max_gain
+)
+{
+ register int pp = 0x00400040, p=64;
+ register int g10, g2, g20, g21, g02, g22, g01;
+ register int cb0, cb1, cb2, cb5432;
+ register int C10, C32, C54, C76, C98, C83, C2;
+ register int acc0, acc1, acc2, acc3, sum, gsum, bsum=-VERY_LARGE32;
+ register int i, best_cdbk=0;
+ register Int16 tmp;
+
+ TMDEBUG_ALIGNMEM(C16);
+ TMDEBUG_ALIGNMEM(gain_cdbk+2);
+
+ PITCHGAINSEARCH3TAPVQ_START();
+
+ tmp = ild16(gain_cdbk);
+ C98 = ld32x(C16,4);
+ C32 = ld32x(C16,1);
+ C10 = ld32(C16);
+ C54 = ld32x(C16,2);
+ C76 = ld32x(C16,3);
+
+ cb0 = sex8(tmp);
+ cb1 = sex8(tmp>>8);
+ C83 = funshift2(C98,C32);
+ C2 = sex16(C32);
+ gain_cdbk += 2;
+
+
+#if (TM_UNROLL && TM_UNROLL_PITCHGAINSEARCH3TAPVQ > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<gain_cdbk_size ; ++i )
+ {
+ cb5432 = ld32x(gain_cdbk,i);
+ cb2 = sex8(cb5432);
+ gsum = sex8(cb5432>>8);
+ sum = 0;
+
+ g10 = pack16lsb(cb1 + 32, cb0 + 32);
+ g2 = cb2 + 32;
+ g02 = pack16lsb(g10, g2);
+ acc0 = dspidualmul(g10,pp);
+ sum += ifir16(acc0,C10);
+ sum += p * g2 * C2;
+
+ g22 = pack16lsb(g02, g02);
+ g01 = funshift2(g10, g10);
+
+ acc1 = dspidualmul(g22, g01);
+ sum -= ifir16(acc1, C54);
+ acc2 = dspidualmul(g10, g10);
+ sum -= ifir16(acc2, C76);
+
+ g20 = pack16lsb(g2, g10);
+ g21 = funshift2(g2, g10);
+ acc3 = dspidualmul(g20, g21);
+ sum -= ifir16(acc3, C83);
+
+
+ if ( sum>bsum && gsum<=max_gain )
+ { bsum = sum;
+ best_cdbk=i;
+ }
+
+ cb0 = sex8(cb5432 >> 16);
+ cb1 = asri(24,cb5432);
+ }
+#if (TM_UNROLL && TM_UNROLL_PITCHGAINSEARCH3TAPVQ > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ PITCHGAINSEARCH3TAPVQ_STOP();
+ return best_cdbk;
+}
+#endif
+
+#define OVERRIDE_COMPUTE_PITCH_ERROR
+#ifndef OVERRIDE_PITCH_GAIN_SEARCH_3TAP_VQ
+inline Int32 compute_pitch_error(Int16 *C, Int16 *g, Int16 pitch_control)
+{
+ register int c10, c32, c54, c76, c98, c83;
+ register int g10, g32, g02, g22, g01, g21, g20;
+ register int pp, tmp0, tmp1, tmp2, tmp3;
+ register int sum = 0;
+
+
+ COMPUTEPITCHERROR_START();
+
+ g10 = ld32(g);
+ g32 = ld32x(g,1);
+ pp = pack16lsb(pitch_control,pitch_control);
+ c10 = ld32(C);
+ c32 = ld32x(C,1);
+ g02 = pack16lsb(g10,g32);
+ g22 = pack16lsb(g32,g32);
+ g01 = funshift2(g10,g10);
+ tmp0 = dspidualmul(g10,pp);
+ sum += ifir16(tmp0, c10);
+ sum += pitch_control * sex16(g32) * sex16(c32);
+ c54 = ld32x(C,2);
+ c76 = ld32x(C,3);
+ c98 = ld32x(C,4);
+ tmp1 = dspidualmul(g22,g01);
+ sum -= ifir16(tmp1, c54);
+ tmp2 = dspidualmul(g10,g10);
+ sum -= ifir16(tmp2,c76);
+ c83 = funshift2(c98,c32);
+ g20 = funshift2(g02,g02);
+ g21 = funshift2(g02,g10);
+ tmp3 = dspidualmul(g20,g21);
+ sum -= ifir16(tmp3,c83);
+
+ COMPUTEPITCHERROR_STOP();
+ return sum;
+}
+#endif
+
+#define OVERRIDE_OPEN_LOOP_NBEST_PITCH
+void open_loop_nbest_pitch(Int16 *sw, int start, int end, int len, int *pitch, Int16 *gain, int N, char *stack)
+{
+ VARDECL(int *best_score);
+ VARDECL(int *best_ener);
+ VARDECL(Int32 *corr);
+ VARDECL(Int16 *corr16);
+ VARDECL(Int16 *ener16);
+ register int i, j, k, l, N4, N2;
+ register int _sw10, _sw32, _s0, _s2, limit;
+ register int *energy;
+ register int cshift=0, eshift=0;
+ register int scaledown = 0;
+ register int e0, _energy0;
+
+ ALLOC(corr16, end-start+1, Int16);
+ ALLOC(ener16, end-start+1, Int16);
+ ALLOC(corr, end-start+1, Int32);
+ ALLOC(best_score, N, int);
+ ALLOC(best_ener, N, int);
+ energy = corr;
+ N4 = N << 2;
+ N2 = N >> 1;
+
+ TMDEBUG_ALIGNMEM(sw);
+ TMDEBUG_ALIGNMEM(pitch);
+ TMDEBUG_ALIGNMEM(gain);
+ TMDEBUG_ALIGNMEM(best_score);
+ TMDEBUG_ALIGNMEM(best_ener);
+ TMDEBUG_ALIGNMEM(corr16);
+ TMDEBUG_ALIGNMEM(ener16);
+
+ OPENLOOPNBESTPITCH_START();
+
+ for ( i=0 ; i<N4 ; i+=4 )
+ { st32d(i,best_score,-1);
+ st32d(i,best_ener,0);
+ st32d(i,pitch,start);
+ }
+
+ for ( j=asri(1,-end) ; j<N2 ; ++j )
+ { register int _sw10;
+
+ _sw10 = ld32x(sw,j);
+ _sw10 = dspidualabs(_sw10);
+
+ if ( _sw10 & 0xC000C000 )
+ { scaledown = 1;
+ break;
+ }
+ }
+
+ if ( scaledown )
+ {
+ for ( j=asri(1,-end),k=asli(1,-end) ; j<N2 ; ++j,k+=4 )
+ { register int _sw10;
+
+ _sw10 = ld32x(sw,j);
+ _sw10 = dualasr(_sw10,1);
+ st32d(k, sw, _sw10);
+ }
+ }
+
+ energy[0] = _energy0 = inner_prod(sw-start, sw-start, len);
+ e0 = inner_prod(sw, sw, len);
+
+ j=asri(1,-start-1); k=j+20;
+ _sw10 = ld32x(sw,j);
+ _sw32 = ld32x(sw,k);
+ limit = end-1-start;
+
+ for ( i=1,--j,--k ; i<limit ; i+=2,--j,--k )
+ { register int _energy1, __sw10, __sw32, __s0, __s2;
+
+ _s0 = sex16(_sw10);
+ _s2 = sex16(_sw32);
+ _energy1 = (_energy0 + ((_s0 * _s0) >> 6)) - ((_s2 * _s2) >> 6);
+ _energy0 = imax(0,_energy1);
+ energy[i] = _energy0;
+ __sw10 = ld32x(sw,j);
+ __sw32 = ld32x(sw,k);
+ __s0 = asri(16,__sw10);
+ __s2 = asri(16,__sw32);
+ _energy1 = (_energy0 + ((__s0 * __s0) >> 6)) - ((__s2 * __s2) >> 6);
+ _energy0 = imax(0,_energy1);
+ energy[i+1] = _energy0;
+ _sw10 = __sw10;
+ _sw32 = __sw32;
+ }
+
+ _s0 = sex16(_sw10);
+ _s2 = sex16(_sw32);
+ _energy0 = imax(0,(_energy0 + ((_s0 * _s0) >> 6)) - ((_s2 * _s2) >> 6));
+ energy[i] = _energy0;
+
+
+ eshift = normalize16(energy, ener16, 32766, end-start+1);
+	/* In fixed-point, this actually overwrites the energy array (aliased to corr) */
+ pitch_xcorr(sw, sw-end, corr, len, end-start+1, stack);
+ /* Normalize to 180 so we can square it and it still fits in 16 bits */
+ cshift = normalize16(corr, corr16, 180, end-start+1);
+ /* If we scaled weighted input down, we need to scale it up again (OK, so we've just lost the LSB, who cares?) */
+
+ if ( scaledown )
+ {
+ for ( j=asri(1,-end),k=asli(1,-end) ; j<N2 ; ++j,k+=4 )
+ { register int _sw10;
+
+ _sw10 = ld32x(sw,j);
+ _sw10 = dualasl(_sw10,1);
+ st32d(k, sw, _sw10);
+ }
+ }
+
+ /* Search for the best pitch prediction gain */
+ for ( i=start,l=0 ; i<end ; i+=2,++l )
+ { register int _corr16, _c0, _c1;
+ register int _ener16, _e0, _e1;
+
+ _corr16 = ld32x(corr16,l);
+ _corr16 = dspidualmul(_corr16,_corr16);
+ _c0 = sex16(_corr16);
+ _c1 = asri(16,_corr16);
+
+ _ener16 = ld32x(ener16,l);
+ _ener16 = dspidualadd(_ener16,0x00010001);
+ _e0 = sex16(_ener16);
+ _e1 = asri(16,_ener16);
+
+ /* Instead of dividing the tmp by the energy, we multiply on the other side */
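+		/* i.e. compare corr^2 * best_ener against best_score * energy rather than
+		   forming corr^2 / energy, which would cost a divide per candidate lag */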
+
+ if ( (_c0 * best_ener[N-1]) > (best_score[N-1] * _e0) )
+ {
+ best_score[N-1] = _c0;
+ best_ener[N-1] = _e0;
+ pitch[N-1] = i;
+
+ for( j=0 ; j<N-1 ; ++j )
+ { if ( (_c0 * best_ener[j]) > best_score[j] * _e0 )
+ { for( k=N-1 ; k>j ; --k )
+ {
+ best_score[k]=best_score[k-1];
+ best_ener[k]=best_ener[k-1];
+ pitch[k]=pitch[k-1];
+ }
+
+ best_score[j]=_c0;
+ best_ener[j]=_e0;
+ pitch[j]=i;
+ break;
+ }
+ }
+ }
+
+ if ( (_c1 * best_ener[N-1]) > (best_score[N-1] * _e1) )
+ {
+ best_score[N-1] = _c1;
+ best_ener[N-1] = _e1;
+ pitch[N-1] = i+1;
+
+ for( j=0 ; j<N-1 ; ++j )
+ { if ( (_c1 * best_ener[j]) > best_score[j] * _e1 )
+ { for( k=N-1 ; k>j ; --k )
+ {
+ best_score[k]=best_score[k-1];
+ best_ener[k]=best_ener[k-1];
+ pitch[k]=pitch[k-1];
+ }
+
+ best_score[j]=_c1;
+ best_ener[j]=_e1;
+ pitch[j]=i+1;
+ break;
+ }
+ }
+ }
+ }
+
+ /* Compute open-loop gain if necessary */
+ if (gain)
+ {
+ for (j=0;j<N;j++)
+ {
+ spx_word16_t g;
+ i=pitch[j];
+ g = DIV32(SHL32(EXTEND32(corr16[i-start]),cshift), 10+SHR32(MULT16_16(spx_sqrt(e0),spx_sqrt(SHL32(EXTEND32(ener16[i-start]),eshift))),6));
+ gain[j] = imax(0,g);
+ }
+ }
+
+ OPENLOOPNBESTPITCH_STOP();
+}
+
+
+#endif
+
diff --git a/tmv/mdf_tm.h b/tmv/mdf_tm.h
new file mode 100644
index 0000000..cc82bf8
--- /dev/null
+++ b/tmv/mdf_tm.h
@@ -0,0 +1,1192 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file mdf_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+// The power spectrum computation has been moved to fftwrap.c so that the optimisation can be shared between mdf.c and preprocess.c
+#define OVERRIDE_POWER_SPECTRUM
+
+#ifdef FIXED_POINT
+
+#else
+
+#define OVERRIDE_FILTER_DC_NOTCH16
+void filter_dc_notch16(
+ const spx_int16_t * restrict in,
+ float radius,
+ float * restrict out,
+ int len,
+ float * restrict mem
+)
+{
+ register int i;
+ register float den2, r1;
+ register float mem0, mem1;
+
+ FILTERDCNOTCH16_START();
+
+ r1 = 1 - radius;
+ den2 = (radius * radius) + (0.7 * r1 * r1);
+ mem0 = mem[0];
+ mem1 = mem[1];
+
+#if (TM_UNROLL && TM_UNROLL_FILTERDCNOTCH16)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<len ; ++i )
+ {
+ register float vin = in[i];
+ register float vout = mem0 + vin;
+ register float rvout = radius * vout;
+
+ mem0 = mem1 + 2 * (-vin + rvout);
+ mem1 = vin - (den2 * vout);
+
+ out[i] = rvout;
+ }
+#if (TM_UNROLL && TM_UNROLL_FILTERDCNOTCH16)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ mem[0] = mem0;
+ mem[1] = mem1;
+
+ FILTERDCNOTCH16_STOP();
+}
+
+#define OVERRIDE_MDF_INNER_PROD
+float mdf_inner_prod(
+ const float * restrict x,
+ const float * restrict y,
+ int len
+)
+{
+ register float sum = 0;
+
+ MDFINNERPROD_START();
+
+ len >>= 1;
+
+#if (TM_UNROLL && TM_UNROLL_MDFINNERPRODUCT)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ while(len--)
+ {
+ register float acc0, acc1;
+
+ acc0 = (*x++) * (*y++);
+ acc1 = (*x++) * (*y++);
+
+ sum += acc0 + acc1;
+ }
+#if (TM_UNROLL && TM_UNROLL_MDFINNERPRODUCT)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ MDFINNERPROD_STOP();
+
+ return sum;
+}
+
+#define OVERRIDE_SPECTRAL_MUL_ACCUM
+void spectral_mul_accum(
+ const float * restrict X,
+ const float * restrict Y,
+ float * restrict acc,
+ int N, int M
+)
+{
+ register int i, j;
+ register float Xi, Yi, Xii, Yii;
+ register int _N;
+
+ SPECTRALMULACCUM_START();
+
+ acc[0] = X[0] * Y[0];
+ _N = N-1;
+
+ for ( i=1 ; i<_N ; i+=2 )
+ {
+ Xi = X[i];
+ Yi = Y[i];
+ Xii = X[i+1];
+ Yii = Y[i+1];
+
+ acc[i] = (Xi * Yi - Xii * Yii);
+ acc[i+1]= (Xii * Yi + Xi * Yii);
+ }
+
+ acc[_N] = X[_N] * Y[_N];
+
+ for ( j=1,X+=N,Y+=N ; j<M ; j++ )
+ {
+ acc[0] += X[0] * Y[0];
+
+ for ( i=1 ; i<N-1 ; i+=2 )
+ {
+ Xi = X[i];
+ Yi = Y[i];
+ Xii = X[i+1];
+ Yii = Y[i+1];
+
+ acc[i] += (Xi * Yi - Xii * Yii);
+ acc[i+1]+= (Xii * Yi + Xi * Yii);
+ }
+
+ acc[_N] += X[_N] * Y[_N];
+ X += N;
+ Y += N;
+ }
+
+ SPECTRALMULACCUM_STOP();
+}
+
+#define OVERRIDE_WEIGHTED_SPECTRAL_MUL_CONJ
+void weighted_spectral_mul_conj(
+ const float * restrict w,
+ const float p,
+ const float * restrict X,
+ const float * restrict Y,
+ float * restrict prod,
+ int N
+)
+{
+ register int i, j;
+ register int _N;
+
+ WEIGHTEDSPECTRALMULCONJ_START();
+
+ prod[0] = p * w[0] * X[0] * Y[0];
+ _N = N-1;
+
+ for (i=1,j=1;i<_N;i+=2,j++)
+ {
+ register float W;
+ register float Xi, Yi, Xii, Yii;
+
+ Xi = X[i];
+ Yi = Y[i];
+ Xii = X[i+1];
+ Yii = Y[i+1];
+ W = p * w[j];
+
+
+ prod[i] = W * (Xi * Yi + Xii * Yii);
+ prod[i+1]= W * (Xi * Yii - Xii * Yi);
+ }
+
+ prod[_N] = p * w[j] * X[_N] * Y[_N];
+
+ WEIGHTEDSPECTRALMULCONJ_STOP();
+}
+
+#define OVERRIDE_MDF_ADJUST_PROP
+void mdf_adjust_prop(
+ const float * restrict W,
+ int N,
+ int M,
+ float * restrict prop
+)
+{
+ register int i, j;
+ register float max_sum = 1;
+ register float prop_sum = 1;
+
+ MDFADJUSTPROP_START();
+
+ for ( i=0 ; i<M ; ++i )
+ {
+ register float tmp = 1;
+ register int k = i * N;
+ register int l = k + N;
+ register float propi;
+
+#if (TM_UNROLL && TM_UNROLL_MDFADJUSTPROP)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( j=k ; j<l ; ++j )
+ {
+ register float wi = W[j];
+
+ tmp += wi * wi;
+ }
+#if (TM_UNROLL && TM_UNROLL_MDFADJUSTPROP)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ propi = spx_sqrt(tmp);
+ prop[i]= propi;
+ max_sum= fmux(propi > max_sum, propi, max_sum);
+ }
+
+ for ( i=0 ; i<M ; ++i )
+ {
+ register float propi = prop[i];
+
+ propi += .1f * max_sum;
+ prop_sum += propi;
+ prop[i] = propi;
+ }
+
+ prop_sum = 0.99f / prop_sum;
+ for ( i=0 ; i<M ; ++i )
+ { prop[i] = prop_sum * prop[i];
+ }
+
+ MDFADJUSTPROP_STOP();
+}
+
+
+#define OVERRIDE_SPEEX_ECHO_GET_RESIDUAL
+void speex_echo_get_residual(
+ SpeexEchoState * restrict st,
+ float * restrict residual_echo,
+ int len
+)
+{
+ register int i;
+ register float leak2, leake;
+ register int N;
+ register float * restrict window;
+ register float * restrict last_y;
+ register float * restrict y;
+
+ SPEEXECHOGETRESIDUAL_START();
+
+ window = st->window;
+ last_y = st->last_y;
+ y = st->y;
+ N = st->window_size;
+
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOGETRESIDUAL)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N;i++)
+ { y[i] = window[i] * last_y[i];
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOGETRESIDUAL)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ spx_fft(st->fft_table, st->y, st->Y);
+ power_spectrum(st->Y, residual_echo, N);
+
+ leake = st->leak_estimate;
+ leak2 = fmux(leake > .5, 1, 2 * leake);
+ N = st->frame_size;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOGETRESIDUAL)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ { residual_echo[i] *= leak2;
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOGETRESIDUAL)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ residual_echo[N] *= leak2;
+
+#ifndef NO_REMARK
+ (void)len;
+#endif
+
+ SPEEXECHOGETRESIDUAL_STOP();
+}
+#endif
+
+
+void mdf_preemph(
+ SpeexEchoState * restrict st,
+ spx_word16_t * restrict x,
+ const spx_int16_t * restrict far_end,
+ int framesize
+)
+{
+ register spx_word16_t preemph = st->preemph;
+ register spx_word16_t memX = st->memX;
+ register spx_word16_t memD = st->memD;
+ register spx_word16_t * restrict input = st->input;
+ register int i;
+#ifdef FIXED_POINT
+	register int saturated = st->saturated;
+	register int M = st->M;	/* saturation freezes adaptation for M+1 frames */
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<framesize ; ++i )
+ {
+ register spx_int16_t far_endi = far_end[i];
+ register spx_word32_t tmp32;
+ register spx_word16_t inputi = input[i];
+
+ tmp32 = SUB32(EXTEND32(far_endi), EXTEND32(MULT16_16_P15(preemph,memX)));
+
+#ifdef FIXED_POINT
+ saturated = mux(iabs(tmp32) > 32767, M+1, saturated);
+ tmp32 = iclipi(tmp32,32767);
+#endif
+
+ x[i] = EXTRACT16(tmp32);
+ memX = far_endi;
+ tmp32 = SUB32(EXTEND32(inputi), EXTEND32(MULT16_16_P15(preemph, memD)));
+
+#ifdef FIXED_POINT
+ saturated = mux( ((tmp32 > 32767) && (saturated == 0)), 1,
+ mux( ((tmp32 <-32767) && (saturated == 0)), 1, saturated ));
+ tmp32 = iclipi(tmp32,32767);
+#endif
+ memD = inputi;
+ input[i] = tmp32;
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ st->memD = memD;
+ st->memX = memX;
+
+#ifdef FIXED_POINT
+ st->saturated = saturated;
+#endif
+}
+
+void mdf_sub(
+ spx_word16_t * restrict dest,
+ const spx_word16_t * restrict src1,
+ const spx_word16_t * restrict src2,
+ int framesize
+)
+{
+ register int i;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+#ifdef FIXED_POINT
+ for ( i=0,framesize<<=1 ; i<framesize ; i+=4 )
+ { register int src1i, src2i, desti;
+
+ src1i = ld32d(src1,i);
+ src2i = ld32d(src2,i);
+ desti = dspidualsub(src1i,src2i);
+ st32d(i, dest, desti);
+ }
+#else
+ for ( i=0 ; i<framesize ; ++i )
+ { dest[i] = src1[i] - src2[i];
+ }
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+void mdf_sub_int(
+ spx_word16_t * restrict dest,
+ const spx_int16_t * restrict src1,
+ const spx_int16_t * restrict src2,
+ int framesize
+)
+{
+ register int i, j;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+#ifdef FIXED_POINT
+ for ( i=0,framesize<<=1 ; i<framesize ; i+=4 )
+ { register int src1i, src2i, desti;
+
+ src1i = ld32d(src1,i);
+ src2i = ld32d(src2,i);
+ desti = dspidualsub(src1i,src2i);
+ st32d(i, dest, desti);
+ }
+#else
+ for ( i=0,j=0 ; i<framesize ; i+=2,++j )
+ { register int src1i, src2i, desti;
+
+
+ src1i = ld32d(src1,j);
+ src2i = ld32d(src2,j);
+ desti = dspidualsub(src1i,src2i);
+
+ dest[i] = sex16(desti);
+ dest[i+1] = asri(16,desti);
+ }
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+void mdf_compute_weight_gradient(
+ SpeexEchoState * restrict st,
+ spx_word16_t * restrict X,
+ int N,
+ int M
+)
+{
+ register int i, j;
+ register spx_word32_t * restrict PHI = st->PHI;
+
+ for (j=M-1;j>=0;j--)
+ {
+ register spx_word32_t * restrict W = &(st->W[j*N]);
+
+ weighted_spectral_mul_conj(
+ st->power_1,
+ FLOAT_SHL(PSEUDOFLOAT(st->prop[j]),-15),
+ &X[(j+1)*N],
+ st->E,
+ st->PHI,
+ N);
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N;i++)
+ { W[i] = ADD32(W[i],PHI[i]);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+}
+
+void mdf_update_weight(
+ SpeexEchoState * restrict st,
+ int N,
+ int M,
+ int framesize
+)
+{
+ register int j;
+ register int cancel_count = st->cancel_count;
+ register spx_word16_t * restrict wtmp = st->wtmp;
+#ifdef FIXED_POINT
+ register spx_word16_t * restrict wtmp2 = st->wtmp2;
+ register int i;
+#endif
+
+ for ( j=0 ; j<M ; j++ )
+ {
+ register spx_word32_t * restrict W = &(st->W[j*N]);
+
+ if (j==0 || cancel_count%(M-1) == j-1)
+ {
+#ifdef FIXED_POINT
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; i++ )
+ wtmp2[i] = EXTRACT16(PSHR32(W[i],NORMALIZE_SCALEDOWN+16));
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ spx_ifft(st->fft_table, wtmp2, wtmp);
+ memset(wtmp, 0, framesize * sizeof(spx_word16_t));
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=framesize; i<N ; ++i)
+ { wtmp[i]=SHL16(wtmp[i],NORMALIZE_SCALEUP);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ spx_fft(st->fft_table, wtmp, wtmp2);
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N;i++)
+ { W[i] -= SHL32(EXTEND32(wtmp2[i]),16+NORMALIZE_SCALEDOWN-NORMALIZE_SCALEUP-1);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+#else
+ spx_ifft(st->fft_table, W, wtmp);
+ memset(&wtmp[framesize], 0, (N-framesize) * sizeof(spx_word16_t));
+ spx_fft(st->fft_table, wtmp, W);
+#endif
+ }
+ }
+}
+
+#ifdef TWO_PATH
+// The first four parameters are passed in registers,
+// so functions with four parameters generate faster code.
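+// For instance, the call in speex_echo_cancellation() below,
+//     See = mdf_update_foreground(st, Dbf, Sff, See);
+// keeps all four arguments in registers and avoids stack traffic.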
+spx_word32_t mdf_update_foreground(
+ SpeexEchoState * restrict st,
+ spx_word32_t Dbf,
+ spx_word32_t Sff,
+ spx_word32_t See
+)
+{
+ register spx_word32_t Davg1 = st->Davg1;
+ register spx_word32_t Davg2 = st->Davg2;
+ register spx_word32_t Dvar1 = st->Dvar1;
+ register spx_word32_t Dvar2 = st->Dvar2;
+ register spx_word16_t * restrict input = st->input;
+ register int framesize = st->frame_size;
+ register spx_word16_t * restrict xx = st->x + framesize;
+ register spx_word16_t * restrict y = st->y + framesize;
+ register spx_word16_t * restrict ee = st->e + framesize;
+ register int update_foreground;
+ register int i;
+ register int N = st->window_size;
+ register int M = st->M;
+
+#ifdef FIXED_POINT
+ register spx_word32_t sc0 = SUB32(Sff,See);
+ register spx_float_t sc1 = FLOAT_MUL32U(Sff,Dbf);
+
+ Davg1 = ADD32(MULT16_32_Q15(QCONST16(.6f,15),Davg1), MULT16_32_Q15(QCONST16(.4f,15),sc0));
+ Davg2 = ADD32(MULT16_32_Q15(QCONST16(.85f,15),Davg2), MULT16_32_Q15(QCONST16(.15f,15),sc0));
+ Dvar1 = FLOAT_ADD(
+ FLOAT_MULT(VAR1_SMOOTH,Dvar1),
+ FLOAT_MUL32U(MULT16_32_Q15(QCONST16(.4f,15),Sff),
+ MULT16_32_Q15(QCONST16(.4f,15),Dbf)));
+ Dvar2 = FLOAT_ADD(
+ FLOAT_MULT(VAR2_SMOOTH,Dvar2),
+ FLOAT_MUL32U(MULT16_32_Q15(QCONST16(.15f,15),Sff),
+ MULT16_32_Q15(QCONST16(.15f,15),Dbf)));
+#else
+ register spx_word32_t sc0 = Sff - See;
+ register spx_word32_t sc1 = Sff * Dbf;
+
+ Davg1 = .6*Davg1 + .4*sc0;
+ Davg2 = .85*Davg2 + .15*sc0;
+ Dvar1 = VAR1_SMOOTH*Dvar1 + .16*sc1;
+ Dvar2 = VAR2_SMOOTH*Dvar2 + .0225*sc1;
+#endif
+
+ update_foreground =
+ mux( FLOAT_GT(FLOAT_MUL32U(sc0, VABS(sc0)), sc1), 1,
+ mux( FLOAT_GT(FLOAT_MUL32U(Davg1, VABS(Davg1)), FLOAT_MULT(VAR1_UPDATE,(Dvar1))), 1,
+ mux( FLOAT_GT(FLOAT_MUL32U(Davg2, VABS(Davg2)), FLOAT_MULT(VAR2_UPDATE,(Dvar2))), 1, 0)));
+
+ if ( update_foreground )
+ {
+ register spx_word16_t * restrict windowf = st->window + framesize;
+ register spx_word16_t * restrict window = st->window;
+
+ st->Davg1 = st->Davg2 = 0;
+ st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
+
+ memcpy(st->foreground, st->W, N*M*sizeof(spx_word32_t));
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<framesize ; ++i)
+ { register spx_word16_t wi = window[i];
+ register spx_word16_t wfi = windowf[i];
+ register spx_word16_t ei = ee[i];
+ register spx_word16_t yi = y[i];
+
+ ee[i] = MULT16_16_Q15(wfi,ei) + MULT16_16_Q15(wi,yi);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ } else
+ {
+ register int reset_background;
+
+ reset_background =
+ mux( FLOAT_GT(FLOAT_MUL32U(-(sc0),VABS(sc0)), FLOAT_MULT(VAR_BACKTRACK,sc1)), 1,
+ mux( FLOAT_GT(FLOAT_MUL32U(-(Davg1), VABS(Davg1)), FLOAT_MULT(VAR_BACKTRACK,Dvar1)), 1,
+ mux( FLOAT_GT(FLOAT_MUL32U(-(Davg2), VABS(Davg2)), FLOAT_MULT(VAR_BACKTRACK,Dvar2)), 1, 0)));
+
+ if ( reset_background )
+ {
+ memcpy(st->W, st->foreground, N*M*sizeof(spx_word32_t));
+ memcpy(y, ee, framesize * sizeof(spx_word16_t));
+ mdf_sub(xx,input,y,framesize);
+ See = Sff;
+ st->Davg1 = st->Davg2 = 0;
+ st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
+ } else
+ {
+ st->Davg1 = Davg1;
+ st->Davg2 = Davg2;
+ st->Dvar1 = Dvar1;
+ st->Dvar2 = Dvar2;
+ }
+ }
+
+ return See;
+}
+#endif
+
+void mdf_compute_error_signal(
+ SpeexEchoState * restrict st,
+ const spx_int16_t * restrict in,
+ spx_int16_t * restrict out,
+ int framesize
+)
+{
+ register spx_word16_t preemph = st->preemph;
+ register spx_word16_t memE = st->memE;
+ register int saturated = st->saturated;
+ register spx_word16_t * restrict e = st->e;
+ register spx_word16_t * restrict ee = st->e + framesize;
+ register spx_word16_t * restrict input = st->input;
+ register spx_word16_t * restrict xx = st->x + framesize;
+ register int i;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<framesize ; ++i )
+ {
+ register spx_word32_t tmp_out;
+ register spx_int16_t ini = in[i];
+ register int flg;
+
+#ifdef FIXED_POINT
+
+#ifdef TWO_PATH
+ tmp_out = SUB32(EXTEND32(input[i]), EXTEND32(ee[i]));
+ tmp_out = iclipi(tmp_out,32767);
+#else
+ tmp_out = SUB32(EXTEND32(input[i]), EXTEND32(st->y[framesize+i]));
+ tmp_out = iclipi(tmp_out,32767);
+#endif
+
+#else
+#ifdef TWO_PATH
+ tmp_out = SUB32(EXTEND32(input[i]), EXTEND32(ee[i]));
+#else
+ tmp_out = SUB32(EXTEND32(input[i]), EXTEND32(st->y[framesize+i]));
+#endif
+ tmp_out =
+ fmux( tmp_out > 32767, 32767,
+ fmux( tmp_out < -32768, -32768, tmp_out));
+#endif
+
+ tmp_out = ADD32(tmp_out, EXTEND32(MULT16_16_P15(preemph,memE)));
+ flg = iabs(ini) >= 32000;
+ tmp_out = VMUX( flg, 0, tmp_out);
+ saturated = mux( flg && (saturated == 0), 1, saturated);
+
+ out[i] = (spx_int16_t)tmp_out;
+ memE = tmp_out;
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ st->memE = memE;
+ st->saturated = saturated;
+ memset(e, 0, framesize * sizeof(spx_word16_t));
+ memcpy(ee, xx, framesize * sizeof(spx_word16_t));
+}
+
+inline int mdf_check(
+ SpeexEchoState * restrict st,
+ spx_int16_t * out,
+ spx_word32_t Syy,
+ spx_word32_t Sxx,
+ spx_word32_t See,
+ spx_word32_t Sff,
+ spx_word32_t Sdd
+)
+{
+ register int N = st->window_size;
+ register spx_word32_t N1e9 = N * 1e9;
+ register int screwed_up = st->screwed_up;
+ register int framesize = st->frame_size;
+
+ if (!(Syy>=0 && Sxx>=0 && See >= 0)
+#ifndef FIXED_POINT
+ || !(Sff < N1e9 && Syy < N1e9 && Sxx < N1e9 )
+#endif
+ )
+ {
+ screwed_up += 50;
+ memset(out, 0, framesize * sizeof(spx_int16_t));
+
+ } else
+ { screwed_up = mux( SHR32(Sff, 2) > ADD32(Sdd, SHR32(MULT16_16(N, 10000),6)), screwed_up+1, 0);
+ }
+
+ st->screwed_up = screwed_up;
+
+ return screwed_up;
+}
+
+void mdf_smooth(
+ spx_word32_t * restrict power,
+ spx_word32_t * restrict Xf,
+ int framesize,
+ int M
+)
+{
+ register spx_word16_t ss, ss_1, pf, xff;
+ register int j;
+
+#ifdef FIXED_POINT
+ ss=DIV32_16(11469,M);
+ ss_1 = SUB16(32767,ss);
+#else
+ ss=.35/M;
+ ss_1 = 1-ss;
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( j=0 ; j<framesize ; ++j )
+ { register spx_word32_t pi = power[j];
+ register spx_word32_t xfi = Xf[j];
+
+ power[j] = MULT16_32_Q15(ss_1,pi) + 1 + MULT16_32_Q15(ss,xfi);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ pf = power[framesize];
+ xff = Xf[framesize];
+ power[framesize] = MULT16_32_Q15(ss_1,pf) + 1 + MULT16_32_Q15(ss,xff);
+}
+
+void mdf_compute_filtered_spectra_crosscorrelations(
+ SpeexEchoState * restrict st,
+ spx_word32_t Syy,
+ spx_word32_t See,
+ int framesize
+)
+{
+ register spx_float_t Pey = FLOAT_ONE;
+ register spx_float_t Pyy = FLOAT_ONE;
+ register spx_word16_t spec_average = st->spec_average;
+ register spx_word32_t * restrict pRf = st->Rf;
+ register spx_word32_t * restrict pYf = st->Yf;
+ register spx_word32_t * restrict pEh = st->Eh;
+ register spx_word32_t * restrict pYh = st->Yh;
+ register spx_word16_t beta0 = st->beta0;
+ register spx_word16_t beta_max = st->beta_max;
+ register spx_float_t alpha, alpha_1;
+ register spx_word32_t tmp32, tmpx;
+ register spx_float_t sPey = st->Pey;
+ register spx_float_t sPyy = st->Pyy;
+ register spx_float_t tmp;
+ register spx_word16_t leak_estimate;
+ register int j;
+ register spx_float_t Eh, Yh;
+ register spx_word32_t _Ehj, _Rfj, _Yfj, _Yhj;
+
+#ifdef FIXED_POINT
+ register spx_word16_t spec_average1 = SUB16(32767,spec_average);
+#else
+ register spx_word16_t spec_average1 = 1 - spec_average;
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (j=framesize; j>0 ; --j)
+ {
+ _Ehj = pEh[j];
+ _Rfj = pRf[j];
+ _Yfj = pYf[j];
+ _Yhj = pYh[j];
+
+ Eh = PSEUDOFLOAT(_Rfj - _Ehj);
+ Yh = PSEUDOFLOAT(_Yfj - _Yhj);
+
+ Pey = FLOAT_ADD(Pey,FLOAT_MULT(Eh,Yh));
+ Pyy = FLOAT_ADD(Pyy,FLOAT_MULT(Yh,Yh));
+
+ pEh[j] = MAC16_32_Q15(MULT16_32_Q15(spec_average1, _Ehj), spec_average, _Rfj);
+ pYh[j] = MAC16_32_Q15(MULT16_32_Q15(spec_average1, _Yhj), spec_average, _Yfj);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ _Ehj = pEh[0];
+ _Rfj = pRf[0];
+ _Yfj = pYf[0];
+ _Yhj = pYh[0];
+
+ Eh = PSEUDOFLOAT(_Rfj - _Ehj);
+ Yh = PSEUDOFLOAT(_Yfj - _Yhj);
+
+ Pey = FLOAT_ADD(Pey,FLOAT_MULT(Eh,Yh));
+ Pyy = FLOAT_ADD(Pyy,FLOAT_MULT(Yh,Yh));
+
+ pEh[0] = MAC16_32_Q15(MULT16_32_Q15(spec_average1, _Ehj), spec_average, _Rfj);
+ pYh[0] = MAC16_32_Q15(MULT16_32_Q15(spec_average1, _Yhj), spec_average, _Yfj);
+
+ Pyy = FLOAT_SQRT(Pyy);
+ Pey = FLOAT_DIVU(Pey,Pyy);
+
+ tmp32 = MULT16_32_Q15(beta0,Syy);
+ tmpx = MULT16_32_Q15(beta_max,See);
+ tmp32 = VMUX(tmp32 > tmpx, tmpx, tmp32);
+ alpha = FLOAT_DIV32(tmp32, See);
+ alpha_1 = FLOAT_SUB(FLOAT_ONE, alpha);
+
+ sPey = FLOAT_ADD(FLOAT_MULT(alpha_1,sPey) , FLOAT_MULT(alpha,Pey));
+ sPyy = FLOAT_ADD(FLOAT_MULT(alpha_1,sPyy) , FLOAT_MULT(alpha,Pyy));
+ tmp = FLOAT_MULT(MIN_LEAK,sPyy);
+
+#ifndef FIXED_POINT
+ sPyy = VMUX(FLOAT_LT(sPyy, FLOAT_ONE), FLOAT_ONE, sPyy);
+ sPey = VMUX(FLOAT_LT(sPey, tmp), tmp, sPey);
+ sPey = VMUX(FLOAT_LT(sPey, sPyy), sPey, sPyy);
+#else
+ sPyy = FLOAT_LT(sPyy, FLOAT_ONE) ? FLOAT_ONE : sPyy;
+ sPey = FLOAT_LT(sPey, tmp) ? tmp : sPey;
+ sPey = FLOAT_LT(sPey, sPyy) ? sPey : sPyy;
+#endif
+
+ leak_estimate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIVU(sPey, sPyy),14));
+
+ leak_estimate = VMUX( leak_estimate > 16383, 32767, SHL16(leak_estimate,1));
+ st->Pey = sPey;
+ st->Pyy = sPyy;
+ st->leak_estimate = leak_estimate;
+}
+
+inline spx_word16_t mdf_compute_RER(
+ spx_word32_t See,
+ spx_word32_t Syy,
+ spx_word32_t Sey,
+ spx_word32_t Sxx,
+ spx_word16_t leake
+)
+{
+ register spx_word16_t RER;
+
+#ifdef FIXED_POINT
+ register spx_word32_t tmp32;
+ register spx_word32_t tmp;
+ spx_float_t bound = PSEUDOFLOAT(Sey);
+
+ tmp32 = MULT16_32_Q15(leake,Syy);
+ tmp32 = ADD32(SHR32(Sxx,13), ADD32(tmp32, SHL32(tmp32,1)));
+
+ bound = FLOAT_DIVU(FLOAT_MULT(bound, bound), PSEUDOFLOAT(ADD32(1,Syy)));
+ tmp = FLOAT_EXTRACT32(bound);
+ tmp32 = imux( FLOAT_GT(bound, PSEUDOFLOAT(See)), See,
+ imux( tmp32 < tmp, tmp, tmp32));
+
+ tmp = SHR32(See,1);
+ tmp32 = imux(tmp32 > tmp, tmp, tmp32);
+ RER = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32,See),15));
+#else
+ register spx_word32_t r0;
+
+ r0 = (Sey * Sey)/(1 + See * Syy);
+
+ RER = (.0001*Sxx + 3.* MULT16_32_Q15(leake,Syy)) / See;
+ RER = fmux( RER < r0, r0, RER);
+ RER = fmux( RER > .5, .5, RER);
+#endif
+
+ return RER;
+}
+
+void mdf_adapt(
+ SpeexEchoState * restrict st,
+ spx_word16_t RER,
+ spx_word32_t Syy,
+ spx_word32_t See,
+ spx_word32_t Sxx
+)
+{
+ register spx_float_t * restrict power_1 = st->power_1;
+ register spx_word32_t * restrict power = st->power;
+ register int adapted = st->adapted;
+ register spx_word32_t sum_adapt = st->sum_adapt;
+ register spx_word16_t leake = st->leak_estimate;
+ register int framesize = st->frame_size;
+ register int i;
+ register int M = st->M;
+
+ adapted = mux( !adapted && sum_adapt > QCONST32(M,15) &&
+ MULT16_32_Q15(leake,Syy) > MULT16_32_Q15(QCONST16(.03f,15),Syy), 1, adapted);
+
+ if ( adapted )
+ { register spx_word32_t * restrict Yf = st->Yf;
+ register spx_word32_t * restrict Rf = st->Rf;
+ register spx_word32_t r, e, e2;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<framesize ; ++i )
+ {
+ r = SHL32(Yf[i],3);
+ r = MULT16_32_Q15(leake,r);
+ e = SHL32(Rf[i],3)+1;
+
+#ifdef FIXED_POINT
+ e2 = SHR32(e,1);
+ r = mux( r > e2, e2, r);
+#else
+ e2 = e * .5;
+ r = fmux( r > e2, e2, r);
+#endif
+
+ r = MULT16_32_Q15(QCONST16(.7,15),r) +
+ MULT16_32_Q15(QCONST16(.3,15),(spx_word32_t)(MULT16_32_Q15(RER,e)));
+
+ power_1[i] = FLOAT_SHL(FLOAT_DIV32_FLOAT(r,FLOAT_MUL32U(e,power[i]+10)),WEIGHT_SHIFT+16);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ r = SHL32(Yf[framesize],3);
+ r = MULT16_32_Q15(leake,r);
+ e = SHL32(Rf[framesize],3)+1;
+
+#ifdef FIXED_POINT
+ e2 = SHR32(e,1);
+ r = mux( r > e2, e2, r);
+#else
+ e2 = e * .5;
+ r = fmux( r > e2, e2, r);
+#endif
+
+ r = MULT16_32_Q15(QCONST16(.7,15),r) +
+ MULT16_32_Q15(QCONST16(.3,15),(spx_word32_t)(MULT16_32_Q15(RER,e)));
+
+ power_1[framesize] = FLOAT_SHL(FLOAT_DIV32_FLOAT(r,FLOAT_MUL32U(e,power[framesize]+10)),WEIGHT_SHIFT+16);
+
+ } else
+ {
+ register spx_word16_t adapt_rate=0;
+ register int N = st->window_size;
+
+ if ( Sxx > SHR32(MULT16_16(N, 1000),6) )
+ { register spx_word32_t tmp32, tmp32q;
+
+ tmp32 = MULT16_32_Q15(QCONST16(.25f, 15), Sxx);
+#ifdef FIXED_POINT
+ tmp32q = SHR32(See,2);
+ tmp32 = mux(tmp32 > tmp32q, tmp32q, tmp32);
+#else
+ tmp32q = 0.25 * See;
+ tmp32 = fmux(tmp32 > tmp32q, tmp32q, tmp32);
+#endif
+ adapt_rate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32, See),15));
+ }
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<framesize;i++)
+ power_1[i] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate),ADD32(power[i],10)),WEIGHT_SHIFT+1);
+#if (TM_UNROLL && TM_UNROLL_SPEEXECHOCANCELLATION)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ power_1[framesize] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate),ADD32(power[framesize],10)),WEIGHT_SHIFT+1);
+ sum_adapt = ADD32(sum_adapt,adapt_rate);
+ }
+
+ st->sum_adapt = sum_adapt;
+ st->adapted = adapted;
+}
+
+#define OVERRIDE_ECHO_CANCELLATION
+void speex_echo_cancellation(
+ SpeexEchoState * restrict st,
+ const spx_int16_t * restrict in,
+ const spx_int16_t * restrict far_end,
+ spx_int16_t * restrict out
+)
+{
+ register int framesize = st->frame_size;
+ register spx_word16_t * restrict x = st->x;
+ register spx_word16_t * restrict xx = st->x + framesize;
+ register spx_word16_t * restrict yy = st->y + framesize;
+ register spx_word16_t * restrict ee = st->e + framesize;
+ register spx_word32_t Syy, See, Sxx, Sdd, Sff;
+ register spx_word16_t RER;
+ register spx_word32_t Sey;
+ register int j;
+ register int N,M;
+#ifdef TWO_PATH
+ register spx_word32_t Dbf;
+#endif
+
+ N = st->window_size;
+ M = st->M;
+ st->cancel_count++;
+
+ filter_dc_notch16(in, st->notch_radius, st->input, framesize, st->notch_mem);
+ mdf_preemph(st, xx, far_end, framesize);
+
+ {
+
+ register spx_word16_t * restrict X = st->X;
+
+ for ( j=M-1 ; j>=0 ; j-- )
+ { register spx_word16_t * restrict Xdes = &(X[(j+1)*N]);
+ register spx_word16_t * restrict Xsrc = &(X[j*N]);
+
+ memcpy(Xdes, Xsrc, N * sizeof(spx_word16_t));
+ }
+
+ spx_fft(st->fft_table, x, X);
+ memcpy(st->last_y, st->x, N * sizeof(spx_word16_t));
+ Sxx = mdf_inner_prod(xx, xx, framesize);
+ memcpy(x, xx, framesize * sizeof(spx_word16_t));
+
+#ifdef TWO_PATH
+ spectral_mul_accum(st->X, st->foreground, st->Y, N, M);
+ spx_ifft(st->fft_table, st->Y, st->e);
+ mdf_sub(xx, st->input, ee, framesize);
+ Sff = mdf_inner_prod(xx, xx, framesize);
+#endif
+
+ mdf_adjust_prop (st->W, N, M, st->prop);
+
+ if (st->saturated == 0)
+ { mdf_compute_weight_gradient(st, X, N, M);
+ } else
+ { st->saturated--;
+ }
+ }
+
+ mdf_update_weight(st, N, M, framesize);
+ spectral_mul_accum(st->X, st->W, st->Y, N, M);
+ spx_ifft(st->fft_table, st->Y, st->y);
+
+#ifdef TWO_PATH
+ mdf_sub(xx, ee, yy, framesize);
+ Dbf = 10+mdf_inner_prod(xx, xx, framesize);
+#endif
+
+ mdf_sub(xx, st->input, yy, framesize);
+ See = mdf_inner_prod(xx, xx, framesize);
+
+#ifndef TWO_PATH
+ Sff = See;
+#else
+ See = mdf_update_foreground(st,Dbf,Sff,See);
+#endif
+
+
+ mdf_compute_error_signal(st, in, out, framesize);
+ Sey = mdf_inner_prod(ee, yy, framesize);
+ Syy = mdf_inner_prod(yy, yy, framesize);
+ Sdd = mdf_inner_prod(st->input, st->input, framesize);
+
+ if ( mdf_check(st,out,Syy,Sxx,See,Sff,Sdd) >= 50 )
+ { speex_warning("The echo canceller started acting funny and got slapped (reset). It swears it will behave now.");
+ speex_echo_state_reset(st);
+ return;
+ }
+
+ See = MAX32(See, SHR32(MULT16_16(N, 100),6));
+ spx_fft(st->fft_table, st->e, st->E);
+ memset(st->y, 0, framesize * sizeof(spx_word16_t));
+ spx_fft(st->fft_table, st->y, st->Y);
+ power_spectrum(st->E, st->Rf, N);
+ power_spectrum(st->Y, st->Yf, N);
+ power_spectrum(st->X, st->Xf, N);
+
+ mdf_smooth(st->power,st->Xf,framesize, M);
+ mdf_compute_filtered_spectra_crosscorrelations(st,Syy,See,framesize);
+ RER = mdf_compute_RER(See,Syy,Sey,Sxx,st->leak_estimate);
+ mdf_adapt(st, RER, Syy, See, Sxx);
+
+ if ( st->adapted )
+ { register spx_word16_t * restrict last_yy = st->last_y + framesize;
+
+ memcpy(st->last_y, last_yy, framesize * sizeof(spx_word16_t));
+ mdf_sub_int(last_yy, in, out, framesize);
+
+ }
+}
+
+
+
diff --git a/tmv/misc_tm.h b/tmv/misc_tm.h
new file mode 100644
index 0000000..4d6cea7
--- /dev/null
+++ b/tmv/misc_tm.h
@@ -0,0 +1,92 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file misc_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <tmml.h>
+
+
+#if TM_PROFILE
+int __profile_begin;
+int __profile_end;
+#endif
+
+#define OVERRIDE_SPEEX_ALLOC
+void *speex_alloc (int size)
+{
+ void *ptr;
+
+ if ( tmmlMalloc(0, size, (pVoid*)&ptr, tmmlMallocCacheAligned | tmmlMallocCleared) != TM_OK )
+ { return NULL;
+ }
+
+ return ptr;
+}
+
+
+#define OVERRIDE_SPEEX_ALLOC_SCRATCH
+void *speex_alloc_scratch (int size)
+{
+ void *ptr;
+
+ if ( tmmlMalloc(0, size, (pVoid*)&ptr, tmmlMallocCacheAligned | tmmlMallocCleared) != TM_OK )
+ { return NULL;
+ }
+
+ return ptr;
+}
+
+
+#define OVERRIDE_SPEEX_REALLOC
+void *speex_realloc (void *ptr, int size)
+{
+ if ( tmmlRealloc(0, size, (pVoid)ptr, (pVoid*)&ptr, tmmlMallocCacheAligned | tmmlMallocCleared) != TM_OK )
+ { return NULL;
+ }
+
+ return ptr;
+}
+
+
+#define OVERRIDE_SPEEX_FREE
+void speex_free (void *ptr)
+{
+ tmmlFree(ptr);
+}
+
+
+#define OVERRIDE_SPEEX_FREE_SCRATCH
+void speex_free_scratch (void *ptr)
+{
+ tmmlFree(ptr);
+}
+
diff --git a/tmv/preprocess_tm.h b/tmv/preprocess_tm.h
new file mode 100644
index 0000000..f903b73
--- /dev/null
+++ b/tmv/preprocess_tm.h
@@ -0,0 +1,1135 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file preprocess_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+#define OVERRIDE_PREPROCESS_ANALYSIS
+static void preprocess_analysis(SpeexPreprocessState * restrict st, spx_int16_t * restrict x)
+{
+ register int i, j, framesize = st->frame_size;
+ register int N = st->ps_size;
+ register int N3 = 2*N - framesize;
+ register int N4 = framesize - N3;
+ register int * restrict ps = st->ps;
+ register int * restrict frame;
+ register int * restrict inbuf;
+ register int * restrict ptr;
+ register int max_val;
+
+ frame = (int*)(st->frame);
+ inbuf = (int*)(st->inbuf);
+ ptr = (int*)(st->frame+N3);
+
+ TMDEBUG_ALIGNMEM(x);
+ TMDEBUG_ALIGNMEM(frame);
+ TMDEBUG_ALIGNMEM(inbuf);
+
+ PREPROCESSANAYLSIS_START();
+
+ N3 >>= 1;
+ framesize >>= 1;
+ max_val = 0;
+
+ for ( i=0,j=0 ; i<N3 ; i+=2,j+=8 )
+ { register int r1, r2;
+
+ r1 = ld32x(inbuf,i);
+ r2 = ld32x(inbuf,i+1);
+
+ st32d(j, frame, r1);
+ st32d(j+4, frame, r2);
+ }
+
+ for ( i=0,j=0 ; i<framesize ; i+=2,j+=8 )
+ { register int r1, r2;
+
+ r1 = ld32x(x, i);
+ r2 = ld32x(x, i+1);
+
+ st32d(j, ptr, r1);
+ st32d(j+4,ptr, r2);
+ }
+
+ for ( i=0,j=0,ptr=(int*)(x+N4) ; i<N3 ; i+=2,j+=8 )
+ { register int r1, r2;
+
+ r1 = ld32x(ptr, i);
+ r2 = ld32x(ptr, i+1);
+
+ st32d(j, inbuf, r1);
+ st32d(j+4,inbuf, r2);
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0,ptr=(int*)st->window ; i<N ; ++i,j+=4 )
+ { register int f10, w10, r0, r1;
+
+ f10 = ld32x(frame, i);
+ w10 = ld32x(ptr, i);
+
+ r0 = (sex16(f10) * sex16(w10)) >> 15;
+ r1 = (asri(16,f10) * asri(16,w10)) >> 15;
+
+ max_val = imax(iabs(sex16(r0)), max_val);
+ max_val = imax(iabs(sex16(r1)), max_val);
+
+ r0 = pack16lsb(r1, r0);
+ st32d(j, frame, r0);
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ max_val = 14 - spx_ilog2(max_val);
+ st->frame_shift = max_val;
+
+ if ( max_val != 0 )
+ {
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0 ; i<N ; ++i,j+=4 )
+ { register int f10;
+
+ f10 = ld32x(frame, i);
+ f10 = dualasl(f10, max_val);
+ st32d(j, frame, f10);
+
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+
+ spx_fft(st->fft_lookup, st->frame, st->ft);
+ power_spectrum(st->ft, ps, N << 1);
+
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,ptr=(int*)st->ps,max_val<<=1,j=((1<<((max_val))>>1)) ;i<N ; ++i )
+ {
+ ps[i] = (ps[i] + j) >> max_val;
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ filterbank_compute_bank32(st->bank, ps, ps+N);
+
+ PREPROCESSANAYLSIS_STOP();
+}
+
+#define _MULT16_32_Q15(a,b,c) ADD32(MULT16_16((a),(b)), SHR(MULT16_16((a),(c)),15))
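+/* _MULT16_32_Q15(a,b,c) is MULT16_32_Q15(a,x) for a 32-bit x that the code
+ * below has split as x = (b<<15) + c, with b = x>>15 and c = x & 0x7fff:
+ * it evaluates a*b + ((a*c)>>15), which equals (a*x)>>15, so the loop can
+ * keep the split halves in the *_msb / *_lsb register pairs.
+ */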
+
+#define OVERRIDE_UPDATE_NOISE_PROB
+static void update_noise_prob(SpeexPreprocessState * restrict st)
+{
+ register int i;
+ register int min_range, nb_adapt;
+ register int N = st->ps_size;
+ register int * restrict Smin = (int*)st->Smin;
+ register int * restrict Stmp = (int*)st->Stmp;
+ register int * restrict S = (int*)st->S;
+
+ UPDATENOISEPROB_START();
+
+ {
+ register int psi_lsb, psi_msb, ips_lsb, ips_msb, psii_lsb, psii_msb;
+ register int psiii_lsb, psiii_msb;
+ register int q8, q05, q2, q1;
+ register int *ps = (int*)st->ps;
+ register int si_lsb, si_msb, sii_lsb, sii_msb;
+
+ q8 = QCONST16(.8f,15);
+ q05 = QCONST16(.05f,15);
+ q2 = QCONST16(.2f,15);
+ q1 = QCONST16(.1f,15);
+
+ ips_lsb = ps[0];
+ psi_lsb = ps[1];
+ si_lsb = S[0];
+ ips_msb = ips_lsb >> 15;
+ psi_msb = psi_lsb >> 15;
+ si_msb = si_lsb >> 15;
+
+ ips_lsb &= 0x00007fff;
+ psi_lsb &= 0x00007fff;
+ si_lsb &= 0x00007fff;
+
+ S[0] = _MULT16_32_Q15(q8,si_msb,si_lsb) + _MULT16_32_Q15(q2,ips_msb,ips_lsb);
+
+ for ( i=1 ; i<N-1 ; i+=2 )
+ {
+ psii_lsb = ps[i+1];
+ si_lsb = S[i];
+
+ psii_msb = psii_lsb >> 15;
+ si_msb = si_lsb >> 15;
+ si_lsb &= 0x00007fff;
+ psii_lsb &= 0x00007fff;
+
+ S[i]= _MULT16_32_Q15(q8,si_msb,si_lsb) +
+ _MULT16_32_Q15(q05,ips_msb,ips_lsb) +
+ _MULT16_32_Q15(q1,psi_msb,psi_lsb) +
+ _MULT16_32_Q15(q05,psii_msb,psii_lsb);
+
+ psiii_lsb = ps[i+2];
+ sii_lsb = S[i+1];
+
+ sii_msb = sii_lsb >> 15;
+ psiii_msb= psiii_lsb >> 15;
+ sii_lsb &= 0x00007fff;
+ psiii_lsb&= 0x00007fff;
+
+ S[i+1]= _MULT16_32_Q15(q8,sii_msb,sii_lsb) +
+ _MULT16_32_Q15(q05,psi_msb,psi_lsb) +
+ _MULT16_32_Q15(q1,psii_msb,psii_lsb) +
+ _MULT16_32_Q15(q05,psiii_msb,psiii_lsb);
+
+ ips_lsb = psii_lsb;
+ ips_msb = psii_msb;
+ psi_lsb = psiii_lsb;
+ psi_msb = psiii_msb;
+ }
+
+ S[N-1] = MULT16_32_Q15(q8,S[N-1]) + MULT16_32_Q15(q2,ps[N-1]);
+ }
+
+ nb_adapt = st->nb_adapt;
+
+ if ( nb_adapt==1 )
+ { for ( i=0 ; i<N ; ++i )
+ Smin[i] = Stmp[i] = 0;
+
+ }
+
+ min_range = mux(nb_adapt < 100, 15,
+ mux(nb_adapt < 1000, 50,
+ mux(nb_adapt < 10000, 150, 300)));
+
+ if ( st->min_count > min_range )
+ {
+ st->min_count = 0;
+
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ { register int si, stmpi;
+
+ si = S[i];
+ stmpi = Stmp[i];
+
+ Smin[i] = imin(stmpi,si);
+ Stmp[i] = si;
+ }
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ } else
+ {
+
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ { register int si, stmpi, smini;
+
+ si = S[i];
+ stmpi = Stmp[i];
+ smini = Smin[i];
+
+ Smin[i] = imin(smini,si);
+ Stmp[i] = imin(stmpi,si);
+ }
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+
+
+ {
+ register int q4;
+ register int * restrict update_prob = (int*)st->update_prob;
+
+ q4 = QCONST16(.4f,15);
+
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ { register int si;
+ register int smini;
+
+ si = S[i];
+ smini = Smin[i];
+ update_prob[i] = mux(MULT16_32_Q15(q4,si) > ADD32(smini,20), 1, 0);
+ }
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+
+ UPDATENOISEPROB_STOP();
+}
+
+#else
+
+#define OVERRIDE_PREPROCESS_ANALYSIS
+static void preprocess_analysis(SpeexPreprocessState * restrict st, spx_int16_t * restrict x)
+{
+ register int i;
+ register int framesize = st->frame_size;
+ register int N = st->ps_size;
+ register int N3 = 2*N - framesize;
+ register int N4 = framesize - N3;
+ register float * restrict ps = st->ps;
+ register float * restrict frame = st->frame;
+ register float * restrict inbuf = st->inbuf;
+
+ PREPROCESSANAYLSIS_START();
+
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N3 ; ++i )
+ {
+ frame[i] = inbuf[i];
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<framesize ; ++i )
+ { frame[N3+i] = x[i];
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,x+=N4 ; i<N3 ; ++i )
+ { inbuf[i] = x[i];
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ inbuf = st->window;
+
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<2*N ; ++i )
+ {
+ frame[i] = frame[i] * inbuf[i];
+ }
+#if (TM_UNROLL && TM_UNROLL_PREPROCESSANALYSIS)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ spx_fft(st->fft_lookup, frame, st->ft);
+ power_spectrum(st->ft, ps, N << 1);
+ filterbank_compute_bank32(st->bank, ps, ps+N);
+
+ PREPROCESSANAYLSIS_STOP();
+}
+
+
+#define OVERRIDE_UPDATE_NOISE_PROB
+static void update_noise_prob(SpeexPreprocessState * restrict st)
+{
+
+ register float * restrict S = st->S;
+ register float * restrict ps = st->ps;
+ register int N = st->ps_size;
+ register int min_range;
+ register int i;
+ register int nb_adapt;
+ register float * restrict Smin = st->Smin;
+ register float * restrict Stmp = st->Stmp;
+
+ UPDATENOISEPROB_START();
+
+ {
+ register float ips, psi;
+
+ ips = ps[0];
+ psi = ps[1];
+
+ S[0] = .8f * S[0] + .2f * ips;
+
+ for ( i=1 ; i<N-1 ; i+=2 )
+ {
+ register float psii, psiii;
+
+ psii = ps[i+1];
+ psiii = ps[i+2];
+ S[i] = .8f * S[i] + .05f * ips + .1f * psi + .05f * psii;
+ S[i+1] = .8f * S[i+1] + .05f * psi + .1f * psii + .05f * psiii;
+ ips = psii;
+ psi = psiii;
+ }
+
+ S[N-1] = .8f * S[N-1] + .2f * ps[N-1];
+ }
+
+ nb_adapt = st->nb_adapt;
+
+ if ( nb_adapt==1 )
+ {
+ for (i=0;i<N;i++)
+ Smin[i] = st->Stmp[i] = 0;
+ }
+
+ min_range = mux(nb_adapt < 100, 15,
+ mux(nb_adapt < 1000, 50,
+ mux(nb_adapt < 10000, 150, 300)));
+
+
+ if ( st->min_count > min_range )
+ {
+ st->min_count = 0;
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ {
+ register float stmpi, si;
+
+ stmpi = Stmp[i];
+ si = S[i];
+
+ Smin[i] = fmin(stmpi,si);
+ Stmp[i] = si;
+ }
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ } else
+ {
+ register float * restrict Smin = st->Smin;
+
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N ; ++i )
+ {
+ register float stmpi, si, smini;
+
+ stmpi = Stmp[i];
+ si = S[i];
+ smini = Smin[i];
+
+ Smin[i] = fmin(smini,si);
+ Stmp[i] = fmin(stmpi,si);
+ }
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+
+ {
+ register int * restrict update_prob = (int*)st->update_prob;
+
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N;i++)
+ { register float si;
+ register float smini;
+
+ si = S[i];
+ smini = Smin[i];
+ update_prob[i] = mux( (.4 * si) > (smini + 20.f), 1, 0);
+
+ }
+#if (TM_UNROLL && TM_UNROLL_UPDATENOISEPROB)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+
+ UPDATENOISEPROB_STOP();
+}
+
+
+#define OVERRIDE_COMPUTE_GAIN_FLOOR
+static void compute_gain_floor(
+ int noise_suppress,
+ int effective_echo_suppress,
+ float * restrict noise,
+ float * restrict echo,
+ float * gain_floor,
+ int len
+)
+{
+ register int i;
+ register float echo_floor;
+ register float noise_floor;
+
+ COMPUTEGAINFLOOR_START();
+
+ noise_floor = exp(.2302585f*noise_suppress);
+ echo_floor = exp(.2302585f*effective_echo_suppress);
+
+#if (TM_UNROLL && TM_UNROLL_COMPUTEGAINFLOOR)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<len;i++)
+ { register float noisei, echoi;
+
+ noisei = noise[i];
+ echoi = echo[i];
+
+ gain_floor[i] = FRAC_SCALING * sqrt(noise_floor * noisei + echo_floor * echoi) / sqrt(1+noisei+echoi);
+
+ }
+#if (TM_UNROLL && TM_UNROLL_COMPUTEGAINFLOOR)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ COMPUTEGAINFLOOR_STOP();
+}
+
+#endif
+
+static inline spx_word32_t hypergeom_gain(spx_word32_t xx);
+static inline spx_word16_t qcurve(spx_word16_t x);
+static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len);
+void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
+
+#ifndef FIXED_POINT
+static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx_word16_t *ft);
+#endif
+
+void preprocess_residue_echo(
+ SpeexPreprocessState * restrict st,
+ int N,
+ int NM
+)
+{
+ if (st->echo_state)
+ {
+ register spx_word32_t * restrict r_echo = st->residual_echo;
+ register spx_word32_t * restrict e_noise = st->echo_noise;
+ register int i;
+
+#ifndef FIXED_POINT
+ register spx_word32_t r;
+#endif
+
+ speex_echo_get_residual(st->echo_state, r_echo, N);
+
+#ifndef FIXED_POINT
+ r = r_echo[0];
+ if (!(r >=0 && r < N*1e9f) )
+ {
+ memset(r_echo, 0, N * sizeof(spx_word32_t));
+ }
+#endif
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N;i++)
+ { register spx_word32_t eni = e_noise[i];
+ e_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),eni), r_echo[i]);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ filterbank_compute_bank32(st->bank, e_noise, e_noise+N);
+
+ } else
+ { memset(st->echo_noise, 0, (NM) * sizeof(spx_word32_t));
+ }
+}
+
+void preprocess_update_noise(
+ SpeexPreprocessState * restrict st,
+ spx_word32_t * restrict ps,
+ int N
+)
+{
+ register spx_word16_t beta, beta_1;
+ register int * restrict up = st->update_prob;
+ register spx_word32_t * restrict noise = st->noise;
+ register int i;
+
+ beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
+ beta_1 = Q15_ONE-beta;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N;i++)
+ { register spx_word32_t ni = noise[i];
+ register spx_word32_t psi = ps[i];
+
+ if ( !up[i] || psi < PSHR32(ni, NOISE_SHIFT) )
+ { noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,ni) +
+ MULT16_32_Q15(beta,SHL32(psi,NOISE_SHIFT)));
+ }
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ filterbank_compute_bank32(st->bank, noise, noise+N);
+}
+
+void preprocess_compute_SNR(
+ SpeexPreprocessState * restrict st,
+ spx_word32_t * restrict ps,
+ int NM
+)
+{
+ register spx_word32_t * restrict noise = st->noise;
+ register spx_word32_t * restrict echo = st->echo_noise;
+ register spx_word32_t * restrict reverb = st->reverb_estimate;
+ register spx_word16_t * restrict post = st->post;
+ register spx_word32_t * restrict old_ps = st->old_ps;
+ register spx_word16_t * restrict prior = st->prior;
+ register int i;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<NM ; i++)
+ {
+ register spx_word16_t gamma;
+ register spx_word32_t tot_noise;
+ register spx_word16_t posti;
+ register spx_word32_t opsi;
+ register spx_word16_t priori;
+
+ tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(noise[i],NOISE_SHIFT)), echo[i]) , reverb[i]);
+
+ posti = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT));
+ posti = MIN16(posti, QCONST16(100.f,SNR_SHIFT));
+ post[i] = posti;
+
+ opsi = old_ps[i];
+
+ gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(opsi,ADD32(opsi,tot_noise))));
+
+ priori = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,posti)), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(opsi,tot_noise))), 15));
+ prior[i]=MIN16(priori, QCONST16(100.f,SNR_SHIFT));
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+spx_word32_t preprocess_smooth_SNR(
+ SpeexPreprocessState * restrict st,
+ int N,
+ int NM
+)
+{
+ register spx_word16_t * restrict zeta = st->zeta;
+ register spx_word16_t * restrict prior = st->prior;
+ register spx_word32_t Zframe;
+ register spx_word16_t iprior, priori;
+ register int _N = N-1;
+ register int i;
+
+ iprior = prior[0];
+ priori = prior[1];
+ zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),zeta[0]), MULT16_16(QCONST16(.3f,15),iprior)),15);
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=1 ; i<_N ; i++)
+ { register spx_word16_t zetai = zeta[i];
+ register spx_word16_t priorii = prior[i+1];
+
+ zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),zetai), MULT16_16(QCONST16(.15f,15),priori)),
+ MULT16_16(QCONST16(.075f,15),iprior)), MULT16_16(QCONST16(.075f,15),priorii)),15);
+
+ iprior = priori;
+ priori = priorii;
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ for (i=_N; i<NM ; i++)
+ { register spx_word16_t zetai = zeta[i];
+
+ priori = prior[i];
+ zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),zetai), MULT16_16(QCONST16(.3f,15),priori)),15);
+ }
+
+ Zframe = 0;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=N ; i<NM ; i++ )
+ { Zframe = ADD32(Zframe, EXTEND32(zeta[i]));
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ return Zframe;
+}
+
+void preprocess_compute_emgain(
+ SpeexPreprocessState * restrict st,
+ spx_word32_t * restrict ps,
+ spx_word16_t Pframe,
+ int NM
+)
+{
+ register spx_word16_t * restrict zeta = st->zeta;
+ register spx_word16_t * restrict prior = st->prior;
+ register spx_word16_t * restrict gain = st->gain;
+ register spx_word32_t * restrict old_ps = st->old_ps;
+ register spx_word16_t * restrict post = st->post;
+ register spx_word16_t * restrict gain2 = st->gain2;
+ register int i;
+ register int N=st->ps_size;
+
+ for ( i=N ; i<NM ; ++i )
+ {
+ register spx_word32_t theta;
+ register spx_word32_t MM;
+ register spx_word16_t prior_ratio;
+ register spx_word16_t P1;
+ register spx_word16_t q;
+
+#ifdef FIXED_POINT
+ register spx_word16_t tmp;
+#endif
+ register spx_word16_t priori = prior[i];
+
+ prior_ratio = PDIV32_16(SHL32(EXTEND32(priori), 15), ADD16(priori, SHL32(1,SNR_SHIFT)));
+ theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(post[i]),EXPIN_SHIFT-SNR_SHIFT));
+
+ MM = hypergeom_gain(theta);
+ gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
+ old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(gain[i])),ps[i]);
+
+ P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (zeta[i]));
+ q = Q15_ONE-MULT16_16_Q15(Pframe,P1);
+
+#ifdef FIXED_POINT
+ theta = MIN32(theta, EXTEND32(32767));
+ tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+priori),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1))));
+ tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp);
+ tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8));
+ gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
+#else
+ gain2[i]=1/(1.f + (q/(1.f-q))*(1+priori)*exp(-theta));
+#endif
+ }
+
+ filterbank_compute_psd16(st->bank,gain2+N, gain2);
+ filterbank_compute_psd16(st->bank,gain+N, gain);
+}
+
+void preprocess_compute_linear_gain(
+ SpeexPreprocessState * restrict st,
+ spx_word32_t * restrict ps,
+ int N
+)
+{
+ register spx_word16_t * restrict gain_floor = st->gain_floor;
+ register spx_word16_t * restrict prior = st->prior;
+ register spx_word16_t * restrict gain = st->gain;
+ register spx_word32_t * restrict old_ps = st->old_ps;
+ register spx_word16_t * restrict post = st->post;
+ register spx_word16_t * restrict gain2 = st->gain2;
+ register int i;
+
+ filterbank_compute_psd16(st->bank,gain_floor+N,gain_floor);
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N;i++)
+ {
+ register spx_word32_t MM;
+ register spx_word32_t theta;
+ register spx_word16_t prior_ratio;
+ register spx_word16_t tmp;
+ register spx_word16_t p;
+ register spx_word16_t g;
+ register spx_word16_t gfi = gain_floor[i];
+
+ prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(prior[i], SHL32(1,SNR_SHIFT)));
+ theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(post[i]),EXPIN_SHIFT-SNR_SHIFT));
+ MM = hypergeom_gain(theta);
+
+ g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
+ p = gain2[i];
+
+ g = VMUX( MULT16_16_Q15(QCONST16(.333f,15),g) > gain[i], MULT16_16(3,gain[i]), g);
+
+ old_ps[i]= MULT16_32_P15(QCONST16(.2f,15),old_ps[i]) +
+ MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(g)),ps[i]);
+
+ g = VMUX( g < gfi, gfi, g );
+ gain[i] = g;
+
+ tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(g),15))) +
+ MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(gfi),15)));
+
+ gain2[i]=SQR16_Q15(tmp);
+
+ /* Use this if you want a log-domain MMSE estimator instead */
+ /* gain2[i] = pow(g, p) * pow(gfi,1.f-p);*/
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+
+#if 0
+void preprocess_compute_bark_gain(
+ SpeexPreprocessState * restrict st,
+ int N,
+ int NM
+)
+{
+ register spx_word16_t * restrict gain_floor = st->gain_floor;
+ register spx_word16_t * restrict gain = st->gain;
+ register spx_word16_t * restrict gain2 = st->gain2;
+ register int i;
+
+ for (i=N;i<NM;i++)
+ {
+ register spx_word16_t tmp;
+ register spx_word16_t p = gain2[i];
+ register spx_word16_t gaini;
+ register spx_word16_t gfi = gain_floor[i];
+
+ gaini = MAX16(gain[i], gfi);
+
+ gain[i] = gaini;
+
+ tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(gaini),15))) +
+ MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(gfi),15)));
+
+ gain2[i]=SQR16_Q15(tmp);
+ }
+
+ filterbank_compute_psd16(st->bank,gain2+N, gain2);
+}
+#endif
+
+void preprocess_apply_gain(
+ SpeexPreprocessState * restrict st,
+ int N
+)
+{
+ register spx_word16_t * restrict ft = st->ft;
+ register spx_word16_t * restrict gain2 = st->gain2;
+ register int j, i;
+
+ ft[0] = MULT16_16_P15(gain2[0],ft[0]);
+
+ for (i=1,j=1; i<N ; i++,j+=2)
+ {
+ register spx_word16_t gain2i = gain2[i];
+ register spx_word16_t ftj = ft[j];
+ register spx_word16_t ftjj = ft[j+1];
+
+ ft[j] = MULT16_16_P15(gain2i,ftj);
+ ft[j+1] = MULT16_16_P15(gain2i,ftjj);
+ }
+
+ ft[(N<<1)-1] = MULT16_16_P15(gain2[N-1],ft[(N<<1)-1]);
+}
+
+#ifdef FIXED_POINT
+void preprocess_scale(
+ SpeexPreprocessState * restrict st,
+ int N
+)
+{
+ register spx_word16_t * restrict frame = st->frame;
+ register int shift = st->frame_shift;
+ register int i;
+ register int N2 = N << 1;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N2 ;i++)
+ { register spx_word16_t framei = frame[i];
+
+ frame[i] = PSHR16(framei,shift);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+#else
+
+void preprocess_apply_agc(
+ SpeexPreprocessState * restrict st,
+ int N
+)
+{
+ register spx_word16_t max_sample=0;
+ register spx_word16_t * restrict frame = st->frame;
+ register int i;
+ register int N2 = N << 1;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for (i=0;i<N2;i++)
+ { register spx_word16_t framei = VABS(frame[i]);
+
+ max_sample = VMUX( framei > max_sample, framei, max_sample);
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ if ( max_sample > 28000.f )
+ {
+ float damp = 28000.f/max_sample;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i< N2 ; i++ )
+ { frame[i] *= damp;
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+}
+#endif
+
+
+void preprocess_update(
+ SpeexPreprocessState * restrict st,
+ spx_int16_t * restrict x,
+ int N
+)
+{
+ register spx_word16_t * restrict frame = st->frame;
+ register spx_word16_t * restrict window = st->window;
+ register spx_word16_t * restrict outbuf = st->outbuf;
+ register int framesize = st->frame_size;
+ register int N2 = N << 1;
+ register int N3 = N2 - framesize;
+ register int N4 = (framesize) - N3;
+ register int i;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<N2 ; i++)
+ { register spx_word16_t fi = frame[i];
+ register spx_word16_t wi = window[i];
+
+ frame[i] = MULT16_16_Q15(fi, wi);
+ }
+ for (i=0;i<N3;i++)
+ { x[i] = outbuf[i] + frame[i];
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+ for ( i=0;i<N4;i++)
+ { x[N3+i] = frame[N3+i];
+ }
+
+ memcpy(outbuf, frame+framesize, (N3) * sizeof(spx_word16_t));
+}
+
+#define OVERRIDE_SPEEX_PREPROCESS_RUN
+int speex_preprocess_run(SpeexPreprocessState * restrict st, spx_int16_t * restrict x)
+{
+ register int i, N, M, NM;
+ register spx_word32_t * restrict ps=st->ps;
+ register spx_word32_t Zframe;
+ register spx_word16_t Pframe;
+
+ st->nb_adapt++;
+ st->min_count++;
+ N = st->ps_size;
+ M = st->nbands;
+ NM = N + M;
+
+ preprocess_residue_echo(st, N, NM);
+ preprocess_analysis(st, x);
+ update_noise_prob(st);
+ preprocess_update_noise(st, ps, N);
+
+ if ( st->nb_adapt == 1 )
+ { memcpy(st->old_ps, ps, (NM) * sizeof(spx_word32_t));
+ }
+
+ preprocess_compute_SNR(st, ps, NM);
+ Zframe = preprocess_smooth_SNR(st, N, NM);
+
+
+ {
+ register spx_word16_t effective_echo_suppress;
+
+ Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,M)));
+ effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress),
+ MULT16_16(Pframe, st->echo_suppress_active)),15));
+ compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M);
+
+ }
+
+ preprocess_compute_emgain(st, ps, Pframe, NM);
+ preprocess_compute_linear_gain(st, ps, N);
+
+
+ if (!st->denoise_enabled)
+ {
+ register spx_word16_t * restrict gain2 = st->gain2;
+
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0 ; i<NM ; i++ )
+ { gain2[i] = Q15_ONE;
+ }
+#if (TM_UNROLL && TM_UNROLL_SPEEXPREPROCESSRUN)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+ }
+
+ preprocess_apply_gain(st, N);
+
+#ifndef FIXED_POINT
+ if (st->agc_enabled)
+ { speex_compute_agc(st, Pframe, st->ft);
+ }
+#endif
+
+
+ spx_ifft(st->fft_lookup, st->ft, st->frame);
+
+#ifdef FIXED_POINT
+ preprocess_scale(st, N);
+#endif
+
+#ifndef FIXED_POINT
+ if ( st->agc_enabled )
+ { preprocess_apply_agc(st, N);
+ }
+#endif
+
+ preprocess_update(st, x, N);
+
+ if ( st->vad_enabled )
+ {
+ if (Pframe > st->speech_prob_start || (st->was_speech && Pframe > st->speech_prob_continue))
+ { st->was_speech=1;
+ return 1;
+
+ } else
+ { st->was_speech=0;
+ return 0;
+ }
+ } else
+ { return 1;
+ }
+}
diff --git a/tmv/profile_tm.h b/tmv/profile_tm.h
new file mode 100644
index 0000000..8588e26
--- /dev/null
+++ b/tmv/profile_tm.h
@@ -0,0 +1,407 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file profile_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/**
+* @remarks This file provides some facilities for measuring clock cycles.
+* Use it if you are unable to compile with the TriMedia profiling options.
+* A usage sketch follows the PROFILE_START/PROFILE_STOP definitions below.
+*/
+
+extern int __profile_begin;
+extern int __profile_end;
+
+#if TM_PROFILE
+#define PROFILE_START() \
+ { \
+ __profile_begin = cycles(); \
+ } \
+
+#define PROFILE_STOP() \
+ { \
+ __profile_end = cycles(); \
+ printf("%s\t%d\n", __FUNCTION__, __profile_end - __profile_begin); \
+ } \
+
+#else
+#define PROFILE_START()
+#define PROFILE_STOP()
+#endif
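+
+/*
+ * Usage sketch: an instrumented routine brackets its body with the matching
+ * START/STOP pair; each pair expands to PROFILE_START()/PROFILE_STOP() only
+ * when the corresponding TM_PROFILE_* switch is set, and to nothing
+ * otherwise, so a disabled switch adds no overhead. For example,
+ * mdf_inner_prod() in mdf_tm.h does:
+ *
+ *     MDFINNERPROD_START();
+ *     ... accumulate the inner product ...
+ *     MDFINNERPROD_STOP();
+ *     return sum;
+ */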
+
+#if TM_PROFILE_SPXAUTOCORR
+#define _SPX_AUTOCORR_START() PROFILE_START()
+#define _SPX_AUTOCORR_STOP() PROFILE_STOP()
+#else
+#define _SPX_AUTOCORR_START()
+#define _SPX_AUTOCORR_STOP()
+#endif
+
+#if TM_PROFILE_INNERPROD
+#define INNERPROD_START() PROFILE_START()
+#define INNERPROD_STOP() PROFILE_STOP()
+#else
+#define INNERPROD_START()
+#define INNERPROD_STOP()
+#endif
+
+#if TM_PROFILE_PITCHXCORR
+#define PITCHXCORR_START() PROFILE_START()
+#define PITCHXCORR_STOP() PROFILE_STOP()
+#else
+#define PITCHXCORR_START()
+#define PITCHXCORR_STOP()
+#endif
+
+#if TM_PROFILE_COMPUTEPITCHERROR
+#define COMPUTEPITCHERROR_START() PROFILE_START()
+#define COMPUTEPITCHERROR_STOP() PROFILE_STOP()
+#else
+#define COMPUTEPITCHERROR_START()
+#define COMPUTEPITCHERROR_STOP()
+#endif
+
+#if TM_PROFILE_PITCHGAINSEARCH3TAPVQ
+#define PITCHGAINSEARCH3TAPVQ_START() PROFILE_START()
+#define PITCHGAINSEARCH3TAPVQ_STOP() PROFILE_STOP()
+#else
+#define PITCHGAINSEARCH3TAPVQ_START()
+#define PITCHGAINSEARCH3TAPVQ_STOP()
+#endif
+
+#if TM_PROFILE_OPENLOOPNBESTPITCH
+#define OPENLOOPNBESTPITCH_START() PROFILE_START()
+#define OPENLOOPNBESTPITCH_STOP() PROFILE_STOP()
+#else
+#define OPENLOOPNBESTPITCH_START()
+#define OPENLOOPNBESTPITCH_STOP()
+#endif
+
+
+#if TM_PROFILE_LSP_INTERPOLATE
+#define LSPINTERPOLATE_START() PROFILE_START()
+#define LSPINTERPOLATE_STOP() PROFILE_STOP()
+#else
+#define LSPINTERPOLATE_START()
+#define LSPINTERPOLATE_STOP()
+#endif
+
+#if TM_PROFILE_CHEBPOLYEVA
+#define CHEBPOLYEVA_START() PROFILE_START()
+#define CHEBPOLYEVA_STOP() PROFILE_STOP()
+#else
+#define CHEBPOLYEVA_START()
+#define CHEBPOLYEVA_STOP()
+#endif
+
+
+#if TM_PROFILE_COMPUTEQUANTWEIGHTS
+#define COMPUTEQUANTWEIGHTS_START() PROFILE_START()
+#define COMPUTEQUANTWEIGHTS_STOP() PROFILE_STOP()
+#else
+#define COMPUTEQUANTWEIGHTS_START()
+#define COMPUTEQUANTWEIGHTS_STOP()
+#endif
+
+#if TM_PROFILE_LSPQUANT
+#define LSPQUANT_START() PROFILE_START()
+#define LSPQUANT_STOP() PROFILE_STOP()
+#else
+#define LSPQUANT_START()
+#define LSPQUANT_STOP()
+#endif
+
+#if TM_PROFILE_LSPWEIGHTQUANT
+#define LSPWEIGHTQUANT_START() PROFILE_START()
+#define LSPWEIGHTQUANT_STOP() PROFILE_STOP()
+#else
+#define LSPWEIGHTQUANT_START()
+#define LSPWEIGHTQUANT_STOP()
+#endif
+
+#if TM_PROFILE_FIRMEM16
+#define FIRMEM16_START() PROFILE_START()
+#define FIRMEM16_STOP() PROFILE_STOP()
+#else
+#define FIRMEM16_START()
+#define FIRMEM16_STOP()
+#endif
+
+#if TM_PROFILE_IIRMEM16
+#define IIRMEM16_START() PROFILE_START()
+#define IIRMEM16_STOP() PROFILE_STOP()
+#else
+#define IIRMEM16_START()
+#define IIRMEM16_STOP()
+#endif
+
+#if TM_PROFILE_FILTERMEM16
+#define FILTERMEM16_START() PROFILE_START()
+#define FILTERMEM16_STOP() PROFILE_STOP()
+#else
+#define FILTERMEM16_START()
+#define FILTERMEM16_STOP()
+#endif
+
+#if TM_PROFILE_COMPUTERMS16
+#define COMPUTERMS16_START() PROFILE_START()
+#define COMPUTERMS16_STOP() PROFILE_STOP()
+#else
+#define COMPUTERMS16_START()
+#define COMPUTERMS16_STOP()
+#endif
+
+#if TM_PROFILE_NORMALIZE16
+#define NORMALIZE16_START() PROFILE_START()
+#define NORMALIZE16_STOP() PROFILE_STOP()
+#else
+#define NORMALIZE16_START()
+#define NORMALIZE16_STOP()
+#endif
+
+#if TM_PROFILE_BWLPC
+#define BWLPC_START() PROFILE_START()
+#define BWLPC_STOP() PROFILE_STOP()
+#else
+#define BWLPC_START()
+#define BWLPC_STOP()
+#endif
+
+#if TM_PROFILE_HIGHPASS
+#define HIGHPASS_START() PROFILE_START()
+#define HIGHPASS_STOP() PROFILE_STOP()
+#else
+#define HIGHPASS_START()
+#define HIGHPASS_STOP()
+#endif
+
+#if TM_PROFILE_SIGNALMUL
+#define SIGNALMUL_START() PROFILE_START()
+#define SIGNALMUL_STOP() PROFILE_STOP()
+#else
+#define SIGNALMUL_START()
+#define SIGNALMUL_STOP()
+#endif
+
+#if TM_PROFILE_SIGNALDIV
+#define SIGNALDIV_START() PROFILE_START()
+#define SIGNALDIV_STOP() PROFILE_STOP()
+#else
+#define SIGNALDIV_START()
+#define SIGNALDIV_STOP()
+#endif
+
+#if TM_PROFILE_COMPUTEIMPULSERESPONSE
+#define COMPUTEIMPULSERESPONSE_START() PROFILE_START()
+#define COMPUTEIMPULSERESPONSE_STOP() PROFILE_STOP()
+#else
+#define COMPUTEIMPULSERESPONSE_START()
+#define COMPUTEIMPULSERESPONSE_STOP()
+#endif
+
+#if TM_PROFILE_COMPUTEWEIGHTEDCODEBOOK
+#define COMPUTEWEIGHTEDCODEBOOK_START() PROFILE_START()
+#define COMPUTEWEIGHTEDCODEBOOK_STOP() PROFILE_STOP()
+#else
+#define COMPUTEWEIGHTEDCODEBOOK_START()
+#define COMPUTEWEIGHTEDCODEBOOK_STOP()
+#endif
+
+#if TM_PROFILE_TARGETUPDATE
+#define TARGETUPDATE_START() PROFILE_START()
+#define TARGETUPDATE_STOP() PROFILE_STOP()
+#else
+#define TARGETUPDATE_START()
+#define TARGETUPDATE_STOP()
+#endif
+
+
+#if TM_PROFILE_VQNBEST
+#define VQNBEST_START() PROFILE_START()
+#define VQNBEST_STOP() PROFILE_STOP()
+#else
+#define VQNBEST_START()
+#define VQNBEST_STOP()
+#endif
+
+#if TM_PROFILE_VQNBESTSIGN
+#define VQNBESTSIGN_START() PROFILE_START()
+#define VQNBESTSIGN_STOP() PROFILE_STOP()
+#else
+#define VQNBESTSIGN_START()
+#define VQNBESTSIGN_STOP()
+#endif
+
+#if TM_PROFILE_PREPROCESSANALYSIS
+#define PREPROCESSANAYLSIS_START() PROFILE_START()
+#define PREPROCESSANAYLSIS_STOP() PROFILE_STOP()
+#else
+#define PREPROCESSANAYLSIS_START()
+#define PREPROCESSANAYLSIS_STOP()
+#endif
+
+#if TM_PROFILE_UPDATENOISEPROB
+#define UPDATENOISEPROB_START() PROFILE_START()
+#define UPDATENOISEPROB_STOP() PROFILE_STOP()
+#else
+#define UPDATENOISEPROB_START()
+#define UPDATENOISEPROB_STOP()
+#endif
+
+#if TM_PROFILE_COMPUTEGAINFLOOR
+#define COMPUTEGAINFLOOR_START() PROFILE_START()
+#define COMPUTEGAINFLOOR_STOP() PROFILE_STOP()
+#else
+#define COMPUTEGAINFLOOR_START()
+#define COMPUTEGAINFLOOR_STOP()
+#endif
+
+#if TM_PROFILE_FILTERDCNOTCH16
+#define FILTERDCNOTCH16_START() PROFILE_START()
+#define FILTERDCNOTCH16_STOP() PROFILE_STOP()
+#else
+#define FILTERDCNOTCH16_START()
+#define FILTERDCNOTCH16_STOP()
+#endif
+
+#if TM_PROFILE_MDFINNERPROD
+#define MDFINNERPROD_START() PROFILE_START()
+#define MDFINNERPROD_STOP() PROFILE_STOP()
+#else
+#define MDFINNERPROD_START()
+#define MDFINNERPROD_STOP()
+#endif
+
+#if TM_PROFILE_SPECTRALMULACCUM
+#define SPECTRALMULACCUM_START() PROFILE_START()
+#define SPECTRALMULACCUM_STOP() PROFILE_STOP()
+#else
+#define SPECTRALMULACCUM_START()
+#define SPECTRALMULACCUM_STOP()
+#endif
+
+#if TM_PROFILE_WEIGHTEDSPECTRALMULCONJ
+#define WEIGHTEDSPECTRALMULCONJ_START() PROFILE_START()
+#define WEIGHTEDSPECTRALMULCONJ_STOP() PROFILE_STOP()
+#else
+#define WEIGHTEDSPECTRALMULCONJ_START()
+#define WEIGHTEDSPECTRALMULCONJ_STOP()
+#endif
+
+#if TM_PROFILE_MDFADJUSTPROP
+#define MDFADJUSTPROP_START() PROFILE_START()
+#define MDFADJUSTPROP_STOP() PROFILE_STOP()
+#else
+#define MDFADJUSTPROP_START()
+#define MDFADJUSTPROP_STOP()
+#endif
+
+#if TM_PROFILE_SPEEXECHOGETRESIDUAL
+#define SPEEXECHOGETRESIDUAL_START() PROFILE_START()
+#define SPEEXECHOGETRESIDUAL_STOP() PROFILE_STOP()
+#else
+#define SPEEXECHOGETRESIDUAL_START()
+#define SPEEXECHOGETRESIDUAL_STOP()
+#endif
+
+#if TM_PROFILE_LSPENFORCEMARGIN
+#define LSPENFORCEMARGIN_START() PROFILE_START()
+#define LSPENFORCEMARGIN_STOP() PROFILE_STOP()
+#else
+#define LSPENFORCEMARGIN_START()
+#define LSPENFORCEMARGIN_STOP()
+#endif
+
+#if TM_PROFILE_LSPTOLPC
+#define LSPTOLPC_START() PROFILE_START()
+#define LSPTOLPC_STOP() PROFILE_STOP()
+#else
+#define LSPTOLPC_START()
+#define LSPTOLPC_STOP()
+#endif
+
+#if TM_PROFILE_MAXIMIZERANGE
+#define MAXIMIZERANGE_START() PROFILE_START()
+#define MAXIMIZERANGE_STOP() PROFILE_STOP()
+#else
+#define MAXIMIZERANGE_START()
+#define MAXIMIZERANGE_STOP()
+#endif
+
+#if TM_PROFILE_RENORMRANGE
+#define RENORMRANGE_START() PROFILE_START()
+#define RENORMRANGE_STOP() PROFILE_STOP()
+#else
+#define RENORMRANGE_START()
+#define RENORMRANGE_STOP()
+#endif
+
+#if TM_PROFILE_POWERSPECTRUM
+#define POWERSPECTRUM_START() PROFILE_START()
+#define POWERSPECTRUM_STOP() PROFILE_STOP()
+#else
+#define POWERSPECTRUM_START()
+#define POWERSPECTRUM_STOP()
+#endif
+
+#if TM_PROFILE_QMFSYNTH
+#define QMFSYNTH_START() PROFILE_START()
+#define QMFSYNTH_STOP() PROFILE_STOP()
+#else
+#define QMFSYNTH_START()
+#define QMFSYNTH_STOP()
+#endif
+
+#if TM_PROFILE_QMFDECOMP
+#define QMFDECOMP_START() PROFILE_START()
+#define QMFDECOMP_STOP() PROFILE_STOP()
+#else
+#define QMFDECOMP_START()
+#define QMFDECOMP_STOP()
+#endif
+
+#if TM_PROFILE_FILTERBANKCOMPUTEBANK32
+#define FILTERBANKCOMPUTEBANK32_START() PROFILE_START()
+#define FILTERBANKCOMPUTEBANK32_STOP() PROFILE_STOP()
+#else
+#define FILTERBANKCOMPUTEBANK32_START()
+#define FILTERBANKCOMPUTEBANK32_STOP()
+#endif
+
+#if TM_PROFILE_FILTERBANKCOMPUTEPSD16
+#define FILTERBANKCOMPUTEPSD16_START() PROFILE_START()
+#define FILTERBANKCOMPUTEPSD16_STOP() PROFILE_STOP()
+#else
+#define FILTERBANKCOMPUTEPSD16_START()
+#define FILTERBANKCOMPUTEPSD16_STOP()
+#endif
+
+
diff --git a/tmv/quant_lsp_tm.h b/tmv/quant_lsp_tm.h
new file mode 100644
index 0000000..e704eae
--- /dev/null
+++ b/tmv/quant_lsp_tm.h
@@ -0,0 +1,448 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file quant_lsp_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+#define OVERRIDE_COMPUTE_QUANT_WEIGHTS
+static void compute_quant_weights(Int16 *qlsp, Int16 *qw, int order)
+{
+ int qlspi, qlspii;
+ int w1, w2;
+ int i;
+
+ TMDEBUG_ALIGNMEM(qlsp);
+ TMDEBUG_ALIGNMEM(qw);
+
+ COMPUTEQUANTWEIGHTS_START();
+
+ --order;
+
+ qlspi = (int)qlsp[0];
+ qlspii = (int)qlsp[1];
+ w1 = qlspi;
+ w2 = qlspii - qlspi;
+
+ qw[0] = 81920 / (300 + imin(w1,w2));
+
+ for ( i=1 ; i<order ; ++i )
+ { qlspi = qlspii;
+ qlspii = qlsp[i+1];
+
+ w1 = w2;
+ w2 = qlspii - qlspi;
+
+ qw[i] = 81920 / (300 + imin(w1,w2));
+ }
+
+ w1 = LSP_PI - qlspii;
+ qw[i] = 81920 / (300 + imin(w1,w2));
+
+ COMPUTEQUANTWEIGHTS_STOP();
+}
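For reference, a plain-C sketch of what the unrolled routine above computes: each weight is derived from the spacing of adjacent quantized LSPs, with LSP_PI closing the interval at the top (Int16 and LSP_PI as used above).

/* Reference form: qw[i] = 81920 / (300 + min(left gap, right gap)). */
static void compute_quant_weights_ref(const Int16 *qlsp, Int16 *qw, int order)
{
   int i;
   for ( i=0 ; i<order ; ++i )
   {  int w1 = (i == 0)         ? qlsp[0]          : qlsp[i]   - qlsp[i-1];
      int w2 = (i == order - 1) ? LSP_PI - qlsp[i] : qlsp[i+1] - qlsp[i];
      qw[i] = 81920 / (300 + (w1 < w2 ? w1 : w2));
   }
}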
+
+
+
+#define OVERRIDE_LSP_QUANT
+static int lsp_quant(Int16 *x, const signed char *cdbk, int nbVec, int nbDim)
+{
+ register int best_dist=VERY_LARGE32;
+ register int best_id=0;
+ register int i, j;
+ register int dt0, dt1, dt2, dt3;
+ register int cb0, cb1, cb2, cb3, xx;
+ register int ptr_inc = nbDim * 3;
+ register int five = 5;
+ const signed char *ptr;
+
+ TMDEBUG_ALIGNMEM(x);
+
+ LSPQUANT_START();
+
+ for ( i=0, ptr=cdbk ; i<nbVec ; i+=4, ptr += ptr_inc )
+ { dt3 = dt2 = dt1 = dt0 = 0;
+
+ for ( j=0 ; j <nbDim ; j += 2 )
+ {
+ xx = ld32x(x,j>>1);
+ cb0 = pack16lsb((int)ptr[1], (int)ptr[0]);
+ cb0 = dualasl(cb0,five);
+ cb0 = dspidualsub(xx,cb0);
+ dt0 += ifir16(cb0,cb0);
+
+ cb1 = pack16lsb((int)ptr[nbDim+1], (int)ptr[nbDim]);
+ cb1 = dualasl(cb1,five);
+ cb1 = dspidualsub(xx,cb1);
+ dt1 += ifir16(cb1, cb1);
+
+ cb2 = pack16lsb((int)ptr[nbDim*2+1], (int)ptr[nbDim*2]);
+ cb2 = dualasl(cb2,five);
+ cb2 = dspidualsub(xx,cb2);
+ dt2 += ifir16(cb2, cb2);
+
+ cb3 = pack16lsb((int)ptr[nbDim*3+1], (int)ptr[nbDim*3]);
+ cb3 = dualasl(cb3,five);
+ cb3 = dspidualsub(xx,cb3);
+ dt3 += ifir16(cb3, cb3);
+
+ ptr += 2;
+ }
+
+ if ( dt0<best_dist )
+ { best_dist = dt0;
+ best_id = i;
+ }
+
+ if ( dt1<best_dist )
+ { best_dist = dt1;
+ best_id = i+1;
+ }
+
+ if ( dt2<best_dist )
+ { best_dist = dt2;
+ best_id = i+2;
+ }
+
+ if ( dt3<best_dist )
+ { best_dist = dt3;
+ best_id = i+3;
+ }
+ }
+
+ for ( j=0,ptr=cdbk+best_id*nbDim ; j<nbDim ; j+=2 )
+ { xx = ld32x(x,j>>1);
+ cb0 = pack16lsb((int)ptr[j+1], (int)ptr[j]);
+ cb0 = dualasl(cb0,five);
+ dt0 = dspidualsub(xx,cb0);
+ st32d(j<<1, x, dt0);
+ }
+
+ LSPQUANT_STOP();
+ return best_id;
+}
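The packed operations above (pack16lsb pairs two 16-bit values into one word, dualasl shifts both halves left, dspidualsub subtracts both halves, ifir16 is a dual 16-bit inner product) evaluate four codebook rows per outer iteration. A scalar sketch of the same search, kept close to the generic Speex routine, may make the intent easier to follow:

static int lsp_quant_ref(Int16 *x, const signed char *cdbk, int nbVec, int nbDim)
{
   int i, j, best_id = 0;
   int best_dist = VERY_LARGE32;

   for ( i=0 ; i<nbVec ; ++i )
   {  int dist = 0;
      for ( j=0 ; j<nbDim ; ++j )
      {  int d = x[j] - (cdbk[i*nbDim+j] << 5);  /* codebook entries are stored scaled down by 32 */
         dist += d * d;
      }
      if ( dist<best_dist )
      {  best_dist = dist;
         best_id = i;
      }
   }

   for ( j=0 ; j<nbDim ; ++j )                   /* leave the quantization residual in x */
      x[j] -= (Int16)(cdbk[best_id*nbDim+j] << 5);

   return best_id;
}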
+
+
+#define OVERRIDE_LSP_WEIGHT_QUANT
+static int lsp_weight_quant(Int16 *x, Int16 *weight, const signed char *cdbk, int nbVec, int nbDim)
+{
+ register int best_dist=VERY_LARGE32;
+ register int best_id=0;
+ register int i, j;
+ register int dt1, dt2, dt3, dt4;
+ register int cb1, cb2, cb3, cb4, wt, xx;
+ register int ptr_inc = nbDim * 3;
+ const signed char *ptr;
+
+ LSPWEIGHTQUANT_START();
+
+ for ( i=0, ptr=cdbk ; i<nbVec ; i+=4, ptr += ptr_inc )
+ { dt4 = dt3 = dt2 = dt1 = 0;
+
+ for ( j=0 ; j<nbDim ; ++j )
+ { wt = weight[j];
+ xx = x[j];
+
+ cb1 = xx - (ptr[0] << 5);
+ cb2 = xx - (ptr[nbDim] << 5);
+ cb3 = xx - (ptr[nbDim*2] << 5);
+ cb4 = xx - (ptr[nbDim*3] << 5);
+
+ ++ptr;
+
+ cb1 *= cb1;
+ cb2 *= cb2;
+ cb3 *= cb3;
+ cb4 *= cb4;
+
+ dt1 += (wt * (cb1 >> 15)) + ((wt * (cb1 & 0x7fff)) >> 15);
+ dt2 += (wt * (cb2 >> 15)) + ((wt * (cb2 & 0x7fff)) >> 15);
+ dt3 += (wt * (cb3 >> 15)) + ((wt * (cb3 & 0x7fff)) >> 15);
+ dt4 += (wt * (cb4 >> 15)) + ((wt * (cb4 & 0x7fff)) >> 15);
+
+ }
+
+ if ( dt1<best_dist )
+ { best_dist = dt1;
+ best_id = i;
+ }
+
+ if ( dt2<best_dist )
+ { best_dist = dt2;
+ best_id = i+1;
+ }
+
+ if ( dt3<best_dist )
+ { best_dist = dt3;
+ best_id = i+2;
+ }
+
+ if ( dt4<best_dist )
+ { best_dist = dt4;
+ best_id = i+3;
+ }
+ }
+
+ for ( j=0 ; j<nbDim ; ++j )
+ { x[j] = x[j] - ((Int16)cdbk[best_id*nbDim+j] << 5);
+ }
+
+ LSPWEIGHTQUANT_STOP();
+
+ return best_id;
+}
+
+#if 0
+// TODO: unroll loops
+#define OVERRIDE_LSP_QUANT_NB
+void lsp_quant_nb(spx_lsp_t *lsp, spx_lsp_t *qlsp, int order, SpeexBits *bits)
+{
+ int i;
+ int id;
+ spx_word16_t quant_weight[10];
+
+ for (i=0;i<order;i++)
+ qlsp[i]=lsp[i];
+
+ compute_quant_weights(qlsp, quant_weight, order);
+
+ for (i=0;i<order;i++)
+ qlsp[i]=SUB16(qlsp[i],LSP_LINEAR(i));
+
+#ifndef FIXED_POINT
+ for (i=0;i<order;i++)
+ qlsp[i] = LSP_SCALE*qlsp[i];
+#endif
+ id = lsp_quant(qlsp, cdbk_nb, NB_CDBK_SIZE, order);
+ speex_bits_pack(bits, id, 6);
+
+ for (i=0;i<order;i++)
+ qlsp[i]*=2;
+
+ id = lsp_weight_quant(qlsp, quant_weight, cdbk_nb_low1, NB_CDBK_SIZE_LOW1, 5);
+ speex_bits_pack(bits, id, 6);
+
+ for (i=0;i<5;i++)
+ qlsp[i]*=2;
+
+ id = lsp_weight_quant(qlsp, quant_weight, cdbk_nb_low2, NB_CDBK_SIZE_LOW2, 5);
+ speex_bits_pack(bits, id, 6);
+
+ id = lsp_weight_quant(qlsp+5, quant_weight+5, cdbk_nb_high1, NB_CDBK_SIZE_HIGH1, 5);
+ speex_bits_pack(bits, id, 6);
+
+ for (i=5;i<10;i++)
+ qlsp[i]*=2;
+
+ id = lsp_weight_quant(qlsp+5, quant_weight+5, cdbk_nb_high2, NB_CDBK_SIZE_HIGH2, 5);
+ speex_bits_pack(bits, id, 6);
+
+#ifdef FIXED_POINT
+ for (i=0;i<order;i++)
+ qlsp[i]=PSHR16(qlsp[i],2);
+#else
+ for (i=0;i<order;i++)
+ qlsp[i]=qlsp[i] * .00097656;
+#endif
+
+ for (i=0;i<order;i++)
+ qlsp[i]=lsp[i]-qlsp[i];
+}
+
+
+#define OVERRIDE_LSP_UNQUANT_NB
+void lsp_unquant_nb(spx_lsp_t *lsp, int order, SpeexBits *bits)
+{
+ int i, id;
+ for (i=0;i<order;i++)
+ lsp[i]=LSP_LINEAR(i);
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<10;i++)
+ lsp[i] = ADD32(lsp[i], LSP_DIV_256(cdbk_nb[id*10+i]));
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<5;i++)
+ lsp[i] = ADD16(lsp[i], LSP_DIV_512(cdbk_nb_low1[id*5+i]));
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<5;i++)
+ lsp[i] = ADD32(lsp[i], LSP_DIV_1024(cdbk_nb_low2[id*5+i]));
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<5;i++)
+ lsp[i+5] = ADD32(lsp[i+5], LSP_DIV_512(cdbk_nb_high1[id*5+i]));
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<5;i++)
+ lsp[i+5] = ADD32(lsp[i+5], LSP_DIV_1024(cdbk_nb_high2[id*5+i]));
+}
+
+#define OVERRIDE_LSP_QUANT_LBR
+void lsp_quant_lbr(spx_lsp_t *lsp, spx_lsp_t *qlsp, int order, SpeexBits *bits)
+{
+ int i;
+ int id;
+ spx_word16_t quant_weight[10];
+
+ for (i=0;i<order;i++)
+ qlsp[i]=lsp[i];
+
+ compute_quant_weights(qlsp, quant_weight, order);
+
+ for (i=0;i<order;i++)
+ qlsp[i]=SUB16(qlsp[i],LSP_LINEAR(i));
+#ifndef FIXED_POINT
+ for (i=0;i<order;i++)
+ qlsp[i]=qlsp[i]*LSP_SCALE;
+#endif
+ id = lsp_quant(qlsp, cdbk_nb, NB_CDBK_SIZE, order);
+ speex_bits_pack(bits, id, 6);
+
+ for (i=0;i<order;i++)
+ qlsp[i]*=2;
+
+ id = lsp_weight_quant(qlsp, quant_weight, cdbk_nb_low1, NB_CDBK_SIZE_LOW1, 5);
+ speex_bits_pack(bits, id, 6);
+
+ id = lsp_weight_quant(qlsp+5, quant_weight+5, cdbk_nb_high1, NB_CDBK_SIZE_HIGH1, 5);
+ speex_bits_pack(bits, id, 6);
+
+#ifdef FIXED_POINT
+ for (i=0;i<order;i++)
+ qlsp[i] = PSHR16(qlsp[i],1);
+#else
+ for (i=0;i<order;i++)
+ qlsp[i] = qlsp[i]*0.0019531;
+#endif
+
+ for (i=0;i<order;i++)
+ qlsp[i]=lsp[i]-qlsp[i];
+}
+
+#define OVERRIDE_LSP_UNQUANT_LBR
+void lsp_unquant_lbr(spx_lsp_t *lsp, int order, SpeexBits *bits)
+{
+ int i, id;
+ for (i=0;i<order;i++)
+ lsp[i]=LSP_LINEAR(i);
+
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<10;i++)
+ lsp[i] += LSP_DIV_256(cdbk_nb[id*10+i]);
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<5;i++)
+ lsp[i] += LSP_DIV_512(cdbk_nb_low1[id*5+i]);
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<5;i++)
+ lsp[i+5] += LSP_DIV_512(cdbk_nb_high1[id*5+i]);
+
+}
+
+extern const signed char high_lsp_cdbk[];
+extern const signed char high_lsp_cdbk2[];
+
+#define OVERRIDE_LSP_UNQUANT_HIGH
+void lsp_unquant_high(spx_lsp_t *lsp, int order, SpeexBits *bits)
+{
+
+ int i, id;
+ for (i=0;i<order;i++)
+ lsp[i]=LSP_LINEAR_HIGH(i);
+
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<order;i++)
+ lsp[i] += LSP_DIV_256(high_lsp_cdbk[id*order+i]);
+
+
+ id=speex_bits_unpack_unsigned(bits, 6);
+ for (i=0;i<order;i++)
+ lsp[i] += LSP_DIV_512(high_lsp_cdbk2[id*order+i]);
+}
+
+#define OVERRIDE_LSP_QUANT_HIGH
+void lsp_quant_high(spx_lsp_t *lsp, spx_lsp_t *qlsp, int order, SpeexBits *bits)
+{
+ int i;
+ int id;
+ spx_word16_t quant_weight[10];
+
+ for (i=0;i<order;i++)
+ qlsp[i]=lsp[i];
+
+ compute_quant_weights(qlsp, quant_weight, order);
+
+ /* quant_weight[0] = 10/(qlsp[1]-qlsp[0]);
+ quant_weight[order-1] = 10/(qlsp[order-1]-qlsp[order-2]);
+ for (i=1;i<order-1;i++)
+ {
+ tmp1 = 10/(qlsp[i]-qlsp[i-1]);
+ tmp2 = 10/(qlsp[i+1]-qlsp[i]);
+ quant_weight[i] = tmp1 > tmp2 ? tmp1 : tmp2;
+ }*/
+
+ for (i=0;i<order;i++)
+ qlsp[i]=SUB16(qlsp[i],LSP_LINEAR_HIGH(i));
+#ifndef FIXED_POINT
+ for (i=0;i<order;i++)
+ qlsp[i] = qlsp[i]*LSP_SCALE;
+#endif
+ id = lsp_quant(qlsp, high_lsp_cdbk, 64, order);
+ speex_bits_pack(bits, id, 6);
+
+ for (i=0;i<order;i++)
+ qlsp[i]*=2;
+
+ id = lsp_weight_quant(qlsp, quant_weight, high_lsp_cdbk2, 64, order);
+ speex_bits_pack(bits, id, 6);
+
+#ifdef FIXED_POINT
+ for (i=0;i<order;i++)
+ qlsp[i] = PSHR16(qlsp[i],1);
+#else
+ for (i=0;i<order;i++)
+ qlsp[i] = qlsp[i]*0.0019531;
+#endif
+
+ for (i=0;i<order;i++)
+ qlsp[i]=lsp[i]-qlsp[i];
+}
+#endif
+
+#endif
diff --git a/tmv/speex_config_types.h b/tmv/speex_config_types.h
new file mode 100644
index 0000000..e63303f
--- /dev/null
+++ b/tmv/speex_config_types.h
@@ -0,0 +1,31 @@
+#ifndef __SPEEX_TYPES_H__
+#define __SPEEX_TYPES_H__
+
+#ifdef __TCS__
+
+#include <tmNxTypes.h>
+
+
+
+typedef Int16 spx_int16_t;
+typedef UInt16 spx_uint16_t;
+typedef Int32 spx_int32_t;
+typedef UInt32 spx_uint32_t;
+
+#ifdef FIXED_POINT
+#define VMUX(a,b,c) mux((a),(b),(c))
+#define VABS(a) iabs((a))
+#define VMAX(a,b) imax((a),(b))
+#define VMIN(a,b) imin((a),(b))
+#else
+#define VMUX(a,b,c) fmux((a),(b),(c))
+#define VABS(a) fabs((a))
+#define VMAX(a,b) fmax((a),(b))
+#define VMIN(a,b) fmin((a),(b))
+#endif
+
+#endif
+
+
+#endif
+
diff --git a/tmv/vq_tm.h b/tmv/vq_tm.h
new file mode 100644
index 0000000..c8c6109
--- /dev/null
+++ b/tmv/vq_tm.h
@@ -0,0 +1,494 @@
+/* Copyright (C) 2007 Hong Zhiqian */
+/**
+ @file vq_tm.h
+ @author Hong Zhiqian
+ @brief Various compatibility routines for Speex (TriMedia version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <ops/custom_defs.h>
+#include "profile_tm.h"
+
+#ifdef FIXED_POINT
+
+inline void vq_nbest_dist(int i, int N, int dist, int *used, int *nbest, Int32 *best_dist)
+{
+ register int k;
+
+ if (i<N || dist<best_dist[N-1])
+ {
+ for (k=N-1; (k >= 1) && (k > *used || dist < best_dist[k-1]); k--)
+ { best_dist[k]=best_dist[k-1];
+ nbest[k] = nbest[k-1];
+ }
+
+ best_dist[k]=dist;
+ nbest[k]=i;
+ (*used)++; /* bump the number of filled slots, not the pointer */
+ }
+}
+
+void vq_nbest_5(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int in10, in32, in4;
+ int used = 0;
+
+ in10 = pack16lsb(in[1],in[0]); /* Note: memory is not aligned here */
+ in32 = pack16lsb(in[3],in[2]);
+ in4 = sex16(in[4]);
+
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=0,j=0 ; i<entries ; i+=2,j+=5 )
+ {
+ register int dist1, dist2;
+ register int cb10, cb32, cb54, cb76, cb98, cb87, cb65;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+ cb98 = ld32x(codebook,j+4);
+
+ dist1 = sex16(cb54) * in4;
+ dist1 += ifir16(in10,cb10) + ifir16(in32,cb32);
+ dist1 = (E[i] >> 1) - dist1;
+
+ cb65 = funshift2(cb76,cb54);
+ cb87 = funshift2(cb98,cb76);
+ dist2 = asri(16,cb98) * in4;
+ dist2 += ifir16(in10,cb65) + ifir16(in32,cb87);
+ dist2 = (E[i+1] >> 1) - dist2;
+
+ vq_nbest_dist(i,N,dist1,&used,nbest,best_dist);
+ vq_nbest_dist(i+1,N,dist2,&used,nbest,best_dist);
+ }
+
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
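All of the vq_nbest_* kernels in this file rank codewords by dist = E[i]/2 - <in, c_i>, where E[i] is assumed, as in the generic Speex vq code, to hold the precomputed energy (sum of squares) of codeword c_i. For a fixed input x this is equivalent to a Euclidean nearest-neighbour search, since

\[
\|x - c_i\|^2 \;=\; \|x\|^2 - 2\langle x, c_i\rangle + E_i
            \;=\; \|x\|^2 + 2\Bigl(\tfrac{E_i}{2} - \langle x, c_i\rangle\Bigr),
\]

and the \(\|x\|^2\) term is common to every entry, so minimizing \(E_i/2 - \langle x, c_i\rangle\) selects the same N best codewords as the true squared distance.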
+
+
+void vq_nbest_8(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int in10, in32, in54, in76;
+ int used = 0;
+
+ in10 = pack16lsb(in[1],in[0]); /* Note: memory is not aligned here */
+ in32 = pack16lsb(in[3],in[2]);
+ in54 = pack16lsb(in[5],in[4]);
+ in76 = pack16lsb(in[7],in[6]);
+
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0 ; i<entries ; ++i,j+=4 )
+ {
+ register int dist;
+ register int cb10, cb32, cb54, cb76;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+
+ dist = ifir16(in10,cb10) + ifir16(in32,cb32);
+ dist += ifir16(in54,cb54) + ifir16(in76,cb76);
+ dist = (E[i] >> 1) - dist;
+
+ vq_nbest_dist(i,N,dist,&used,nbest,best_dist);
+ }
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+
+}
+
+
+void vq_nbest_10(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int in10, in32, in54, in76, in98;
+ int used = 0;
+
+ in10 = pack16lsb(in[1],in[0]);
+ in32 = pack16lsb(in[3],in[2]);
+ in54 = pack16lsb(in[5],in[4]);
+ in76 = pack16lsb(in[7],in[6]);
+ in98 = pack16lsb(in[9],in[8]);
+
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0 ; i<entries ; ++i,j+=5 )
+ {
+ register int dist;
+ register int cb10, cb32, cb54, cb76, cb98;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+ cb98 = ld32x(codebook,j+4);
+
+ dist = ifir16(in10,cb10) + ifir16(in32,cb32);
+ dist += ifir16(in54,cb54) + ifir16(in76,cb76);
+ dist += ifir16(in98,cb98);
+ dist = (E[i] >> 1) - dist;
+
+ vq_nbest_dist(i,N,dist,&used,nbest,best_dist);
+ }
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+void vq_nbest_20(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int in10, in32, in54, in76, in98, in_10, in_32, in_54, in_76, in_98;
+ int used = 0;
+
+ in10 = pack16lsb(in[1],in[0]); /* Note: memory is not aligned here */
+ in32 = pack16lsb(in[3],in[2]);
+ in54 = pack16lsb(in[5],in[4]);
+ in76 = pack16lsb(in[7],in[6]);
+ in98 = pack16lsb(in[9],in[8]);
+ in_10 = pack16lsb(in[11],in[10]);
+ in_32 = pack16lsb(in[13],in[12]);
+ in_54 = pack16lsb(in[15],in[14]);
+ in_76 = pack16lsb(in[17],in[16]);
+ in_98 = pack16lsb(in[19],in[18]);
+
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0 ; i<entries ; ++i,j+=10 )
+ {
+ register int dist;
+ register int cb10, cb32, cb54, cb76, cb98, cb_10, cb_32, cb_54, cb_76, cb_98;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+ cb98 = ld32x(codebook,j+4);
+ cb_10 = ld32x(codebook,j+5);
+ cb_32 = ld32x(codebook,j+6);
+ cb_54 = ld32x(codebook,j+7);
+ cb_76 = ld32x(codebook,j+8);
+ cb_98 = ld32x(codebook,j+9);
+
+ dist = ifir16(in10,cb10) + ifir16(in32,cb32);
+ dist += ifir16(in54,cb54) + ifir16(in76,cb76);
+ dist += ifir16(in98,cb98) + ifir16(in_10,cb_10);
+ dist += ifir16(in_32,cb_32) + ifir16(in_54,cb_54);
+ dist += ifir16(in_76,cb_76) + ifir16(in_98,cb_98);
+
+ dist = (E[i] >> 1) - dist;
+ vq_nbest_dist(i,N,dist,&used,nbest,best_dist);
+ }
+#if (TM_UNROLL && TM_UNROLL_VQNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+
+#define OVERRIDE_VQ_NBEST
+void vq_nbest (Int16 *in, const Int16 *codebook, int len, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist, char *stack)
+{
+ TMDEBUG_ALIGNMEM(codebook);
+
+ VQNBEST_START();
+ if( len==5 )
+ vq_nbest_5(in,codebook,entries,E,N,nbest,best_dist);
+ else if ( len==8 )
+ vq_nbest_8(in,codebook,entries,E,N,nbest,best_dist);
+ else if ( len==10 )
+ vq_nbest_10(in,codebook,entries,E,N,nbest,best_dist);
+ else if ( len==20 )
+ vq_nbest_20(in,codebook,entries,E,N,nbest,best_dist);
+
+#ifndef REMARK_ON
+ (void)stack;
+#endif
+
+ VQNBEST_STOP();
+}
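A usage sketch of the dispatcher above, for a hypothetical 64-entry, 10-dimensional codebook whose per-row energies have been precomputed into E; the stack argument is unused in this override and is only kept for signature compatibility:

/* Hypothetical buffers; in Speex these come from the codebook tables. */
Int16  target[10];          /* vector to quantize                    */
Int16  codebook[64*10];     /* 64 rows of 10 coefficients            */
Int32  E[64];               /* E[i] = sum of squares of row i        */
int    nbest[3];            /* indices of the 3 closest rows         */
Int32  best_dist[3];        /* their distances, best first           */

void example_search(void)
{
   vq_nbest(target, codebook, 10, 64, E, 3, nbest, best_dist, (char *)0);
}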
+
+inline void vq_nbest_sign_dist(int i, int N, int dist, int sign, int entries, int *used, int *nbest, Int32 *best_dist)
+{
+ register int k;
+
+ if (i<N || dist<best_dist[N-1])
+ { for (k=N-1; (k >= 1) && (k > *used || dist < best_dist[k-1]); k--)
+ {
+ best_dist[k]=best_dist[k-1];
+ nbest[k] = nbest[k-1];
+ }
+
+ if ( sign ) i += entries;
+ best_dist[k]=dist;
+ (*used)++; /* bump the number of filled slots, not the pointer */
+ nbest[k] = i;
+ }
+}
+
+void vq_nbest_sign_5(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int in10, in32, in4;
+ int used = 0;
+
+ in10 = ld32(in);
+ in32 = ld32x(in,1);
+ in4 = sex16(in[4]);
+
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=0,j=0 ; i<entries ; i+=2,j+=5 )
+ {
+ register int dist1, dist2, sign1, sign2;
+ register int cb10, cb32, cb54, cb76, cb98, cb87, cb65;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+ cb98 = ld32x(codebook,j+4);
+
+ dist1 = sex16(cb54) * in4;
+ dist1 += ifir16(in10,cb10) + ifir16(in32,cb32);
+
+ sign1 = mux(dist1>0,0,1);
+ dist1 = iflip(dist1>0,dist1);
+ dist1 = (E[i] >> 1) + dist1;
+
+ cb65 = funshift2(cb76,cb54);
+ cb87 = funshift2(cb98,cb76);
+ dist2 = asri(16,cb98) * in4;
+ dist2 += ifir16(in10,cb65) + ifir16(in32,cb87);
+
+ sign2 = mux(dist2>0,0,1);
+ dist2 = iflip(dist2>0,dist2);
+ dist2 = (E[i+1] >> 1) + dist2; /* second lane uses the energy of entry i+1 */
+
+ vq_nbest_sign_dist(i,N,dist1,sign1,entries,&used,nbest,best_dist);
+ vq_nbest_sign_dist(i+1,N,dist2,sign2,entries,&used,nbest,best_dist);
+ }
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+void vq_nbest_sign_8(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int sign;
+ register int in10, in32, in54, in76;
+ int used = 0;
+
+ in10 = ld32(in);
+ in32 = ld32x(in,1);
+ in54 = ld32x(in,2);
+ in76 = ld32x(in,3);
+
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+
+ for ( i=0,j=0 ; i<entries ; ++i,j+=4 )
+ {
+ register int dist;
+ register int cb10, cb32, cb54, cb76;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+
+ dist = ifir16(in10,cb10) + ifir16(in32,cb32);
+ dist += ifir16(in54,cb54) + ifir16(in76,cb76);
+
+ sign = mux(dist>0,0,1);
+ dist = iflip(dist>0,dist);
+ dist = (E[i] >> 1) + dist;
+
+ vq_nbest_sign_dist(i,N,dist,sign,entries,&used,nbest,best_dist);
+ }
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+void vq_nbest_sign_10(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int sign;
+ register int in10, in32, in54, in76, in98;
+ int used = 0;
+
+ in10 = ld32(in);
+ in32 = ld32x(in,1);
+ in54 = ld32x(in,2);
+ in76 = ld32x(in,3);
+ in98 = ld32x(in,4);
+
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unroll=4
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0 ; i<entries ; ++i,j+=5 )
+ {
+ register int dist;
+ register int cb10, cb32, cb54, cb76, cb98;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+ cb98 = ld32x(codebook,j+4);
+
+ dist = ifir16(in10,cb10) + ifir16(in32,cb32);
+ dist += ifir16(in54,cb54) + ifir16(in76,cb76);
+ dist += ifir16(in98,cb98);
+
+ sign = mux(dist>0,0,1);
+ dist = iflip(dist>0,dist);
+ dist = (E[i] >> 1) + dist;
+
+ vq_nbest_sign_dist(i,N,dist,sign,entries,&used,nbest,best_dist);
+ }
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+void vq_nbest_sign_20(Int16 *in, const Int16 *codebook, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist)
+{
+ register int i, j;
+ register int sign;
+ register int in10, in32, in54, in76, in98, in_10, in_32, in_54, in_76, in_98;
+ int used = 0;
+
+ in10 = ld32(in);
+ in32 = ld32x(in,1);
+ in54 = ld32x(in,2);
+ in76 = ld32x(in,3);
+ in98 = ld32x(in,4);
+ in_10 = ld32x(in,5);
+ in_32 = ld32x(in,6);
+ in_54 = ld32x(in,7);
+ in_76 = ld32x(in,8);
+ in_98 = ld32x(in,9);
+
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unroll=2
+#pragma TCS_unrollexact=1
+#endif
+ for ( i=0,j=0 ; i<entries ; ++i,j+=10 )
+ {
+ register int dist;
+ register int cb10, cb32, cb54, cb76, cb98, cb_10, cb_32, cb_54, cb_76, cb_98;
+
+ cb10 = ld32x(codebook,j);
+ cb32 = ld32x(codebook,j+1);
+ cb54 = ld32x(codebook,j+2);
+ cb76 = ld32x(codebook,j+3);
+ cb98 = ld32x(codebook,j+4);
+ cb_10 = ld32x(codebook,j+5);
+ cb_32 = ld32x(codebook,j+6);
+ cb_54 = ld32x(codebook,j+7);
+ cb_76 = ld32x(codebook,j+8);
+ cb_98 = ld32x(codebook,j+9);
+
+ dist = ifir16(in10,cb10) + ifir16(in32,cb32);
+ dist += ifir16(in54,cb54) + ifir16(in76,cb76);
+ dist += ifir16(in98,cb98) + ifir16(in_10,cb_10);
+ dist += ifir16(in_32,cb_32) + ifir16(in_54,cb_54);
+ dist += ifir16(in_76,cb_76) + ifir16(in_98,cb_98);
+
+ sign = mux(dist>0,0,1);
+ dist = iflip(dist>0,dist);
+ dist = (E[i] >> 1) + dist;
+
+ vq_nbest_sign_dist(i,N,dist,sign,entries,&used,nbest,best_dist);
+ }
+#if (TM_UNROLL && TM_UNROLL_VQSIGNNBEST > 0)
+#pragma TCS_unrollexact=0
+#pragma TCS_unroll=0
+#endif
+}
+
+#define OVERRIDE_VQ_NBEST_SIGN
+void vq_nbest_sign (Int16 *in, const Int16 *codebook, int len, int entries, Int32 *E, int N, int *nbest, Int32 *best_dist, char *stack)
+{
+ TMDEBUG_ALIGNMEM(in);
+ TMDEBUG_ALIGNMEM(codebook);
+
+ VQNBESTSIGN_START();
+
+ if( len==5 )
+ vq_nbest_sign_5(in,codebook,entries,E,N,nbest,best_dist);
+ else if ( len==8 )
+ vq_nbest_sign_8(in,codebook,entries,E,N,nbest,best_dist);
+ else if ( len==10 )
+ vq_nbest_sign_10(in,codebook,entries,E,N,nbest,best_dist);
+ else if ( len==20 )
+ vq_nbest_sign_20(in,codebook,entries,E,N,nbest,best_dist);
+
+#ifndef REMARK_ON
+ (void)stack;
+#endif
+
+ VQNBESTSIGN_STOP();
+}
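When the correlation with a codeword is negative, the *_sign kernels flip the distance and record the hit as index + entries, so a caller can recover both pieces afterwards. A small sketch of that convention (a hypothetical helper illustrating the encoding used above):

/* Split a raw index returned in nbest[] into codeword index and sign flag. */
void split_sign(int raw_index, int entries, int *index, int *sign)
{
   *sign  = (raw_index >= entries);   /* 1 means the codeword is negated */
   *index = *sign ? raw_index - entries : raw_index;
}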
+
+#endif
+