diff options
author | Timothy B. Terriberry <tterribe@xiph.org> | 2013-11-18 22:30:13 +0400 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-11-18 22:41:17 +0400 |
commit | 39386e0b85ec0f978aa104d312604badb9047d58 (patch) | |
tree | e1171628bb638ec1b770b049e39609d7a268c584 | |
parent | 530198f955e49571b3f890b4da4d933a4cd5df4e (diff) |
Adds Neon assembly for correlation/convolution
Optimizing celt_pitch_xcorr()/xcorr_kernel() which also speeds up
FIRs, IIRs and auto-correlations
Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
43 files changed, 1515 insertions, 145 deletions
diff --git a/Makefile.am b/Makefile.am index 20286f84..7a2dca33 100644 --- a/Makefile.am +++ b/Makefile.am @@ -29,6 +29,13 @@ endif if CPU_ARM CELT_SOURCES += $(CELT_SOURCES_ARM) +SILK_SOURCES += $(SILK_SOURCES_ARM) +if OPUS_ARM_EXTERNAL_ASM +CELT_SOURCES += $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) +BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \ + $(CELT_AM_SOURCES_ARM_ASM:.s.in=.s) \ + $(CELT_AM_SOURCES_ARM_ASM.s.in=-gnu.S) +endif endif include celt_headers.mk @@ -106,11 +113,12 @@ endif endif EXTRA_DIST = version.mk \ - opus.pc.in \ + opus.pc.in \ opus-uninstalled.pc.in \ opus.m4 \ Makefile.unix \ tests/run_vectors.sh \ + celt/arm/arm2gnu.pl \ win32/VS2010/silk_float.vcxproj \ win32/VS2010/celt.vcxproj.filters \ win32/VS2010/opus.vcxproj \ @@ -206,3 +214,14 @@ dist-hook: .PHONY: opus check-opus install-opus docs install-docs + +# automake doesn't do dependency tracking for asm files, that I can tell +$(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): celt/arm/armopts-gnu.S +$(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl + +# convert ARM asm to GNU as format +%-gnu.S: $(top_srcdir)/%.s + $(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@ +# For autoconf-modified sources (e.g., armopts.s) +%-gnu.S: %.s + $(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@ diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h index f0c6976e..aefe490e 100644 --- a/celt/_kiss_fft_guts.h +++ b/celt/_kiss_fft_guts.h @@ -94,11 +94,11 @@ do {(res).r = ADD32((res).r,(a).r); (res).i = SUB32((res).i,(a).i); \ }while(0) -#if defined(ARMv4_ASM) +#if defined(OPUS_ARM_INLINE_ASM) #include "arm/kiss_fft_armv4.h" #endif -#if defined(ARMv5E_ASM) +#if defined(OPUS_ARM_INLINE_EDSP) #include "arm/kiss_fft_armv5e.h" #endif diff --git a/celt/arch.h b/celt/arch.h index 3a2985c3..3bbcd366 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -114,9 +114,9 @@ typedef opus_val32 celt_ener; #include "fixed_generic.h" -#ifdef ARMv5E_ASM +#ifdef OPUS_ARM_INLINE_EDSP #include "arm/fixed_armv5e.h" -#elif defined (ARMv4_ASM) +#elif defined (OPUS_ARM_INLINE_ASM) #include "arm/fixed_armv4.h" #elif defined (BFIN_ASM) #include "fixed_bfin.h" diff --git a/celt/arm/arm2gnu.pl b/celt/arm/arm2gnu.pl new file mode 100644 index 00000000..eab42efa --- /dev/null +++ b/celt/arm/arm2gnu.pl @@ -0,0 +1,316 @@ +#!/usr/bin/perl + +my $bigend; # little/big endian +my $nxstack; + +$nxstack = 0; + +eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}' + if $running_under_some_shell; + +while ($ARGV[0] =~ /^-/) { + $_ = shift; + last if /^--/; + if (/^-n/) { + $nflag++; + next; + } + die "I don't recognize this switch: $_\\n"; +} +$printit++ unless $nflag; + +$\ = "\n"; # automatically add newline on print +$n=0; + +$thumb = 0; # ARM mode by default, not Thumb. +@proc_stack = (); + +LINE: +while (<>) { + + # For ADRLs we need to add a new line after the substituted one. + $addPadding = 0; + + # First, we do not dare to touch *anything* inside double quotes, do we? + # Second, if you want a dollar character in the string, + # insert two of them -- that's how ARM C and assembler treat strings. + s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next }; + s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next }; + s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next }; + # If there's nothing on a line but a comment, don't try to apply any further + # substitutions (this is a cheap hack to avoid mucking up the license header) + s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next }; + # If substituted -- leave immediately ! + + s/@/,:/; + s/;/@/; + while ( /@.*'/ ) { + s/(@.*)'/$1/g; + } + s/\{FALSE\}/0/g; + s/\{TRUE\}/1/g; + s/\{(\w\w\w\w+)\}/$1/g; + s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/; + s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/; + s/\bIMPORT\b/.extern/; + s/\bEXPORT\b/.global/; + s/^(\s+)\[/$1IF/; + s/^(\s+)\|/$1ELSE/; + s/^(\s+)\]/$1ENDIF/; + s/IF *:DEF:/ .ifdef/; + s/IF *:LNOT: *:DEF:/ .ifndef/; + s/ELSE/ .else/; + s/ENDIF/ .endif/; + + if( /\bIF\b/ ) { + s/\bIF\b/ .if/; + s/=/==/; + } + if ( $n == 2) { + s/\$/\\/g; + } + if ($n == 1) { + s/\$//g; + s/label//g; + $n = 2; + } + if ( /MACRO/ ) { + s/MACRO *\n/.macro/; + $n=1; + } + if ( /\bMEND\b/ ) { + s/\bMEND\b/.endm/; + $n=0; + } + + # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there. + # + if ( /\bAREA\b/ ) { + my $align; + $align = "2"; + if ( /ALIGN=(\d+)/ ) { + $align = $1; + } + if ( /CODE/ ) { + $nxstack = 1; + } + s/^(.+)CODE(.+)READONLY(.*)/ .text/; + s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata/; + s/^(.+)\|\|\.data\|\|(.+)/ .data/; + s/^(.+)\|\|\.bss\|\|(.+)/ .bss/; + s/$/; .p2align $align/; + # Enable NEON instructions but don't produce a binary that requires + # ARMv7. RVCT does not have equivalent directives, so we just do this + # for all CODE areas. + if ( /.text/ ) { + # Separating .arch, .fpu, etc., by semicolons does not work (gas + # thinks the semicolon is part of the arch name, even when there's + # whitespace separating them). Sadly this means our line numbers + # won't match the original source file (we could use the .line + # directive, which is documented to be obsolete, but then gdb will + # show the wrong line in the translated source file). + s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/; + } + } + + s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3|| + s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2|| + s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2|| + s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/; + s/^(\s+)\%(\s)/ .space $1/; + + s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123 + s/\bCODE32\b/.code 32/ && do {$thumb = 0}; + s/\bCODE16\b/.code 16/ && do {$thumb = 1}; + if (/\bPROC\b/) + { + my $prefix; + my $proc; + /^([A-Za-z_\.]\w+)\b/; + $proc = $1; + $prefix = ""; + if ($proc) + { + $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc); + push(@proc_stack, $proc); + s/^[A-Za-z_\.]\w+/$&:/; + } + $prefix = $prefix."\t.thumb_func; " if ($thumb); + s/\bPROC\b/@ $&/; + $_ = $prefix.$_; + } + s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/; + s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/; + if (/\bENDP\b/) + { + my $proc; + s/\bENDP\b/@ $&/; + $proc = pop(@proc_stack); + $_ = "\t.size $proc, .-$proc".$_ if ($proc); + } + s/\bSUBT\b/@ $&/; + s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25 + s/\bKEEP\b/@ $&/; + s/\bEXPORTAS\b/@ $&/; + s/\|\|(.)+\bEQU\b/@ $&/; + s/\|\|([\w\$]+)\|\|/$1/; + s/\bENTRY\b/@ $&/; + s/\bASSERT\b/@ $&/; + s/\bGBLL\b/@ $&/; + s/\bGBLA\b/@ $&/; + s/^\W+OPT\b/@ $&/; + s/:OR:/|/g; + s/:SHL:/<</g; + s/:SHR:/>>/g; + s/:AND:/&/g; + s/:LAND:/&&/g; + s/CPSR/cpsr/; + s/SPSR/spsr/; + s/ALIGN$/.balign 4/; + s/ALIGN\s+([0-9x]+)$/.balign $1/; + s/psr_cxsf/psr_all/; + s/LTORG/.ltorg/; + s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/; + + # {PC} + 0xdeadfeed --> . + 0xdeadfeed + s/\{PC\} \+/ \. +/; + + # Single hex constant on the line ! + # + # >>> NOTE <<< + # Double-precision floats in gcc are always mixed-endian, which means + # bytes in two words are little-endian, but words are big-endian. + # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address + # and 0xfeed0000 at high address. + # + s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! + s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/; + + # Single hex constant on the line ! +# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! +# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/; + s/\bDCFS[ \t]+0x/.word 0x/; + s/\bDCFS\b/.float/; + + s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/; + s/\bDCD\b/.word/; + s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/; + s/\bDCW\b/.short/; + s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/; + s/\bDCB\b/.byte/; + s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/; + s/^[A-Za-z_\.]\w+/$&:/; + s/^(\d+)/$1:/; + s/\%(\d+)/$1b_or_f/; + s/\%[Bb](\d+)/$1b/; + s/\%[Ff](\d+)/$1f/; + s/\%[Ff][Tt](\d+)/$1f/; + s/&([\dA-Fa-f]+)/0x$1/; + if ( /\b2_[01]+\b/ ) { + s/\b2_([01]+)\b/conv$1&&&&/g; + while ( /[01][01][01][01]&&&&/ ) { + s/0000&&&&/&&&&0/g; + s/0001&&&&/&&&&1/g; + s/0010&&&&/&&&&2/g; + s/0011&&&&/&&&&3/g; + s/0100&&&&/&&&&4/g; + s/0101&&&&/&&&&5/g; + s/0110&&&&/&&&&6/g; + s/0111&&&&/&&&&7/g; + s/1000&&&&/&&&&8/g; + s/1001&&&&/&&&&9/g; + s/1010&&&&/&&&&A/g; + s/1011&&&&/&&&&B/g; + s/1100&&&&/&&&&C/g; + s/1101&&&&/&&&&D/g; + s/1110&&&&/&&&&E/g; + s/1111&&&&/&&&&F/g; + } + s/000&&&&/&&&&0/g; + s/001&&&&/&&&&1/g; + s/010&&&&/&&&&2/g; + s/011&&&&/&&&&3/g; + s/100&&&&/&&&&4/g; + s/101&&&&/&&&&5/g; + s/110&&&&/&&&&6/g; + s/111&&&&/&&&&7/g; + s/00&&&&/&&&&0/g; + s/01&&&&/&&&&1/g; + s/10&&&&/&&&&2/g; + s/11&&&&/&&&&3/g; + s/0&&&&/&&&&0/g; + s/1&&&&/&&&&1/g; + s/conv&&&&/0x/g; + } + + if ( /commandline/) + { + if( /-bigend/) + { + $bigend=1; + } + } + + if ( /\bDCDU\b/ ) + { + my $cmd=$_; + my $value; + my $prefix; + my $w1; + my $w2; + my $w3; + my $w4; + + s/\s+DCDU\b/@ $&/; + + $cmd =~ /\bDCDU\b\s+0x(\d+)/; + $value = $1; + $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/; + $w1 = $1; + $w2 = $2; + $w3 = $3; + $w4 = $4; + + if( $bigend ne "") + { + # big endian + $prefix = "\t.byte\t0x".$w1.";". + "\t.byte\t0x".$w2.";". + "\t.byte\t0x".$w3.";". + "\t.byte\t0x".$w4."; "; + } + else + { + # little endian + $prefix = "\t.byte\t0x".$w4.";". + "\t.byte\t0x".$w3.";". + "\t.byte\t0x".$w2.";". + "\t.byte\t0x".$w1."; "; + } + $_=$prefix.$_; + } + + if ( /\badrl\b/i ) + { + s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i; + $addPadding = 1; + } + s/\bEND\b/@ END/; +} continue { + printf ("%s", $_) if $printit; + if ($addPadding != 0) + { + printf (" mov r0,r0\n"); + $addPadding = 0; + } +} +#If we had a code section, mark that this object doesn't need an executable +# stack. +if ($nxstack) { + printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n"); +} diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c new file mode 100644 index 00000000..547a84d1 --- /dev/null +++ b/celt/arm/arm_celt_map.c @@ -0,0 +1,49 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pitch.h" + +#if defined(OPUS_HAVE_RTCD) + +# if defined(FIXED_POINT) +opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val32 *, int , int) = { + celt_pitch_xcorr_c, /* ARMv4 */ + MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */ + MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ + MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ +}; +# else +# error "Floating-point implementation is not supported by ARM asm yet." \ + "Reconfigure with --disable-rtcd or send patches." +# endif + +#endif diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c index 5fe16025..17685258 100644 --- a/celt/arm/armcpu.c +++ b/celt/arm/armcpu.c @@ -55,7 +55,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit * instructions via their assembled hex code. * All of these instructions should be essentially nops. */ -# if defined(ARMv5E_ASM) +# if defined(OPUS_ARM_MAY_HAVE_EDSP) __try{ /*PLD [r13]*/ __emit(0xF5DDF000); @@ -64,7 +64,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ /*Ignore exception.*/ } -# if defined(ARMv6E_ASM) +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) __try{ /*SHADD8 r3,r3,r3*/ __emit(0xE6333F93); @@ -73,7 +73,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ /*Ignore exception.*/ } -# if defined(ARM_HAVE_NEON) +# if defined(OPUS_ARM_MAY_HAVE_NEON) __try{ /*VORR q0,q0,q0*/ __emit(0xF2200150); @@ -107,19 +107,26 @@ opus_uint32 opus_cpu_capabilities(void) while(fgets(buf, 512, cpuinfo) != NULL) { +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) /* Search for edsp and neon flag */ if(memcmp(buf, "Features", 8) == 0) { char *p; +# if defined(OPUS_ARM_MAY_HAVE_EDSP) p = strstr(buf, " edsp"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_EDSP; +# endif +# if defined(OPUS_ARM_MAY_HAVE_NEON) p = strstr(buf, " neon"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_NEON; +# endif } +# endif +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) /* Search for media capabilities (>= ARMv6) */ if(memcmp(buf, "CPU architecture:", 17) == 0) { @@ -129,6 +136,7 @@ opus_uint32 opus_cpu_capabilities(void) if(version >= 6) flags |= OPUS_CPU_ARM_MEDIA; } +# endif } fclose(cpuinfo); diff --git a/celt/arm/armcpu.h b/celt/arm/armcpu.h index 68d80fe2..ac574460 100644 --- a/celt/arm/armcpu.h +++ b/celt/arm/armcpu.h @@ -25,11 +25,47 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* Original code from libtheora modified to suit to Opus */ +#if !defined(ARMCPU_H) +# define ARMCPU_H -#ifndef ARMCPU_H -#define ARMCPU_H +# if defined(OPUS_ARM_MAY_HAVE_EDSP) +# define MAY_HAVE_EDSP(name) name ## _edsp +# else +# define MAY_HAVE_EDSP(name) name ## _c +# endif +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) +# define MAY_HAVE_MEDIA(name) name ## _media +# else +# define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name) +# endif + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +# define MAY_HAVE_NEON(name) name ## _neon +# else +# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name) +# endif + +# if defined(OPUS_ARM_PRESUME_EDSP) +# define PRESUME_EDSP(name) name ## _edsp +# else +# define PRESUME_EDSP(name) name ## _c +# endif + +# if defined(OPUS_ARM_PRESUME_MEDIA) +# define PRESUME_MEDIA(name) name ## _media +# else +# define PRESUME_MEDIA(name) PRESUME_EDSP(name) +# endif + +# if defined(OPUS_ARM_PRESUME_NEON) +# define PRESUME_NEON(name) name ## _neon +# else +# define PRESUME_NEON(name) PRESUME_MEDIA(name) +# endif + +# if defined(OPUS_HAVE_RTCD) int opus_select_arch(void); +# endif #endif diff --git a/celt/arm/armopts.s.in b/celt/arm/armopts.s.in new file mode 100644 index 00000000..3d8aaf27 --- /dev/null +++ b/celt/arm/armopts.s.in @@ -0,0 +1,37 @@ +/* Copyright (C) 2013 Mozilla Corporation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +; Set the following to 1 if we have EDSP instructions +; (LDRD/STRD, etc., ARMv5E and later). +OPUS_ARM_MAY_HAVE_EDSP * @OPUS_ARM_MAY_HAVE_EDSP@ + +; Set the following to 1 if we have ARMv6 media instructions. +OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@ + +; Set the following to 1 if we have NEON (some ARMv7) +OPUS_ARM_MAY_HAVE_NEON * @OPUS_ARM_MAY_HAVE_NEON@ + +END diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s new file mode 100644 index 00000000..2db681d2 --- /dev/null +++ b/celt/arm/celt_pitch_xcorr_arm.s @@ -0,0 +1,598 @@ +; Copyright (c) 2007-2008 CSIRO +; Copyright (c) 2007-2009 Xiph.Org Foundation +; Copyright (c) 2013 Parrot +; Written by Aurélien Zanelli +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + AREA |.text|, CODE, READONLY + + GET celt/arm/armopts.s + +IF OPUS_ARM_MAY_HAVE_EDSP + EXPORT celt_pitch_xcorr_edsp +ENDIF + +IF OPUS_ARM_MAY_HAVE_NEON + EXPORT celt_pitch_xcorr_neon +ENDIF + +IF OPUS_ARM_MAY_HAVE_NEON + +;; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 +;xcorr_kernel_neon PROC +; ; input: +; ; r3 = int len +; ; r4 = opus_val16 *x +; ; r5 = opus_val16 *y +; ; q0 = opus_val32 sum[4] +; ; output: +; ; q0 = opus_val32 sum[4] +; ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 +; ; internal usage: +; ; r12 = int j +; ; d3 = y_3|y_2|y_1|y_0 +; ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 +; ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 +; ; q8 = scratch +; ; +; ; Load y[0...3] +; ; This requires len>0 to always be valid (which we assert in the C code). +; VLD1.16 {d5}, [r5]! +; SUBS r12, r3, #8 +; BLE xcorr_kernel_neon_process4 +;; Process 8 samples at a time. +;; This loop loads one y value more than we actually need. Therefore we have to +;; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid +;; reading past the end of the array. +;xcorr_kernel_neon_process8 +; ; This loop has 19 total instructions (10 cycles to issue, minimum), with +; ; - 2 cycles of ARM insrtuctions, +; ; - 10 cycles of load/store/byte permute instructions, and +; ; - 9 cycles of data processing instructions. +; ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the +; ; latter two categories, meaning the whole loop should run in 10 cycles per +; ; iteration, barring cache misses. +; ; +; ; Load x[0...7] +; VLD1.16 {d6, d7}, [r4]! +; ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get +; ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. +; VAND d3, d5, d5 +; SUBS r12, r12, #8 +; ; Load y[4...11] +; VLD1.16 {d4, d5}, [r5]! +; VMLAL.S16 q0, d3, d6[0] +; VEXT.16 d16, d3, d4, #1 +; VMLAL.S16 q0, d4, d7[0] +; VEXT.16 d17, d4, d5, #1 +; VMLAL.S16 q0, d16, d6[1] +; VEXT.16 d16, d3, d4, #2 +; VMLAL.S16 q0, d17, d7[1] +; VEXT.16 d17, d4, d5, #2 +; VMLAL.S16 q0, d16, d6[2] +; VEXT.16 d16, d3, d4, #3 +; VMLAL.S16 q0, d17, d7[2] +; VEXT.16 d17, d4, d5, #3 +; VMLAL.S16 q0, d16, d6[3] +; VMLAL.S16 q0, d17, d7[3] +; BGT xcorr_kernel_neon_process8 +;; Process 4 samples here if we have > 4 left (still reading one extra y value). +;xcorr_kernel_neon_process4 +; ADDS r12, r12, #4 +; BLE xcorr_kernel_neon_process2 +; ; Load x[0...3] +; VLD1.16 d6, [r4]! +; ; Use VAND since it's a data processing instruction again. +; VAND d4, d5, d5 +; SUB r12, r12, #4 +; ; Load y[4...7] +; VLD1.16 d5, [r5]! +; VMLAL.S16 q0, d4, d6[0] +; VEXT.16 d16, d4, d5, #1 +; VMLAL.S16 q0, d16, d6[1] +; VEXT.16 d16, d4, d5, #2 +; VMLAL.S16 q0, d16, d6[2] +; VEXT.16 d16, d4, d5, #3 +; VMLAL.S16 q0, d16, d6[3] +;; Process 2 samples here if we have > 2 left (still reading one extra y value). +;xcorr_kernel_neon_process2 +; ADDS r12, r12, #2 +; BLE xcorr_kernel_neon_process1 +; ; Load x[0...1] +; VLD2.16 {d6[],d7[]}, [r4]! +; ; Use VAND since it's a data processing instruction again. +; VAND d4, d5, d5 +; SUB r12, r12, #2 +; ; Load y[4...5] +; VLD1.32 {d5[]}, [r5]! +; VMLAL.S16 q0, d4, d6 +; VEXT.16 d16, d4, d5, #1 +; ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI +; ; instead of VEXT, since it's a data-processing instruction. +; VSRI.64 d5, d4, #32 +; VMLAL.S16 q0, d16, d7 +;; Process 1 sample using the extra y value we loaded above. +;xcorr_kernel_neon_process1 +; ; Load next *x +; VLD1.16 {d6[]}, [r4]! +; ADDS r12, r12, #1 +; ; y[0...3] are left in d5 from prior iteration(s) (if any) +; VMLAL.S16 q0, d5, d6 +; MOVLE pc, lr +;; Now process 1 last sample, not reading ahead. +; ; Load last *y +; VLD1.16 {d4[]}, [r5]! +; VSRI.64 d4, d5, #16 +; ; Load last *x +; VLD1.16 {d6[]}, [r4]! +; VMLAL.S16 q0, d4, d6 +; MOV pc, lr +; ENDP + +;; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, +;; opus_val32 *xcorr, int len, int max_pitch) +;celt_pitch_xcorr_neon PROC +; ; input: +; ; r0 = opus_val16 *_x +; ; r1 = opus_val16 *_y +; ; r2 = opus_val32 *xcorr +; ; r3 = int len +; ; output: +; ; r0 = int maxcorr +; ; internal usage: +; ; r4 = opus_val16 *x (for xcorr_kernel_neon()) +; ; r5 = opus_val16 *y (for xcorr_kernel_neon()) +; ; r6 = int max_pitch +; ; r12 = int j +; ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) +; STMFD sp!, {r4-r6, lr} +; LDR r6, [sp, #16] +; VMOV.S32 q15, #1 +; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done +; SUBS r6, r6, #4 +; BLT celt_pitch_xcorr_neon_process4_done +;celt_pitch_xcorr_neon_process4 +; ; xcorr_kernel_neon parameters: +; ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} +; MOV r4, r0 +; MOV r5, r1 +; VEOR q0, q0, q0 +; ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. +; ; So we don't save/restore any other registers. +; BL xcorr_kernel_neon +; SUBS r6, r6, #4 +; VST1.32 {q0}, [r2]! +; ; _y += 4 +; ADD r1, r1, #8 +; VMAX.S32 q15, q15, q0 +; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done +; BGE celt_pitch_xcorr_neon_process4 +;; We have less than 4 sums left to compute. +;celt_pitch_xcorr_neon_process4_done +; ADDS r6, r6, #4 +; ; Reduce maxcorr to a single value +; VMAX.S32 d30, d30, d31 +; VPMAX.S32 d30, d30, d30 +; ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done +; BLE celt_pitch_xcorr_neon_done +;; Now compute each remaining sum one at a time. +;celt_pitch_xcorr_neon_process_remaining +; MOV r4, r0 +; MOV r5, r1 +; VMOV.I32 q0, #0 +; SUBS r12, r3, #8 +; BLT celt_pitch_xcorr_neon_process_remaining4 +;; Sum terms 8 at a time. +;celt_pitch_xcorr_neon_process_remaining_loop8 +; ; Load x[0...7] +; VLD1.16 {q1}, [r4]! +; ; Load y[0...7] +; VLD1.16 {q2}, [r5]! +; SUBS r12, r12, #8 +; VMLAL.S16 q0, d4, d2 +; VMLAL.S16 q0, d5, d3 +; BGE celt_pitch_xcorr_neon_process_remaining_loop8 +;; Sum terms 4 at a time. +;celt_pitch_xcorr_neon_process_remaining4 +; ADDS r12, r12, #4 +; BLT celt_pitch_xcorr_neon_process_remaining4_done +; ; Load x[0...3] +; VLD1.16 {d2}, [r4]! +; ; Load y[0...3] +; VLD1.16 {d3}, [r5]! +; SUB r12, r12, #4 +; VMLAL.S16 q0, d3, d2 +; ; Reduce the sum to a single value. +; VADD.S32 d0, d0, d1 +; VPADDL.S32 d0, d0 +;celt_pitch_xcorr_neon_process_remaining4_done +; ADDS r12, r12, #4 +; BLE celt_pitch_xcorr_neon_process_remaining_loop_done +;; Sum terms 1 at a time. +;celt_pitch_xcorr_neon_process_remaining_loop1 +; VLD1.16 {d2[]}, [r4]! +; VLD1.16 {d3[]}, [r5]! +; SUBS r12, r12, #1 +; VMLAL.S16 q0, d2, d3 +; BGT celt_pitch_xcorr_neon_process_remaining_loop1 +;celt_pitch_xcorr_neon_process_remaining_loop_done +; VST1.32 {d0[0]}, [r2]! +; VMAX.S32 d30, d30, d0 +; SUBS r6, r6, #1 +; ; _y++ +; ADD r1, r1, #2 +; ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining +; BGT celt_pitch_xcorr_neon_process_remaining +;celt_pitch_xcorr_neon_done +; VMOV.32 r0, d30[0] +; LDMFD sp!, {r4-r6, pc} +; ENDP + +xcorr_kernel_neon PROC + ; input: + ; r0 = opus_val16 *x + ; r1 = opus_val16 *y + ; r2 = int len + ; q0 = opus_val32 sum (sum[3] | sum[2] | sum[1] | sum[0]) + + ; output: + ; q0 = sum + + ; internal usage: + ; r3 = j + ; d2 = x_3|x_2|x_1|x_0 d3 = y_3|y_2|y_1|y_0 + ; d4 = y_7|y_6|y_5|y_4 d5 = y_4|y_3|y_2|y_1 + ; d6 = y_5|y_4|y_3|y_2 d7 = y_6|y_5|y_4|y_3 + ; We will build d5, d6 and d7 vector from d3 and d4 + + + VLD1.16 {d3}, [r1]! ; Load y[3] downto y[0] to d3 lane (yy0) + SUB r3, r2, #1 + MOVS r3, r3, lsr #2 ; j=(len-1)>>2 + BEQ xcorr_kernel_neon_process4_done + + ; Process 4 x samples at a time + ; For this, we will need 4 y vectors +xcorr_kernel_neon_process4 + SUBS r3, r3, #1 ; j-- + VLD1.16 d4, [r1]! ; Load y[7] downto y[4] to d4 lane + VLD1.16 d2, [r0]! ; Load x[3] downto x[0] to d2 lane + VEXT.16 d5, d3, d4, #1 ; Build y[4] downto y[1] vector (yy1) + VEXT.16 d6, d3, d4, #2 ; Build y[5] downto y[2] vector (yy2) + VEXT.16 d7, d3, d4, #3 ; Build y[6] downto y[3] vector (yy3) + + VMLAL.S16 q0, d3, d2[0] ; MAC16_16(sum, x[0], yy0) + VMLAL.S16 q0, d5, d2[1] ; MAC16_16(sum, x[1], yy1) + VMLAL.S16 q0, d6, d2[2] ; MAC16_16(sum, x[2], yy2) + VMLAL.S16 q0, d7, d2[3] ; MAC16_16(sum, x[3], yy3) + + VMOV.S16 d3, d4 ; Next y vector should be in d3 (yy0) + + BNE xcorr_kernel_neon_process4 + +xcorr_kernel_neon_process4_done + ;Process len-1 to len + VLD1.16 {d2[]}, [r0]! ; Load *x and duplicate to d2 lane + + SUB r3, r2, #1 + ANDS r3, r3, #3 ; j=(len-1)&3 + VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0) + BEQ xcorr_kernel_neon_done + +xcorr_kernel_neon_process_remaining + SUBS r3, r3, #1 ; j-- + VLD1.16 {d4[]}, [r1]! ; Load y value and duplicate to d4 lane + VLD1.16 {d2[]}, [r0]! ; Load *x and duplicate to d2 lane + VEXT.16 d3, d3, d4, #1 ; Build y vector from previous and d4 + VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0) + BNE xcorr_kernel_neon_process_remaining + +xcorr_kernel_neon_done + MOV pc, lr + ENDP + +celt_pitch_xcorr_neon PROC + ; input: + ; r0 = opus_val16 *_x + ; r1 = opus_val16 *_y + ; r2 = opus_val32 *xcorr + ; r3 = int len + + ; output: + ; r0 = maxcorr + + STMFD sp!, {r4-r9, lr} + + LDR r4, [sp, #28] ; r4 = int max_pitch + MOV r5, r0 ; r5 = _x + MOV r6, r1 ; r6 = _y + MOV r7, r2 ; r7 = xcorr + MOV r2, r3 ; r2 = len + + VMOV.S32 d16, #1 ; d16 = {1, 1} (not used by xcorr_kernel_neon) + MOV r8, #0 ; r8 = i = 0 + CMP r4, #3 ; max_pitch-3 <= 0 ---> pitch_xcorr_neon_process4_done + BLE celt_pitch_xcorr_neon_process4_done + + SUB r9, r4, #3 ; r9 = max_pitch-3 + +celt_pitch_xcorr_neon_process4 + MOV r0, r5 ; r0 = _x + ADD r1, r6 ,r8, LSL #1 ; r1 = _y + i + VMOV.I32 q0, #0 ; q0 = opus_val32 sum[4] = {0, 0, 0, 0} + + ; xcorr_kernel_neon don't touch r2 (len) + ; So we don't store it + BL xcorr_kernel_neon ; xcorr_kernel_neon(_x, _y+i, sum, len) + + VST1.32 {q0}, [r7]! ; Store sum to xcorr + VPMAX.S32 d0, d0, d1 ; d0 = max(sum[3], sum[2]) | max(sum[1], sum[0]) + ADD r8, r8, #4 ; i+=4 + VPMAX.S32 d0, d0, d0 ; d0 = max(sum[3], sum[2], sum[1], sum[0]) + CMP r8, r9 ; i < max_pitch-3 ----> pitch_xcorr_neon_process4 + VMAX.S32 d16, d16, d0 ; d16 = maxcorr = max(maxcorr, sum) + + BLT celt_pitch_xcorr_neon_process4 + +celt_pitch_xcorr_neon_process4_done + CMP r8, r4; + BGE celt_pitch_xcorr_neon_done + +celt_pitch_xcorr_neon_process_remaining + MOV r0, r5 ; r0 = _x + ADD r1, r6, r8, LSL #1 ; r1 = _y + i + VMOV.I32 q0, #0 + MOVS r3, r2, LSR #2 ; r3 = j = len + BEQ inner_loop_neon_process4_done + +inner_loop_neon_process4 + VLD1.16 {d2}, [r0]! ; Load x + VLD1.16 {d3}, [r1]! ; Load y + SUBS r3, r3, #1 + VMLAL.S16 q0, d2, d3 + BNE inner_loop_neon_process4 + + VPADD.S32 d0, d0, d1 ; Reduce sum + VPADD.S32 d0, d0, d0 + +inner_loop_neon_process4_done + ANDS r3, r2, #3 + BEQ inner_loop_neon_done + +inner_loop_neon_process_remaining + VLD1.16 {d2[]}, [r0]! + VLD1.16 {d3[]}, [r1]! + SUBS r3, r3, #1 + VMLAL.S16 q0, d2, d3 + BNE inner_loop_neon_process_remaining + +inner_loop_neon_done + VST1.32 {d0[0]}, [r7]! + VMAX.S32 d16, d16, d0 + + ADD r8, r8, #1 + CMP r8, r4 + BCC celt_pitch_xcorr_neon_process_remaining + +celt_pitch_xcorr_neon_done + VMOV d0, d16 + VMOV.32 r0, d0[0] + LDMFD sp!, {r4-r9, pc} + ENDP + + +ENDIF + +IF OPUS_ARM_MAY_HAVE_EDSP + +; This will get used on ARMv7 devices without NEON, so it has been optimized +; to take advantage of dual-issuing where possible. +xcorr_kernel_edsp PROC + ; input: + ; r3 = int len + ; r4 = opus_val16 *_x + ; r5 = opus_val16 *_y + ; r6...r9 = opus_val32 sum[4] + ; output: + ; r6...r9 = opus_val32 sum[4] + ; preserved: r0-r5 + ; internal usage + ; r2 = int j + ; r12,r14 = opus_val16 x[4] + ; r10,r11 = opus_val16 y[4] + STMFD sp!, {r2,r4,r5,lr} + SUBS r2, r3, #4 ; j = len-4 + LDRD r10, r11, [r5], #8 ; Load y[0...3] + BLE xcorr_kernel_edsp_process4_done + LDR r12, [r4], #4 ; Load x[0...1] + ; Stall +xcorr_kernel_edsp_process4 + ; The multiplies must issue from pipeline 0, and can't dual-issue with each + ; other. Every other instruction here dual-issues with a multiply, and is + ; thus "free". There should be no stalls in the body of the loop. + SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) + LDR r14, [r4], #4 ; Load x[2...3] + SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) + SUBS r2, r2, #4 ; j-=4 + SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) + SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) + SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) + LDR r10, [r5], #4 ; Load y[4...5] + SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) + SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) + SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) + LDRGT r12, [r4], #4 ; Load x[0...1] + SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) + SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) + SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) + SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) + SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) + LDR r11, [r5], #4 ; Load y[6...7] + SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) + SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5) + SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) + BGT xcorr_kernel_edsp_process4 +xcorr_kernel_edsp_process4_done + ADDS r2, r2, #4 + BLE xcorr_kernel_edsp_done + LDRH r12, [r4], #2 ; r12 = *x++ + SUBS r2, r2, #1 ; j-- + ; Stall + SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) + LDRGTH r14, [r4], #2 ; r14 = *x++ + SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) + SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) + SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) + BLE xcorr_kernel_edsp_done + SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) + SUBS r2, r2, #1 ; j-- + SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) + LDRH r10, [r5], #2 ; r10 = y_4 = *y++ + SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) + LDRGTH r12, [r4], #2 ; r12 = *x++ + SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) + BLE xcorr_kernel_edsp_done + SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) + CMP r2, #1 ; j-- + SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) + LDRH r2, [r5], #2 ; r2 = y_5 = *y++ + SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) + LDRGTH r14, [r4] ; r14 = *x + SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) + BLE xcorr_kernel_edsp_done + SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) + LDRH r11, [r5] ; r11 = y_6 = *y + SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) + SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) + SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) +xcorr_kernel_edsp_done + LDMFD sp!, {r2,r4,r5,pc} + ENDP + +celt_pitch_xcorr_edsp PROC + ; input: + ; r0 = opus_val16 *_x + ; r1 = opus_val16 *_y + ; r2 = opus_val32 *xcorr + ; r3 = int len + ; output: + ; r0 = maxcorr + ; internal usage + ; r4 = opus_val16 *x + ; r5 = opus_val16 *y + ; r6 = opus_val32 sum0 + ; r7 = opus_val32 sum1 + ; r8 = opus_val32 sum2 + ; r9 = opus_val32 sum3 + ; r1 = int max_pitch + ; r12 = int j + STMFD sp!, {r4-r11, lr} + MOV r5, r1 + LDR r1, [sp, #36] + MOV r4, r0 + ; maxcorr = 1 + MOV r0, #1 + ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process4_done + SUBS r1, r1, #4 + BLT celt_pitch_xcorr_edsp_process4_done +celt_pitch_xcorr_edsp_process4 + ; xcorr_kernel_edsp parameters: + ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} + MOV r6, #0 + MOV r7, #0 + MOV r8, #0 + MOV r9, #0 + BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) + ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) + CMP r0, r6 + ; _y+=4 + ADD r5, r5, #8 + MOVLT r0, r6 + CMP r0, r7 + STRD r6, r7, [r2], #8 + MOVLT r0, r7 + CMP r0, r8 + STRD r8, r9, [r2], #8 + MOVLT r0, r8 + CMP r0, r9 + MOVLT r0, r9 + SUBS r1, r1, #4 + BGE celt_pitch_xcorr_edsp_process4 +celt_pitch_xcorr_edsp_process4_done + ADDS r1, r1, #4 + BLE celt_pitch_xcorr_edsp_done +; Now compute each remaining sum one at a time. +celt_pitch_xcorr_edsp_process_remaining + SUBS r12, r3, #4 + ; r14 = sum = 0 + MOV r14, #0 + BLT celt_pitch_xcorr_edsp_process_remaining_loop_done + LDRD r6, r7, [r4], #8 + LDRD r8, r9, [r5], #8 + ; Stall +celt_pitch_xcorr_edsp_process_remaining_loop4 + SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + SUBS r12, r12, #4 ; j-- + SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) + LDRGE r6, [r4], #4 + SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) + LDRGE r8, [r5], #4 + SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) + LDRGE r7, [r4], #4 + LDRGE r9, [r5], #4 + BGE celt_pitch_xcorr_edsp_process_remaining_loop4 +celt_pitch_xcorr_edsp_process_remaining_loop_done + ADDS r12, r12, #2 + LDRGE r6, [r4], #4 + LDRGE r8, [r5], #4 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + SUBGE r12, r12, #2 + SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) + ADDS r12, r12, #1 + LDRGEH r6, [r4], #2 + LDRGEH r8, [r5], #2 + ; Restore _x + SUB r4, r4, r3, LSL #1 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) + ; Restore and advance _y + SUB r5, r5, r3, LSL #1 + ; maxcorr = max(maxcorr, sum) + ; Stall + CMP r0, r14 + ADD r5, r5, #2 + MOVLT r0, r14 + SUBS r1, r1, #1 + ; xcorr[i] = sum + STR r14, [r2], #4 + BGT celt_pitch_xcorr_edsp_process_remaining +celt_pitch_xcorr_edsp_done + LDMFD sp!, {r4-r11, pc} + ENDP + +ENDIF + +END diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h new file mode 100644 index 00000000..a07f8ac2 --- /dev/null +++ b/celt/arm/pitch_arm.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(PITCH_ARM_H) +# define PITCH_ARM_H + +# include "armcpu.h" + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); +# endif + +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) +# define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr) +# endif + +# if defined(OPUS_ARM_MAY_HAVE_EDSP) +opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_PITCH_XCORR (1) +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) +# endif + +# endif + +#endif diff --git a/celt/celt.h b/celt/celt.h index 1d41f5fc..5deea1f0 100644 --- a/celt/celt.h +++ b/celt/celt.h @@ -122,7 +122,8 @@ int celt_encoder_get_size(int channels); int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc); -int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels); +int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels, + int arch); diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index e61f8807..e34b4c6d 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -447,10 +447,11 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R { VARDECL( opus_val16, lp_pitch_buf ); ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 ); - pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C); + pitch_downsample(decode_mem, lp_pitch_buf, + DECODE_BUFFER_SIZE, C, st->arch); pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf, DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX, - PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index); + PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch); pitch_index = PLC_PITCH_LAG_MAX-pitch_index; st->last_pitch_index = pitch_index; } else { @@ -481,7 +482,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R opus_val32 ac[LPC_ORDER+1]; /* Compute LPC coefficients for the last MAX_PERIOD samples before the first loss so we can work in the excitation-filter domain. */ - _celt_autocorr(exc, ac, window, overlap, LPC_ORDER, MAX_PERIOD); + _celt_autocorr(exc, ac, window, overlap, + LPC_ORDER, MAX_PERIOD, st->arch); /* Add a noise floor of -40 dB. */ #ifdef FIXED_POINT ac[0] += SHR32(ac[0],13); diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 93562c8f..de990a05 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -161,17 +161,8 @@ CELTEncoder *opus_custom_encoder_create(const CELTMode *mode, int channels, int } #endif /* CUSTOM_MODES */ -int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels) -{ - int ret; - ret = opus_custom_encoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels); - if (ret != OPUS_OK) - return ret; - st->upsample = resampling_factor(sampling_rate); - return OPUS_OK; -} - -OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels) +static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode, + int channels, int arch) { if (channels < 0 || channels > 2) return OPUS_BAD_ARG; @@ -190,7 +181,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMod st->end = st->mode->effEBands; st->signalling = 1; - st->arch = opus_select_arch(); + st->arch = arch; st->constrained_vbr = 1; st->clip = 1; @@ -206,6 +197,23 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMod return OPUS_OK; } +OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels) +{ + return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch()); +} + +int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels, + int arch) +{ + int ret; + ret = opus_custom_encoder_init_arch(st, + opus_custom_mode_create(48000, 960, NULL), channels, arch); + if (ret != OPUS_OK) + return ret; + st->upsample = resampling_factor(sampling_rate); + return OPUS_OK; +} + #ifdef CUSTOM_MODES void opus_custom_encoder_destroy(CELTEncoder *st) { @@ -1023,11 +1031,12 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, VARDECL(opus_val16, pitch_buf); ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16); - pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC); + pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch); /* Don't search for the fir last 1.5 octave of the range because there's too many false-positives due to short-term correlation */ pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N, - COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index); + COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index, + st->arch); pitch_index = COMBFILTER_MAXPERIOD-pitch_index; gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD, diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index 7ffe90a3..fa29d626 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -226,7 +226,8 @@ int _celt_autocorr( const opus_val16 *window, int overlap, int lag, - int n + int n, + int arch ) { opus_val32 d; @@ -275,7 +276,7 @@ int _celt_autocorr( shift = 0; } #endif - celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1); + celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch); for (k=0;k<=lag;k++) { for (i = k+fastN, d = 0; i < n; i++) diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index 19279a0e..dc2a0a3d 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -48,6 +48,7 @@ void celt_iir(const opus_val32 *x, int ord, opus_val16 *mem); -int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n); +int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, + const opus_val16 *window, int overlap, int lag, int n, int arch); #endif /* PLC_H */ diff --git a/celt/cpu_support.h b/celt/cpu_support.h index 5ade846e..d68dbe62 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -31,7 +31,7 @@ #include "opus_types.h" #include "opus_defines.h" -#if defined(OPUS_HAVE_RTCD) && defined(ARMv4_ASM) +#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM) #include "arm/armcpu.h" /* We currently support 4 ARM variants: diff --git a/celt/pitch.c b/celt/pitch.c index 0352b302..2d63a5ac 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -145,7 +145,7 @@ static void celt_fir5(const opus_val16 *x, void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp, - int len, int C) + int len, int C, int arch) { int i; opus_val32 ac[5]; @@ -180,7 +180,7 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x } _celt_autocorr(x_lp, ac, NULL, 0, - 4, len>>1); + 4, len>>1, arch); /* Noise floor -40 dB */ #ifdef FIXED_POINT @@ -250,7 +250,7 @@ opus_val32 #else void #endif -celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch) +celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch) { int i,j; #ifdef FIXED_POINT @@ -289,7 +289,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, #endif void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y, - int len, int max_pitch, int *pitch) + int len, int max_pitch, int *pitch, int arch) { int i, j; int lag; @@ -342,7 +342,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR #ifdef FIXED_POINT maxcorr = #endif - celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2); + celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch); find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch #ifdef FIXED_POINT diff --git a/celt/pitch.h b/celt/pitch.h index 3be5e277..a1074ef2 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -35,16 +35,21 @@ #define PITCH_H #include "modes.h" +#include "cpu_support.h" #if defined(__SSE__) && !defined(FIXED_POINT) #include "x86/pitch_sse.h" #endif +#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +# include "arm/pitch_arm.h" +#endif + void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp, - int len, int C); + int len, int C, int arch); void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y, - int len, int max_pitch, int *pitch); + int len, int max_pitch, int *pitch, int arch); opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, int N, int *T0, int prev_period, opus_val16 prev_gain); @@ -140,6 +145,52 @@ opus_val32 #else void #endif -celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch); +celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); + +#if !defined(OVERRIDE_PITCH_XCORR) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) +extern +# if defined(FIXED_POINT) +opus_val32 +# else +void +# endif +(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val32 *, int, int); + +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ + xcorr, len, max_pitch)) +# else +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch)) +# endif +#else + +/*static inline opus_val32 real_celt_pitch_xcorr(const opus_val16 *_x, + const opus_val16 *_y,opus_val32 *xcorr,int len,int max_pitch,int arch){ + opus_val32 *xcorr_tmp; + opus_val32 ret_tmp; + opus_val32 ret; + int i; + xcorr_tmp=(opus_val32 *)malloc(max_pitch*sizeof(*xcorr)); + ret_tmp=celt_pitch_xcorr_c(_x,_y,xcorr_tmp,len,max_pitch); + ret=celt_pitch_xcorr(_x,_y,xcorr,len,max_pitch,arch); + for(i=0;i<max_pitch;i++)if(xcorr[i]!=xcorr_tmp[i]){ + fprintf(stderr,"xcorr[%i] (0x%08X) != xcorr_tmp[%i] (0x%08X)\n", + i,xcorr[i],i,xcorr_tmp[i]); + } + if(ret!=ret_tmp){ + fprintf(stderr,"ret (0x%08X) != ret_tmp (0x%08X)\n",ret,ret_tmp); + } + return ret_tmp; +} + +#undef celt_pitch_xcorr +#define celt_pitch_xcorr real_celt_pitch_xcorr*/ + +#endif #endif diff --git a/celt_headers.mk b/celt_headers.mk index 067b2940..8811e167 100644 --- a/celt_headers.mk +++ b/celt_headers.mk @@ -35,4 +35,5 @@ celt/arm/fixed_armv4.h \ celt/arm/fixed_armv5e.h \ celt/arm/kiss_fft_armv4.h \ celt/arm/kiss_fft_armv5e.h \ +celt/arm/pitch_arm.h \ celt/x86/pitch_sse.h diff --git a/celt_sources.mk b/celt_sources.mk index e1aee8fa..2bbe7700 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -18,4 +18,11 @@ celt/rate.c \ celt/vq.c CELT_SOURCES_ARM = \ -celt/arm/armcpu.c +celt/arm/armcpu.c \ +celt/arm/arm_celt_map.c + +CELT_SOURCES_ARM_ASM = \ +celt/arm/celt_pitch_xcorr_arm.s + +CELT_AM_SOURCES_ARM_ASM = \ +celt/arm/armopts.s.in diff --git a/configure.ac b/configure.ac index e511558d..bacb2f93 100644 --- a/configure.ac +++ b/configure.ac @@ -42,6 +42,8 @@ AC_PROG_CC_C99 AC_C_CONST AC_C_INLINE +AM_PROG_AS + AC_DEFINE([OPUS_BUILD], [], [This is a build of OPUS]) #Use a hacked up version of autoconf's AC_C_RESTRICT because it's not @@ -54,13 +56,13 @@ AC_CACHE_CHECK([for C/C++ restrict keyword], ac_cv_c_restrict, for ac_kw in __restrict __restrict__ _Restrict restrict; do AC_COMPILE_IFELSE([AC_LANG_PROGRAM( [[typedef int * int_ptr; - int foo (int_ptr $ac_kw ip, int * $ac_kw baz[]) { - return ip[0]; + int foo (int_ptr $ac_kw ip, int * $ac_kw baz[]) { + return ip[0]; }]], [[int s[1]; - int * $ac_kw t = s; - t[0] = 0; - return foo(t, (void *)0)]])], + int * $ac_kw t = s; + t[0] = 0; + return foo(t, (void *)0)]])], [ac_cv_c_restrict=$ac_kw]) test "$ac_cv_c_restrict" != no && break done @@ -165,7 +167,7 @@ has_float_approx=no #i[[3456]]86 | x86_64 | powerpc64 | powerpc32 | ia64) # has_float_approx=yes # ;; -#esac +#esac AC_ARG_ENABLE([float-approx], [AS_HELP_STRING([--enable-float-approx], [enable fast approximations for floating point])], @@ -183,55 +185,167 @@ AC_ARG_ENABLE([asm], [AS_HELP_STRING([--disable-asm], [Disable assembly optimizations])],, [enable_asm=yes]) +AC_ARG_ENABLE([rtcd], + [AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],, + [enable_rtcd=yes]) + rtcd_support=no cpu_arm=no -AS_IF([test "$enable_asm" = "yes"],[ - asm_optimization="no asm for your platform, please send patches" +AS_IF([test x"${enable_asm}" = x"yes"],[ + inline_optimization="No ASM for your platform, please send patches" case $host_cpu in - arm*) - cpu_arm=yes - AS_GCC_INLINE_ASSEMBLY([asm_optimization="ARM"], - [asm_optimization="disabled"]) - if test "$asm_optimization" = "ARM" ; then - rtcd_support=yes - AC_DEFINE([ARMv4_ASM], 1, [Use generic ARMv4 asm optimizations]) - AS_ASM_ARM_EDSP([ARMv5E_ASM=1],[ARMv5E_ASM=0]) - if test "$ARMv5E_ASM" = "1" ; then - AC_DEFINE([ARMv5E_ASM], [1], [Use ARMv5E asm optimizations]) - asm_optimization="$asm_optimization (EDSP)" - fi - AS_ASM_ARM_MEDIA([ARMv6_ASM=1],[ARMv6_ASM=0]) - if test "$ARMv6_ASM" = "1" ; then - AC_DEFINE([ARMv6_ASM], [1], [Use ARMv6 asm optimizations]) - asm_optimization="$asm_optimization (Media)" - fi - AS_ASM_ARM_NEON([ARM_HAVE_NEON=1],[ARM_HAVE_NEON=0]) - if test "$ARM_HAVE_NEON" = "1" ; then - AC_DEFINE([ARM_HAVE_NEON], 1, [Use ARM NEON optimizations]) - asm_optimization="$asm_optimization (NEON)" - fi - fi + arm*) + dnl Currently we only have asm for fixed-point + AS_IF([test "$enable_float" != "yes"],[ + cpu_arm=yes + AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm optimization]) + AS_GCC_INLINE_ASSEMBLY( + [inline_optimization="ARM"], + [inline_optimization="disabled"] + ) + AS_ASM_ARM_EDSP([OPUS_ARM_INLINE_EDSP=1],[OPUS_ARM_INLINE_EDSP=0]) + AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1], + [OPUS_ARM_INLINE_MEDIA=0]) + AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0]) + AS_IF([test x"$inline_optimization" = x"ARM"],[ + AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true]) + AC_DEFINE([OPUS_ARM_INLINE_ASM], 1, + [Use generic ARMv4 inline asm optimizations]) + AS_IF([test x"$OPUS_ARM_INLINE_EDSP" = x"1"],[ + AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1], + [Use ARMv5E inline asm optimizations]) + inline_optimization="$inline_optimization (EDSP)" + ]) + AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[ + AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1], + [Use ARMv6 inline asm optimizations]) + inline_optimization="$inline_optimization (Media)" + ]) + AS_IF([test x"$OPUS_ARM_INLINE_NEON" = x"1"],[ + AC_DEFINE([OPUS_ARM_INLINE_NEON], 1, + [Use ARM NEON inline asm optimizations]) + inline_optimization="$inline_optimization (NEON)" + ]) + ]) + dnl We need Perl to translate RVCT-syntax asm to gas syntax. + AC_CHECK_PROG([HAVE_PERL], perl, yes, no) + AS_IF([test x"$HAVE_PERL" = x"yes"],[ + AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],[true]) + asm_optimization="ARM" + AS_IF([test x"$OPUS_ARM_INLINE_EDSP" = x"1"], [ + OPUS_ARM_PRESUME_EDSP=1 + OPUS_ARM_MAY_HAVE_EDSP=1 + ], + [ + OPUS_ARM_PRESUME_EDSP=0 + OPUS_ARM_MAY_HAVE_EDSP=0 + ]) + AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"], [ + OPUS_ARM_PRESUME_MEDIA=1 + OPUS_ARM_MAY_HAVE_MEDIA=1 + ], + [ + OPUS_ARM_PRESUME_MEDIA=0 + OPUS_ARM_MAY_HAVE_MEDIA=0 + ]) + AS_IF([test x"$OPUS_ARM_INLINE_NEON" = x"1"], [ + OPUS_ARM_PRESUME_NEON=1 + OPUS_ARM_MAY_HAVE_NEON=1 + ], + [ + OPUS_ARM_PRESUME_NEON=0 + OPUS_ARM_MAY_HAVE_NEON=0 + ]) + AS_IF([test x"$enable_rtcd" = x"yes"],[ + AS_IF([test x"$OPUS_ARM_MAY_HAVE_EDSP" != x"1"],[ + AC_MSG_NOTICE( + [Trying to force-enable armv5e EDSP instructions...]) + AS_ASM_ARM_EDSP_FORCE([OPUS_ARM_MAY_HAVE_EDSP=1]) + ]) + AS_IF([test x"$OPUS_ARM_MAY_HAVE_MEDIA" != x"1"],[ + AC_MSG_NOTICE( + [Trying to force-enable ARMv6 media instructions...]) + AS_ASM_ARM_MEDIA_FORCE([OPUS_ARM_MAY_HAVE_MEDIA=1]) + ]) + AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON" != x"1"],[ + AC_MSG_NOTICE( + [Trying to force-enable NEON instructions...]) + AS_ASM_ARM_NEON_FORCE([OPUS_ARM_MAY_HAVE_NEON=1]) + ]) + ]) + rtcd_support= + AS_IF([test x"$OPUS_ARM_MAY_HAVE_EDSP" = x"1"],[ + AC_DEFINE(OPUS_ARM_MAY_HAVE_EDSP, 1, + [Define if assembler supports EDSP instructions]) + AS_IF([test x"$OPUS_ARM_PRESUME_EDSP" = x"1"],[ + AC_DEFINE(OPUS_ARM_PRESUME_EDSP, 1, + [Define if binary requires EDSP instruction support]) + asm_optimization="$asm_optimization (EDSP)" + ], + [rtcd_support="$rtcd_support (EDSP)"] + ) + ]) + AC_SUBST(OPUS_ARM_MAY_HAVE_EDSP) + AS_IF([test x"$OPUS_ARM_MAY_HAVE_MEDIA" = x"1"],[ + AC_DEFINE(OPUS_ARM_MAY_HAVE_MEDIA, 1, + [Define if assembler supports ARMv6 media instructions]) + AS_IF([test x"$OPUS_ARM_PRESUME_MEDIA" = x"1"],[ + AC_DEFINE(OPUS_ARM_PRESUME_MEDIA, 1, + [Define if binary requires ARMv6 media instruction support]) + asm_optimization="$asm_optimization (Media)" + ], + [rtcd_support="$rtcd_support (Media)"] + ) + ]) + AC_SUBST(OPUS_ARM_MAY_HAVE_MEDIA) + AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON" = x"1"],[ + AC_DEFINE(OPUS_ARM_MAY_HAVE_NEON, 1, + [Define if compiler supports NEON instructions]) + AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [ + AC_DEFINE(OPUS_ARM_PRESUME_NEON, 1, + [Define if binary requires NEON instruction support]) + asm_optimization="$asm_optimization (NEON)" + ], + [rtcd_support="$rtcd_support (NEON)"] + ) + ]) + AC_SUBST(OPUS_ARM_MAY_HAVE_NEON) + dnl Make sure turning on RTCD gets us at least one + dnl instruction set. + AS_IF([test x"$rtcd_support" != x""], + [rtcd_support=ARM"$rtcd_support"], + [rtcd_support="no"] + ) + ], + [ + AC_MSG_WARN( + [*** ARM assembly requires perl -- disabling optimizations]) + asm_optimization="(missing perl dependency for ARM)" + ]) + ]) ;; esac ],[ - asm_optimization="disabled" + inline_optimization="disabled" + asm_optimization="disabled" ]) AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) - -AC_ARG_ENABLE([rtcd], - [AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],, - [enable_rtcd=yes]) - -AS_IF([test "$enable_rtcd" = "yes"],[ - AS_IF([test "$rtcd_support" = "yes"],[ - AC_DEFINE([OPUS_HAVE_RTCD], [1], [Use run-time CPU capabilities detection]) - ],[ - rtcd_support="no rtcd for your platform, please send patches" +AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], + [test x"${inline_optimization:0:3}" = x"ARM"]) +AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], + [test x"${asm_optimization:0:3}" = x"ARM"]) + +AS_IF([test x"$enable_rtcd" = x"yes"],[ + AS_IF([test x"$rtcd_support" != x"no"],[ + AC_DEFINE([OPUS_HAVE_RTCD], [1], + [Use run-time CPU capabilities detection]) + OPUS_HAVE_RTCD=1 + AC_SUBST(OPUS_HAVE_RTCD) ]) ],[ - rtcd_support="no" + rtcd_support="disabled" ]) AC_ARG_ENABLE([assertions], @@ -300,9 +414,14 @@ AC_CHECK_FUNCS([__malloc_hook]) AC_SUBST([PC_BUILD]) - -AC_CONFIG_FILES([Makefile opus.pc opus-uninstalled.pc - doc/Makefile doc/Doxyfile]) +AC_CONFIG_FILES([ + Makefile + opus.pc + opus-uninstalled.pc + celt/arm/armopts.s + doc/Makefile + doc/Doxyfile +]) AC_CONFIG_HEADERS([config.h]) AC_OUTPUT @@ -316,13 +435,14 @@ AC_MSG_NOTICE([ C99 var arrays: ................ ${has_var_arrays} C99 lrintf: .................... ${ac_cv_func_lrintf} Use alloca: .................... ${use_alloca} - + General configuration: - + Floating point support: ........ ${enable_float} Fast float approximations: ..... ${enable_float_approx} Fixed point debugging: ......... ${enable_fixed_point_debug} - Assembly optimization: ......... ${asm_optimization} + Inline Assembly Optimizations: . ${inline_optimization} + External Assembly Optimizations: ${asm_optimization} Run-time CPU detection: ........ ${rtcd_support} Custom modes: .................. ${enable_custom_modes} Assertion checking: ............ ${enable_assertions} diff --git a/m4/as-gcc-inline-assembly.m4 b/m4/as-gcc-inline-assembly.m4 index a6c44a52..b0d2da41 100644 --- a/m4/as-gcc-inline-assembly.m4 +++ b/m4/as-gcc-inline-assembly.m4 @@ -42,6 +42,16 @@ AC_DEFUN([AS_ASM_ARM_NEON], $2]) ]) +AC_DEFUN([AS_ASM_ARM_NEON_FORCE], +[ + AC_MSG_CHECKING([if assembler supports NEON instructions on ARM]) + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv7-a\n.fpu neon\n.object_arch armv4t\nvorr d0,d0,d0")])], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) +]) AC_DEFUN([AS_ASM_ARM_MEDIA], [ @@ -54,6 +64,16 @@ AC_DEFUN([AS_ASM_ARM_MEDIA], $2]) ]) +AC_DEFUN([AS_ASM_ARM_MEDIA_FORCE], +[ + AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM]) + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv6\n.object_arch armv4t\nshadd8 r3,r3,r3")])], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) +]) AC_DEFUN([AS_ASM_ARM_EDSP], [ @@ -65,3 +85,14 @@ AC_DEFUN([AS_ASM_ARM_EDSP], [AC_MSG_RESULT([no]) $2]) ]) + +AC_DEFUN([AS_ASM_ARM_EDSP_FORCE], +[ + AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM]) + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[__asm__(".arch armv5te\n.object_arch armv4t\nqadd r3,r3,r3")])], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) +]) @@ -64,6 +64,7 @@ opus_int silk_Get_Encoder_Size( /* O Returns error co /*************************/ opus_int silk_InitEncoder( /* O Returns error code */ void *encState, /* I/O State */ + int arch, /* I Run-time architecture */ silk_EncControlStruct *encStatus /* O Encoder Status */ ); diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h index 97f79d45..1b580579 100644 --- a/silk/SigProc_FIX.h +++ b/silk/SigProc_FIX.h @@ -227,7 +227,8 @@ void silk_autocorr( opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *inputData, /* I Input data to correlate */ const opus_int inputDataSize, /* I Length of input */ - const opus_int correlationCount /* I Number of correlation taps to compute */ + const opus_int correlationCount, /* I Number of correlation taps to compute */ + int arch /* I Run-time architecture */ ); void silk_decode_pitch( @@ -249,7 +250,8 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0 const opus_int search_thres2_Q13, /* I Final threshold for lag candidates 0 - 1 */ const opus_int Fs_kHz, /* I Sample frequency (kHz) */ const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */ - const opus_int nb_subfr /* I number of 5 ms subframes */ + const opus_int nb_subfr, /* I number of 5 ms subframes */ + int arch /* I Run-time architecture */ ); /* Compute Normalized Line Spectral Frequencies (NLSFs) from whitening filter coefficients */ @@ -309,7 +311,8 @@ void silk_burg_modified( const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I Number of subframes stacked in x */ - const opus_int D /* I Order */ + const opus_int D, /* I Order */ + int arch /* I Run-time architecture */ ); /* Copy and multiply a vector by a constant */ @@ -576,11 +579,11 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) #include "MacroCount.h" #include "MacroDebug.h" -#ifdef ARMv4_ASM +#ifdef OPUS_ARM_INLINE_ASM #include "arm/SigProc_FIX_armv4.h" #endif -#ifdef ARMv5E_ASM +#ifdef OPUS_ARM_INLINE_EDSP #include "arm/SigProc_FIX_armv5e.h" #endif diff --git a/silk/enc_API.c b/silk/enc_API.c index b3424872..43739efc 100644 --- a/silk/enc_API.c +++ b/silk/enc_API.c @@ -69,6 +69,7 @@ opus_int silk_Get_Encoder_Size( /* O Returns error co /*************************/ opus_int silk_InitEncoder( /* O Returns error code */ void *encState, /* I/O State */ + int arch, /* I Run-time architecture */ silk_EncControlStruct *encStatus /* O Encoder Status */ ) { @@ -80,7 +81,7 @@ opus_int silk_InitEncoder( /* O Returns error co /* Reset encoder */ silk_memset( psEnc, 0, sizeof( silk_encoder ) ); for( n = 0; n < ENCODER_NUM_CHANNELS; n++ ) { - if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ] ) ) { + if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ], arch ) ) { silk_assert( 0 ); } } @@ -174,7 +175,7 @@ opus_int silk_Encode( /* O Returns error co if( encControl->nChannelsInternal > psEnc->nChannelsInternal ) { /* Mono -> Stereo transition: init state of second channel and stereo state */ - ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ] ); + ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ], psEnc->state_Fxx[ 0 ].sCmn.arch ); silk_memset( psEnc->sStereo.pred_prev_Q13, 0, sizeof( psEnc->sStereo.pred_prev_Q13 ) ); silk_memset( psEnc->sStereo.sSide, 0, sizeof( psEnc->sStereo.sSide ) ); psEnc->sStereo.mid_side_amp_Q0[ 0 ] = 0; @@ -206,9 +207,8 @@ opus_int silk_Encode( /* O Returns error co } /* Reset Encoder */ for( n = 0; n < encControl->nChannelsInternal; n++ ) { - if( (ret = silk_init_encoder( &psEnc->state_Fxx[ n ] ) ) != 0 ) { - silk_assert( 0 ); - } + ret = silk_init_encoder( &psEnc->state_Fxx[ n ], psEnc->state_Fxx[ n ].sCmn.arch ); + silk_assert( !ret ); } tmp_payloadSize_ms = encControl->payloadSize_ms; encControl->payloadSize_ms = 10; diff --git a/silk/fixed/autocorr_FIX.c b/silk/fixed/autocorr_FIX.c index dec3cc0b..de95c986 100644 --- a/silk/fixed/autocorr_FIX.c +++ b/silk/fixed/autocorr_FIX.c @@ -38,10 +38,11 @@ void silk_autocorr( opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *inputData, /* I Input data to correlate */ const opus_int inputDataSize, /* I Length of input */ - const opus_int correlationCount /* I Number of correlation taps to compute */ + const opus_int correlationCount, /* I Number of correlation taps to compute */ + int arch /* I Run-time architecture */ ) { opus_int corrCount; corrCount = silk_min_int( inputDataSize, correlationCount ); - *scale = _celt_autocorr(inputData, results, NULL, 0, corrCount-1, inputDataSize); + *scale = _celt_autocorr(inputData, results, NULL, 0, corrCount-1, inputDataSize, arch); } diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c index f5112b64..db348295 100644 --- a/silk/fixed/burg_modified_FIX.c +++ b/silk/fixed/burg_modified_FIX.c @@ -50,7 +50,8 @@ void silk_burg_modified( const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I Number of subframes stacked in x */ - const opus_int D /* I Order */ + const opus_int D, /* I Order */ + int arch /* I Run-time architecture */ ) { opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; @@ -98,7 +99,7 @@ void silk_burg_modified( int i; opus_int32 d; x_ptr = x + s * subfr_length; - celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D ); + celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch ); for( n = 1; n < D + 1; n++ ) { for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ ) d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] ); diff --git a/silk/fixed/encode_frame_FIX.c b/silk/fixed/encode_frame_FIX.c index 9c57a10a..b490986b 100644 --- a/silk/fixed/encode_frame_FIX.c +++ b/silk/fixed/encode_frame_FIX.c @@ -132,12 +132,12 @@ opus_int silk_encode_frame_FIX( /*****************************************/ /* Find pitch lags, initial LPC analysis */ /*****************************************/ - silk_find_pitch_lags_FIX( psEnc, &sEncCtrl, res_pitch, x_frame ); + silk_find_pitch_lags_FIX( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch ); /************************/ /* Noise shape analysis */ /************************/ - silk_noise_shape_analysis_FIX( psEnc, &sEncCtrl, res_pitch_frame, x_frame ); + silk_noise_shape_analysis_FIX( psEnc, &sEncCtrl, res_pitch_frame, x_frame, psEnc->sCmn.arch ); /***************************************************/ /* Find linear prediction coefficients (LPC + LTP) */ diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c index 70cefb6b..783d32e2 100644 --- a/silk/fixed/find_LPC_FIX.c +++ b/silk/fixed/find_LPC_FIX.c @@ -60,13 +60,13 @@ void silk_find_LPC_FIX( psEncC->indices.NLSFInterpCoef_Q2 = 4; /* Burg AR analysis for the full frame */ - silk_burg_modified( &res_nrg, &res_nrg_Q, a_Q16, x, minInvGain_Q30, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder ); + silk_burg_modified( &res_nrg, &res_nrg_Q, a_Q16, x, minInvGain_Q30, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, psEncC->arch ); if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) { VARDECL( opus_int16, LPC_res ); /* Optimal solution for last 10 ms */ - silk_burg_modified( &res_tmp_nrg, &res_tmp_nrg_Q, a_tmp_Q16, x + 2 * subfr_length, minInvGain_Q30, subfr_length, 2, psEncC->predictLPCOrder ); + silk_burg_modified( &res_tmp_nrg, &res_tmp_nrg_Q, a_tmp_Q16, x + 2 * subfr_length, minInvGain_Q30, subfr_length, 2, psEncC->predictLPCOrder, psEncC->arch ); /* subtract residual energy here, as that's easier than adding it to the */ /* residual energy of the first 10 ms in each iteration of the search below */ diff --git a/silk/fixed/find_pitch_lags_FIX.c b/silk/fixed/find_pitch_lags_FIX.c index f60b4364..620f8dcd 100644 --- a/silk/fixed/find_pitch_lags_FIX.c +++ b/silk/fixed/find_pitch_lags_FIX.c @@ -38,7 +38,8 @@ void silk_find_pitch_lags_FIX( silk_encoder_state_FIX *psEnc, /* I/O encoder state */ silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ opus_int16 res[], /* O residual */ - const opus_int16 x[] /* I Speech signal */ + const opus_int16 x[], /* I Speech signal */ + int arch /* I Run-time architecture */ ) { opus_int buf_len, i, scale; @@ -86,7 +87,7 @@ void silk_find_pitch_lags_FIX( silk_apply_sine_window( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch ); /* Calculate autocorrelation sequence */ - silk_autocorr( auto_corr, &scale, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 ); + silk_autocorr( auto_corr, &scale, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch ); /* Add white noise, as fraction of energy */ auto_corr[ 0 ] = silk_SMLAWB( auto_corr[ 0 ], auto_corr[ 0 ], SILK_FIX_CONST( FIND_PITCH_WHITE_NOISE_FRACTION, 16 ) ) + 1; @@ -127,7 +128,8 @@ void silk_find_pitch_lags_FIX( /*****************************************/ if( silk_pitch_analysis_core( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex, &psEnc->sCmn.indices.contourIndex, &psEnc->LTPCorr_Q15, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16, - (opus_int)thrhld_Q13, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr ) == 0 ) + (opus_int)thrhld_Q13, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr, + psEnc->sCmn.arch) == 0 ) { psEnc->sCmn.indices.signalType = TYPE_VOICED; } else { diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h index d999b13f..a56ca07a 100644 --- a/silk/fixed/main_FIX.h +++ b/silk/fixed/main_FIX.h @@ -73,7 +73,8 @@ opus_int silk_encode_frame_FIX( /* Initializes the Silk encoder state */ opus_int silk_init_encoder( - silk_encoder_state_Fxx *psEnc /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk FIX encoder state */ + int arch /* I Run-time architecture */ ); /* Control the Silk encoder */ @@ -104,7 +105,8 @@ void silk_noise_shape_analysis_FIX( silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */ silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */ const opus_int16 *pitch_res, /* I LPC residual from pitch analysis */ - const opus_int16 *x /* I Input signal [ frame_length + la_shape ] */ + const opus_int16 *x, /* I Input signal [ frame_length + la_shape ] */ + int arch /* I Run-time architecture */ ); /* Autocorrelations for a warped frequency axis */ @@ -132,7 +134,8 @@ void silk_find_pitch_lags_FIX( silk_encoder_state_FIX *psEnc, /* I/O encoder state */ silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ opus_int16 res[], /* O residual */ - const opus_int16 x[] /* I Speech signal */ + const opus_int16 x[], /* I Speech signal */ + int arch /* I Run-time architecture */ ); /* Find LPC and LTP coefficients */ diff --git a/silk/fixed/noise_shape_analysis_FIX.c b/silk/fixed/noise_shape_analysis_FIX.c index d381fa3d..e24d2e9d 100644 --- a/silk/fixed/noise_shape_analysis_FIX.c +++ b/silk/fixed/noise_shape_analysis_FIX.c @@ -145,7 +145,8 @@ void silk_noise_shape_analysis_FIX( silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */ silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */ const opus_int16 *pitch_res, /* I LPC residual from pitch analysis */ - const opus_int16 *x /* I Input signal [ frame_length + la_shape ] */ + const opus_int16 *x, /* I Input signal [ frame_length + la_shape ] */ + int arch /* I Run-time architecture */ ) { silk_shape_state_FIX *psShapeSt = &psEnc->sShape; @@ -281,7 +282,7 @@ void silk_noise_shape_analysis_FIX( silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); } else { /* Calculate regular auto correlation */ - silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 ); + silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch ); } /* Add white noise, as a fraction of energy */ diff --git a/silk/fixed/pitch_analysis_core_FIX.c b/silk/fixed/pitch_analysis_core_FIX.c index b2c6cbb4..1641a0fb 100644 --- a/silk/fixed/pitch_analysis_core_FIX.c +++ b/silk/fixed/pitch_analysis_core_FIX.c @@ -62,7 +62,8 @@ static void silk_P_Ana_calc_corr_st3( opus_int start_lag, /* I lag offset to search around */ opus_int sf_length, /* I length of a 5 ms subframe */ opus_int nb_subfr, /* I number of subframes */ - opus_int complexity /* I Complexity setting */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ ); static void silk_P_Ana_calc_energy_st3( @@ -88,7 +89,8 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0 const opus_int search_thres2_Q13, /* I Final threshold for lag candidates 0 - 1 */ const opus_int Fs_kHz, /* I Sample frequency (kHz) */ const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */ - const opus_int nb_subfr /* I number of 5 ms subframes */ + const opus_int nb_subfr, /* I number of 5 ms subframes */ + int arch /* I Run-time architecture */ ) { VARDECL( opus_int16, frame_8kHz ); @@ -189,7 +191,7 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0 silk_assert( basis_ptr >= frame_4kHz ); silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz ); - celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1 ); + celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1, arch ); /* Calculate first vector products before loop */ cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ]; @@ -516,7 +518,7 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0 /* Calculate the correlations and energies needed in stage 3 */ ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals ); ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals ); - silk_P_Ana_calc_corr_st3( cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity ); + silk_P_Ana_calc_corr_st3( cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch ); silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity ); lag_counter = 0; @@ -597,7 +599,8 @@ static void silk_P_Ana_calc_corr_st3( opus_int start_lag, /* I lag offset to search around */ opus_int sf_length, /* I length of a 5 ms subframe */ opus_int nb_subfr, /* I number of subframes */ - opus_int complexity /* I Complexity setting */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ ) { const opus_int16 *target_ptr; @@ -634,7 +637,7 @@ static void silk_P_Ana_calc_corr_st3( lag_low = matrix_ptr( Lag_range_ptr, k, 0, 2 ); lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 ); silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE); - celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1 ); + celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1, arch ); for( j = lag_low; j <= lag_high; j++ ) { silk_assert( lag_counter < SCRATCH_SIZE ); scratch_mem[ lag_counter ] = xcorr32[ lag_high - j ]; diff --git a/silk/float/SigProc_FLP.h b/silk/float/SigProc_FLP.h index e7131f14..f0cb3733 100644 --- a/silk/float/SigProc_FLP.h +++ b/silk/float/SigProc_FLP.h @@ -94,7 +94,8 @@ opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, const silk_float search_thres2, /* I Final threshold for lag candidates 0 - 1 */ const opus_int Fs_kHz, /* I sample frequency (kHz) */ const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */ - const opus_int nb_subfr /* I Number of 5 ms subframes */ + const opus_int nb_subfr, /* I Number of 5 ms subframes */ + int arch /* I Run-time architecture */ ); void silk_insertion_sort_decreasing_FLP( diff --git a/silk/float/encode_frame_FLP.c b/silk/float/encode_frame_FLP.c index df75db91..d54e2686 100644 --- a/silk/float/encode_frame_FLP.c +++ b/silk/float/encode_frame_FLP.c @@ -129,7 +129,7 @@ opus_int silk_encode_frame_FLP( /*****************************************/ /* Find pitch lags, initial LPC analysis */ /*****************************************/ - silk_find_pitch_lags_FLP( psEnc, &sEncCtrl, res_pitch, x_frame ); + silk_find_pitch_lags_FLP( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch ); /************************/ /* Noise shape analysis */ diff --git a/silk/float/find_pitch_lags_FLP.c b/silk/float/find_pitch_lags_FLP.c index 455db82f..f3b22d25 100644 --- a/silk/float/find_pitch_lags_FLP.c +++ b/silk/float/find_pitch_lags_FLP.c @@ -37,7 +37,8 @@ void silk_find_pitch_lags_FLP( silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ silk_float res[], /* O Residual */ - const silk_float x[] /* I Speech signal */ + const silk_float x[], /* I Speech signal */ + int arch /* I Run-time architecture */ ) { opus_int buf_len; @@ -116,7 +117,7 @@ void silk_find_pitch_lags_FLP( /*****************************************/ if( silk_pitch_analysis_core_FLP( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex, &psEnc->sCmn.indices.contourIndex, &psEnc->LTPCorr, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16 / 65536.0f, - thrhld, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr ) == 0 ) + thrhld, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr, arch ) == 0 ) { psEnc->sCmn.indices.signalType = TYPE_VOICED; } else { diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h index dbd34aef..71d58c20 100644 --- a/silk/float/main_FLP.h +++ b/silk/float/main_FLP.h @@ -71,7 +71,8 @@ opus_int silk_encode_frame_FLP( /* Initializes the Silk encoder state */ opus_int silk_init_encoder( - silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */ + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + int arch /* I Run-tim architecture */ ); /* Control the Silk encoder */ @@ -129,7 +130,8 @@ void silk_find_pitch_lags_FLP( silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ silk_float res[], /* O Residual */ - const silk_float x[] /* I Speech signal */ + const silk_float x[], /* I Speech signal */ + int arch /* I Run-time architecture */ ); /* Find LPC and LTP coefficients */ diff --git a/silk/float/pitch_analysis_core_FLP.c b/silk/float/pitch_analysis_core_FLP.c index b6bafe81..e58f041b 100644 --- a/silk/float/pitch_analysis_core_FLP.c +++ b/silk/float/pitch_analysis_core_FLP.c @@ -48,7 +48,8 @@ static void silk_P_Ana_calc_corr_st3( opus_int start_lag, /* I start lag */ opus_int sf_length, /* I sub frame length */ opus_int nb_subfr, /* I number of subframes */ - opus_int complexity /* I Complexity setting */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ ); static void silk_P_Ana_calc_energy_st3( @@ -74,7 +75,8 @@ opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, const silk_float search_thres2, /* I Final threshold for lag candidates 0 - 1 */ const opus_int Fs_kHz, /* I sample frequency (kHz) */ const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */ - const opus_int nb_subfr /* I Number of 5 ms subframes */ + const opus_int nb_subfr, /* I Number of 5 ms subframes */ + int arch /* I Run-time architecture */ ) { opus_int i, k, d, j; @@ -176,7 +178,7 @@ opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, silk_assert( basis_ptr >= frame_4kHz ); silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz ); - celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1 ); + celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1, arch ); /* Calculate first vector products before loop */ cross_corr = xcorr[ max_lag_4kHz - min_lag_4kHz ]; @@ -409,7 +411,7 @@ opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, CCmax = -1000.0f; /* Calculate the correlations and energies needed in stage 3 */ - silk_P_Ana_calc_corr_st3( cross_corr_st3, frame, start_lag, sf_length, nb_subfr, complexity ); + silk_P_Ana_calc_corr_st3( cross_corr_st3, frame, start_lag, sf_length, nb_subfr, complexity, arch ); silk_P_Ana_calc_energy_st3( energies_st3, frame, start_lag, sf_length, nb_subfr, complexity ); lag_counter = 0; @@ -493,7 +495,8 @@ static void silk_P_Ana_calc_corr_st3( opus_int start_lag, /* I start lag */ opus_int sf_length, /* I sub frame length */ opus_int nb_subfr, /* I number of subframes */ - opus_int complexity /* I Complexity setting */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ ) { const silk_float *target_ptr; @@ -527,7 +530,7 @@ static void silk_P_Ana_calc_corr_st3( lag_low = matrix_ptr( Lag_range_ptr, k, 0, 2 ); lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 ); silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE); - celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1 ); + celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1, arch ); for( j = lag_low; j <= lag_high; j++ ) { silk_assert( lag_counter < SCRATCH_SIZE ); scratch_mem[ lag_counter ] = xcorr[ lag_high - j ]; diff --git a/silk/init_encoder.c b/silk/init_encoder.c index 7a4d918c..65995c33 100644 --- a/silk/init_encoder.c +++ b/silk/init_encoder.c @@ -34,12 +34,14 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FLP.h" #endif #include "tuning_parameters.h" +#include "cpu_support.h" /*********************************/ /* Initialize Silk Encoder state */ /*********************************/ opus_int silk_init_encoder( - silk_encoder_state_Fxx *psEnc /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk FIX encoder state */ + int arch /* I Run-time architecture */ ) { opus_int ret = 0; @@ -47,6 +49,8 @@ opus_int silk_init_encoder( /* Clear the entire encoder state */ silk_memset( psEnc, 0, sizeof( silk_encoder_state_Fxx ) ); + psEnc->sCmn.arch = arch; + psEnc->sCmn.variable_HP_smth1_Q15 = silk_LSHIFT( silk_lin2log( SILK_FIX_CONST( VARIABLE_HP_MIN_CUTOFF_HZ, 16 ) ) - ( 16 << 7 ), 8 ); psEnc->sCmn.variable_HP_smth2_Q15 = psEnc->sCmn.variable_HP_smth1_Q15; diff --git a/silk/macros.h b/silk/macros.h index 976f3b7d..a84e5a5d 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -103,11 +103,11 @@ static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32) (*((Matrix_base_adr) + ((row)+(M)*(column)))) #endif -#ifdef ARMv4_ASM +#ifdef OPUS_ARM_INLINE_ASM #include "arm/macros_armv4.h" #endif -#ifdef ARMv5E_ASM +#ifdef OPUS_ARM_INLINE_EDSP #include "arm/macros_armv5e.h" #endif diff --git a/silk/structs.h b/silk/structs.h index 4ccd0c1a..aa84a528 100644 --- a/silk/structs.h +++ b/silk/structs.h @@ -191,6 +191,8 @@ typedef struct { SideInfoIndices indices; opus_int8 pulses[ MAX_FRAME_LENGTH ]; + int arch; + /* Input/output buffering */ opus_int16 inputBuf[ MAX_FRAME_LENGTH + 2 ]; /* Buffer containing input signal */ opus_int inputBufIx; diff --git a/src/opus_decoder.c b/src/opus_decoder.c index b66f3b69..919ba521 100644 --- a/src/opus_decoder.c +++ b/src/opus_decoder.c @@ -75,7 +75,6 @@ struct OpusDecoder { #endif opus_uint32 rangeFinal; - int arch; }; #ifdef FIXED_POINT @@ -125,7 +124,6 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels) st->Fs = Fs; st->DecControl.API_sampleRate = st->Fs; st->DecControl.nChannelsAPI = st->channels; - st->arch = opus_select_arch(); /* Reset decoder */ ret = silk_InitDecoder( silk_dec ); diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 036143b8..7e3d9578 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -104,7 +104,7 @@ struct OpusEncoder { int analysis_offset; #endif opus_uint32 rangeFinal; - int arch; + int arch; }; /* Transition tables for the voice and music. First column is the @@ -188,7 +188,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat st->arch = opus_select_arch(); - ret = silk_InitEncoder( silk_enc, &st->silk_mode ); + ret = silk_InitEncoder( silk_enc, st->arch, &st->silk_mode ); if(ret)return OPUS_INTERNAL_ERROR; /* default SILK parameters */ @@ -209,7 +209,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat /* Create CELT encoder */ /* Initialize CELT encoder */ - err = celt_encoder_init(celt_enc, Fs, channels); + err = celt_encoder_init(celt_enc, Fs, channels, st->arch); if(err!=OPUS_OK)return OPUS_INTERNAL_ERROR; celt_encoder_ctl(celt_enc, CELT_SET_SIGNALLING(0)); @@ -1219,7 +1219,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->mode != MODE_CELT_ONLY && st->prev_mode == MODE_CELT_ONLY) { silk_EncControlStruct dummy; - silk_InitEncoder( silk_enc, &dummy); + silk_InitEncoder( silk_enc, st->arch, &dummy); prefill=1; } @@ -2418,7 +2418,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) ((char*)&st->OPUS_ENCODER_RESET_START - (char*)st)); celt_encoder_ctl(celt_enc, OPUS_RESET_STATE); - silk_InitEncoder( silk_enc, &dummy ); + silk_InitEncoder( silk_enc, st->arch, &dummy ); st->stream_channels = st->channels; st->hybrid_stereo_width_Q14 = 1 << 14; st->prev_HB_gain = Q15ONE; |