github.com/mpc-hc/mpc-hc.git
author    povaddict <povaddict@users.sourceforge.net>    2010-02-10 02:16:44 +0300
committer povaddict <povaddict@users.sourceforge.net>    2010-02-10 02:16:44 +0300
commit    726a91b12a7524e45e7a901c9e4883af5b1bffe6 (patch)
tree      f5d25e3b2e84c92f4901280c73d5d3d7e6c3cd19 /src/filters/transform/MPCVideoDec/ffmpeg/libvo
parent    02183f6e47ad4ea1057de9950482f291f2ae4290 (diff)
Rename several directories to use MixedCase instead of lowercase.
They now mostly match the case used in #includes, and they're consistent with the names of the .h files they contain.

git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@1648 10f7b99b-c216-0410-bff0-8a66a9350fd8
Diffstat (limited to 'src/filters/transform/MPCVideoDec/ffmpeg/libvo')
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib.c           | 151
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib_template.c  | 361
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libvo/fastmemcpy.h      |  42
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libvo/libvoinit.c       |  22
4 files changed, 576 insertions, 0 deletions
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib.c b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib.c
new file mode 100644
index 000000000..8ea7b670b
--- /dev/null
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib.c
@@ -0,0 +1,151 @@
+#include "../libswscale/config.h"
+#ifdef USE_FASTMEMCPY
+
+/*
+ aclib - advanced C library ;)
+ This file contains functions which improve and expand the standard C library.
+ See aclib_template.c ... this file only contains the runtime CPU detection and config options.
+ Runtime CPU detection by Michael Niedermayer (michaelni@gmx.at) is under GPL.
+*/
+#include <string.h>
+#include <stddef.h>
+#include "../libavutil/x86_cpu.h"
+#include "../cpudetect.h"
+#include "fastmemcpy.h"
+#undef memcpy
+#include "ffImgfmt.h"
+
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+#if ARCH_X86_32 || ARCH_X86_64
+#define CAN_COMPILE_X86_ASM
+#endif
+
+//Note: we have MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
+//Plain C versions
+//#if !HAVE_MMX || defined (RUNTIME_CPUDETECT)
+//#define COMPILE_C
+//#endif
+
+#ifdef CAN_COMPILE_X86_ASM
+
+#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
+#define COMPILE_MMX
+#endif
+
+#if (HAVE_MMX2 && !HAVE_SSE2) || defined (RUNTIME_CPUDETECT)
+#define COMPILE_MMX2
+#endif
+
+#if (HAVE_AMD3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
+#define COMPILE_3DNOW
+#endif
+
+#if HAVE_SSE2 || defined (RUNTIME_CPUDETECT)
+#define COMPILE_SSE
+#endif
+
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#undef HAVE_SSE
+#undef HAVE_SSE2
+/*
+#ifdef COMPILE_C
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#undef ARCH_X86
+#define RENAME(a) a ## _C
+#include "aclib_template.c"
+#endif
+*/
+//MMX versions
+#ifdef COMPILE_MMX
+#undef RENAME
+#define HAVE_MMX 1
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#undef HAVE_SSE
+#undef HAVE_SSE2
+#define RENAME(a) a ## _MMX
+#include "aclib_template.c"
+#endif
+
+//MMX2 versions
+#ifdef COMPILE_MMX2
+#undef RENAME
+#define HAVE_MMX 1
+#define HAVE_MMX2 1
+#undef HAVE_AMD3DNOW
+#undef HAVE_SSE
+#undef HAVE_SSE2
+#define RENAME(a) a ## _MMX2
+#include "aclib_template.c"
+#endif
+
+//3DNOW versions
+#ifdef COMPILE_3DNOW
+#undef RENAME
+#define HAVE_MMX 1
+#undef HAVE_MMX2
+#define HAVE_AMD3DNOW 1
+#undef HAVE_SSE
+#undef HAVE_SSE2
+#define RENAME(a) a ## _3DNow
+#include "aclib_template.c"
+#endif
+
+//SSE versions (only used on SSE2 cpus)
+#ifdef COMPILE_SSE
+#undef RENAME
+#define HAVE_MMX 1
+#define HAVE_MMX2 1
+#undef HAVE_AMD3DNOW
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define RENAME(a) a ## _SSE
+#include "aclib_template.c"
+#endif
+
+#endif // CAN_COMPILE_X86_ASM
+
+void* (*fast_memcpy)(void * to, const void * from, size_t len)=NULL;
+
+void init_fast_memcpy(void)
+{
+#ifdef RUNTIME_CPUDETECT
+#ifdef CAN_COMPILE_X86_ASM
+ // ordered by speed, fastest first
+ if(gCpuCaps.hasSSE2)
+ fast_memcpy=fast_memcpy_SSE;
+ else if(gCpuCaps.hasMMX2)
+ fast_memcpy=fast_memcpy_MMX2;
+ else if(gCpuCaps.has3DNow)
+ fast_memcpy=fast_memcpy_3DNow;
+ else if(gCpuCaps.hasMMX)
+ fast_memcpy=fast_memcpy_MMX;
+ else
+#endif //CAN_COMPILE_X86_ASM
+ fast_memcpy=memcpy; // without MMX we fall back to the standard memcpy
+#else
+#if HAVE_SSE2
+ fast_memcpy=fast_memcpy_SSE;
+#elif HAVE_MMX2
+ fast_memcpy=fast_memcpy_MMX2;
+#elif HAVE_AMD3DNOW
+ fast_memcpy=fast_memcpy_3DNow;
+#elif HAVE_MMX
+ fast_memcpy=fast_memcpy_MMX;
+#else
+ fast_memcpy=memcpy; // without MMX we fall back to the standard memcpy
+#endif
+
+#endif //!RUNTIME_CPUDETECT
+}
+
+#endif /* use fastmemcpy */
+
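For orientation, here is a minimal sketch of how a caller would use this dispatcher. The main() driver below is hypothetical; only init_fast_memcpy() and the fast_memcpy pointer come from the file above (in this tree, init_libvo() in libvoinit.c fills gCpuCaps and calls init_fast_memcpy() for you):

/* Hypothetical caller (illustration only), built with USE_FASTMEMCPY defined
 * and with the libvo headers on the include path. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "fastmemcpy.h"

int main(void)
{
    enum { N = 1 << 20 };
    unsigned char *src = malloc(N), *dst = malloc(N);
    if (!src || !dst)
        return 1;
    memset(src, 0xAB, N);

    /* Select the best copy routine once; with RUNTIME_CPUDETECT this reads
     * gCpuCaps, which init_libvo() (see libvoinit.c below) has filled in. */
    init_fast_memcpy();

    /* From here on, fast_memcpy() -- and plain memcpy(), which fastmemcpy.h
     * redirects to it -- uses the SSE/MMX2/3DNow/MMX path chosen above. */
    fast_memcpy(dst, src, N);

    printf("copied %d bytes, first byte 0x%02X\n", N, dst[0]);
    free(src);
    free(dst);
    return 0;
}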
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib_template.c b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib_template.c
new file mode 100644
index 000000000..27826f328
--- /dev/null
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/aclib_template.c
@@ -0,0 +1,361 @@
+/*
+ aclib - advanced C library ;)
+ This file contains functions which improve and expand the standard C library.
+*/
+
+#ifndef HAVE_SSE2
+/*
+ The P3 processor has only one SSE decoder, so it can execute only 1 SSE insn per
+ cpu clock, but it has 3 MMX decoders (including the load/store unit)
+ and executes 3 MMX insns per cpu clock.
+ The P4 processor has some chances, but after reading
+ http://www.emulators.com/pentium4.htm
+ I have doubts. Anyway, the SSE2 version of this code could be written better.
+*/
+#undef HAVE_SSE
+#endif
+
+
+/*
+ This part of the code was taken by me from Linux-2.4.3 and slightly modified
+for the MMX, MMX2 and SSE instruction sets. I did this because Linux uses page-aligned
+blocks while MPlayer uses weakly ordered data, so the original code could not
+speed it up. Only using PREFETCHNTA and MOVNTQ together has an effect!
+
+From IA-32 Intel Architecture Software Developer's Manual Volume 1,
+
+Order Number 245470:
+"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
+
+Data referenced by a program can be temporal (data will be used again) or
+non-temporal (data will be referenced once and not reused in the immediate
+future). To make efficient use of the processor's caches, it is generally
+desirable to cache temporal data and not cache non-temporal data. Overloading
+the processor's caches with non-temporal data is sometimes referred to as
+"polluting the caches".
+The non-temporal data is written to memory with Write-Combining semantics.
+
+The PREFETCHh instruction permits a program to load data into the processor
+at a suggested cache level, so that it is closer to the processor's load and
+store unit when it is needed. If the data is already present in a level of
+the cache hierarchy that is closer to the processor, the PREFETCHh instruction
+will not result in any data movement.
+But we should use PREFETCHNTA: it fetches non-temporal data into a location
+close to the processor, minimizing cache pollution.
+
+The MOVNTQ (store quadword using non-temporal hint) instruction stores
+packed integer data from an MMX register to memory, using a non-temporal hint.
+The MOVNTPS (store packed single-precision floating-point values using
+non-temporal hint) instruction stores packed floating-point data from an
+XMM register to memory, using a non-temporal hint.
+
+The SFENCE (Store Fence) instruction controls write ordering by creating a
+fence for memory store operations. This instruction guarantees that the results
+of every store instruction that precedes the store fence in program order are
+globally visible before any store instruction that follows the fence. The
+SFENCE instruction provides an efficient way of ensuring ordering between
+procedures that produce weakly-ordered data and procedures that consume that
+data.
+
+If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
+*/
+
+// 3dnow memcpy support from kernel 2.4.2
+// by Pontscho/fresh!mindworkz
+
+
+#undef HAVE_MMX1
+#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_AMD3DNOW) && !defined(HAVE_SSE)
+/* means: MMX v.1. Note: since we added alignment of the destination, it speeds up
+ memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
+ standard (non-MMX-optimized) version.
+ Note: on K6-2+ it speeds up memory copying by up to 25%, and
+ on K7 and P3 by about 500% (5 times). */
+#define HAVE_MMX1
+#endif
+
+
+#undef HAVE_K6_2PLUS
+#if !defined( HAVE_MMX2) && defined( HAVE_AMD3DNOW)
+#define HAVE_K6_2PLUS
+#endif
+
+/* for small memory blocks (<256 bytes) this version is faster */
+#define small_memcpy(to,from,n)\
+{\
+register unsigned long int dummy;\
+__asm__ __volatile__(\
+ "rep; movsb"\
+ :"=&D"(to), "=&S"(from), "=&c"(dummy)\
+/* It's the most portable way to notify the compiler */\
+/* that edi, esi and ecx are clobbered in asm block. */\
+/* Thanks to A'rpi for hint!!! */\
+ :"0" (to), "1" (from),"2" (n)\
+ : "memory");\
+}
+
+#undef MMREG_SIZE
+#ifdef HAVE_SSE
+#define MMREG_SIZE 16
+#else
+#define MMREG_SIZE 64 //8
+#endif
+
+#undef PREFETCH
+#undef EMMS
+
+#ifdef HAVE_MMX2
+#define PREFETCH "prefetchnta"
+#elif defined ( HAVE_AMD3DNOW )
+#define PREFETCH "prefetch"
+#else
+#define PREFETCH "/nop"
+#endif
+
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
+#ifdef HAVE_AMD3DNOW
+#define EMMS "femms"
+#else
+#define EMMS "emms"
+#endif
+
+#undef MOVNTQ
+#ifdef HAVE_MMX2
+#define MOVNTQ "movntq"
+#else
+#define MOVNTQ "movq"
+#endif
+
+#undef MIN_LEN
+#ifdef HAVE_MMX1
+#define MIN_LEN 0x800 /* 2K blocks */
+#else
+#define MIN_LEN 0x40 /* 64-byte blocks */
+#endif
+
+static inline void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
+{
+ void *retval;
+ size_t i;
+ retval = to;
+#ifdef STATISTICS
+ {
+ static int freq[33];
+ static int t=0;
+ int i;
+ for(i=0; len>(1<<i); i++);
+ freq[i]++;
+ t++;
+ if(1024*1024*1024 % t == 0)
+ for(i=0; i<32; i++)
+ ;//printf("freq < %8d %4d\n", 1<<i, freq[i]);
+ }
+#endif
+#ifndef HAVE_MMX1
+ /* PREFETCH has effect even for MOVSB instruction ;) */
+ __asm__ __volatile__ (
+ PREFETCH" (%0)\n"
+ PREFETCH" 64(%0)\n"
+ PREFETCH" 128(%0)\n"
+ PREFETCH" 192(%0)\n"
+ PREFETCH" 256(%0)\n"
+ : : "r" (from) );
+#endif
+ if(len >= MIN_LEN)
+ {
+ register unsigned long int delta;
+ /* Align destination to MMREG_SIZE boundary */
+ delta = ((unsigned long int)to)&(MMREG_SIZE-1);
+ if(delta)
+ {
+ delta=MMREG_SIZE-delta;
+ len -= delta;
+ small_memcpy(to, from, delta);
+ }
+ i = len >> 6; /* len/64 */
+ len&=63;
+ /*
+ This algorithm is most effective when the code reads and writes
+ blocks whose size matches the cache line size.
+ The cache line size is processor-dependent; it will, however,
+ be at least 32 bytes on any processor.
+ It would be better for the number of read and write instructions
+ to be a multiple of the number of the processor's decoders,
+ but that is not always possible.
+ */
+#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
+ if(((unsigned long)from) & 15)
+ /* if SRC is misaligned */
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ PREFETCH" 320(%0)\n"
+ "movups (%0), %%xmm0\n"
+ "movups 16(%0), %%xmm1\n"
+ "movups 32(%0), %%xmm2\n"
+ "movups 48(%0), %%xmm3\n"
+ "movntps %%xmm0, (%1)\n"
+ "movntps %%xmm1, 16(%1)\n"
+ "movntps %%xmm2, 32(%1)\n"
+ "movntps %%xmm3, 48(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from=((const unsigned char *) from)+64;
+ to=((unsigned char *)to)+64;
+ }
+ else
+ /*
+ Only if SRC is aligned on a 16-byte boundary.
+ This allows using movaps instead of movups, which requires the data
+ to be aligned; otherwise a general-protection exception (#GP) is generated.
+ */
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ PREFETCH" 320(%0)\n"
+ "movaps (%0), %%xmm0\n"
+ "movaps 16(%0), %%xmm1\n"
+ "movaps 32(%0), %%xmm2\n"
+ "movaps 48(%0), %%xmm3\n"
+ "movntps %%xmm0, (%1)\n"
+ "movntps %%xmm1, 16(%1)\n"
+ "movntps %%xmm2, 32(%1)\n"
+ "movntps %%xmm3, 48(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from=((const unsigned char *)from)+64;
+ to=((unsigned char *)to)+64;
+ }
+#else
+ // Align destination at BLOCK_SIZE boundary
+ for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+ {
+ __asm__ __volatile__ (
+#ifndef HAVE_MMX1
+ PREFETCH" 320(%0)\n"
+#endif
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ MOVNTQ" %%mm0, (%1)\n"
+ MOVNTQ" %%mm1, 8(%1)\n"
+ MOVNTQ" %%mm2, 16(%1)\n"
+ MOVNTQ" %%mm3, 24(%1)\n"
+ MOVNTQ" %%mm4, 32(%1)\n"
+ MOVNTQ" %%mm5, 40(%1)\n"
+ MOVNTQ" %%mm6, 48(%1)\n"
+ MOVNTQ" %%mm7, 56(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from=((const unsigned char *)from)+64;
+ to=((unsigned char *)to)+64;
+ }
+
+// printf(" %d %d\n", (int)from&1023, (int)to&1023);
+ // Pure assembly because gcc is a bit unpredictable ;)
+ if(i>=BLOCK_SIZE/64)
+ asm volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ "movl (%0, %%"REG_a"), %%ebx \n\t"
+ "movl 32(%0, %%"REG_a"), %%ebx \n\t"
+ "movl 64(%0, %%"REG_a"), %%ebx \n\t"
+ "movl 96(%0, %%"REG_a"), %%ebx \n\t"
+ "add $128, %%"REG_a" \n\t"
+ "cmp %3, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+ "xor %%"REG_a", %%"REG_a" \n\t"
+
+ ".balign 16 \n\t"
+ "2: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0\n"
+ "movq 8(%0, %%"REG_a"), %%mm1\n"
+ "movq 16(%0, %%"REG_a"), %%mm2\n"
+ "movq 24(%0, %%"REG_a"), %%mm3\n"
+ "movq 32(%0, %%"REG_a"), %%mm4\n"
+ "movq 40(%0, %%"REG_a"), %%mm5\n"
+ "movq 48(%0, %%"REG_a"), %%mm6\n"
+ "movq 56(%0, %%"REG_a"), %%mm7\n"
+ MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
+ MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
+ MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
+ MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
+ MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
+ MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
+ MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
+ MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
+ "add $64, %%"REG_a" \n\t"
+ "cmp %3, %%"REG_a" \n\t"
+ "jb 2b \n\t"
+
+#if CONFUSION_FACTOR > 0
+ // a few percent speedup on out of order executing CPUs
+ "mov %5, %%"REG_a" \n\t"
+ "2: \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "dec %%"REG_a" \n\t"
+ " jnz 2b \n\t"
+#endif
+
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %0 \n\t"
+ "add %3, %1 \n\t"
+ "sub %4, %2 \n\t"
+ "cmp %4, %2 \n\t"
+ " jae 1b \n\t"
+ : "+r" (from), "+r" (to), "+r" (i)
+ : "r" ((stride_t)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR)
+ : "%"REG_a, "%ebx"
+ );
+
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+#ifndef HAVE_MMX1
+ PREFETCH" 320(%0)\n"
+#endif
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ MOVNTQ" %%mm0, (%1)\n"
+ MOVNTQ" %%mm1, 8(%1)\n"
+ MOVNTQ" %%mm2, 16(%1)\n"
+ MOVNTQ" %%mm3, 24(%1)\n"
+ MOVNTQ" %%mm4, 32(%1)\n"
+ MOVNTQ" %%mm5, 40(%1)\n"
+ MOVNTQ" %%mm6, 48(%1)\n"
+ MOVNTQ" %%mm7, 56(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from=((const unsigned char *)from)+64;
+ to=((unsigned char *)to)+64;
+ }
+
+#endif /* Have SSE */
+#ifdef HAVE_MMX2
+ /* since movntq is weakly-ordered, an "sfence"
+ * is needed to make the stores ordered again. */
+ __asm__ __volatile__ ("sfence":::"memory");
+#endif
+#ifndef HAVE_SSE
+ /* re-enable use of the FPU */
+ __asm__ __volatile__ (EMMS:::"memory");
+#endif
+ }
+ /*
+ * Now do the tail of the block
+ */
+ if(len) small_memcpy(to, from, len);
+ return retval;
+}
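As a rough, intrinsics-based sketch of what the inner copy loop above does (prefetch a few cache lines ahead, stream the stores past the cache, fence afterwards) — this is an illustration only, assuming 16-byte-aligned buffers and a length that is a multiple of 64; the real code above also handles misaligned sources, small blocks and tails:

#include <stddef.h>
#include <emmintrin.h>  /* SSE2 intrinsics: _mm_load_si128, _mm_stream_si128, _mm_sfence */

/* Simplified sketch: both pointers 16-byte aligned, len a multiple of 64. */
static void stream_copy64(void *to, const void *from, size_t len)
{
    const __m128i *s = (const __m128i *)from;
    __m128i       *d = (__m128i *)to;
    size_t i;

    for (i = 0; i < len / 64; i++) {
        /* Hint the prefetcher ~320 bytes ahead with a non-temporal hint,
         * like the PREFETCH" 320(%0)" in the asm above. */
        _mm_prefetch((const char *)(s + 4 * i + 20), _MM_HINT_NTA);

        /* Load one 64-byte block... */
        __m128i a = _mm_load_si128(s + 4 * i + 0);
        __m128i b = _mm_load_si128(s + 4 * i + 1);
        __m128i c = _mm_load_si128(s + 4 * i + 2);
        __m128i e = _mm_load_si128(s + 4 * i + 3);

        /* ...and store it with a non-temporal hint, bypassing the cache
         * (the intrinsic form of the movntps/movntq stores used above). */
        _mm_stream_si128(d + 4 * i + 0, a);
        _mm_stream_si128(d + 4 * i + 1, b);
        _mm_stream_si128(d + 4 * i + 2, c);
        _mm_stream_si128(d + 4 * i + 3, e);
    }

    /* Streaming stores are weakly ordered; fence before anyone reads the data. */
    _mm_sfence();
}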
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libvo/fastmemcpy.h b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/fastmemcpy.h
new file mode 100644
index 000000000..0dd2ff62c
--- /dev/null
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/fastmemcpy.h
@@ -0,0 +1,42 @@
+#ifndef __MPLAYER_MEMCPY
+#define __MPLAYER_MEMCPY
+
+#include <stddef.h>
+#include "../libswscale/config.h"
+#include "../libavutil/internal.h"
+
+#if defined(USE_FASTMEMCPY) && (HAVE_MMX || HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_SSE || HAVE_SSE2)
+extern void* (*fast_memcpy)(void * to, const void * from, size_t len);
+#define memcpy(a,b,c) fast_memcpy(a,b,c)
+#endif
+void init_fast_memcpy(void);
+
+static inline void * memcpy_pic(unsigned char * dst, unsigned char * src, int bytesPerLine, int height, int dstStride, int srcStride)
+{
+ int i;
+ void *retval=dst;
+
+ if(dstStride == srcStride)
+ {
+ if (srcStride < 0) {
+ src += (height-1)*srcStride;
+ dst += (height-1)*dstStride;
+ srcStride = -srcStride;
+ }
+
+ memcpy(dst, src, srcStride*height);
+ }
+ else
+ {
+ for(i=0; i<height; i++)
+ {
+ memcpy(dst, src, bytesPerLine);
+ src+= srcStride;
+ dst+= dstStride;
+ }
+ }
+
+ return retval;
+}
+
+#endif
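A small, hypothetical usage sketch for memcpy_pic(): copying a 640x480 plane from a padded source (704-byte stride) into a tightly packed destination. The geometry is made up; only memcpy_pic() and init_fast_memcpy() come from the headers above.

#include <stdlib.h>
#include "fastmemcpy.h"

int main(void)
{
    /* Hypothetical plane geometry: 640 visible bytes per line, source padded
     * to a 704-byte stride, destination tightly packed. */
    const int width = 640, height = 480;
    const int src_stride = 704, dst_stride = 640;

    unsigned char *src = malloc((size_t)src_stride * height);
    unsigned char *dst = malloc((size_t)dst_stride * height);
    if (!src || !dst)
        return 1;

    /* Make sure the fast_memcpy pointer is set up before memcpy() is used
     * through the redirection in fastmemcpy.h. */
    init_fast_memcpy();

    /* Strides differ, so memcpy_pic() copies one line at a time;
     * with equal strides it would copy the whole plane in a single call. */
    memcpy_pic(dst, src, width, height, dst_stride, src_stride);

    free(src);
    free(dst);
    return 0;
}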
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libvo/libvoinit.c b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/libvoinit.c
new file mode 100644
index 000000000..31d8d3a4e
--- /dev/null
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libvo/libvoinit.c
@@ -0,0 +1,22 @@
+#include "cpudetect.h"
+#include "../libswscale/config.h"
+#include <string.h>
+#include "fastmemcpy.h"
+#include "../compiler.h"
+
+CpuCaps gCpuCaps;
+
+void init_libvo()
+{
+ gCpuCaps.hasMMX=1;
+ gCpuCaps.hasMMX2=1;
+ gCpuCaps.has3DNow=1;
+ gCpuCaps.has3DNowExt=1;
+ gCpuCaps.hasSSE=1;
+ gCpuCaps.hasSSE2=1;
+ gCpuCaps.hasSSSE3=1;
+ init_fast_memcpy();
+ // Avoid using multithreading if the CPU is a Pentium 4 with HyperThreading,
+ // because it is not faster at all and uses more CPU.
+ // (The swscaler depends heavily on MMX, and a P4-HT has only one MMX unit.)
+}
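Note that init_libvo() above simply hard-codes every capability flag to 1 instead of probing the CPU. Purely as an illustration of what runtime detection could look like (not what this commit does), the sketch below fills the same gCpuCaps fields from CPUID using GCC's <cpuid.h>; the bit positions are the standard leaf-1 EDX/ECX and leaf-0x80000001 EDX feature bits.

#include <cpuid.h>       /* GCC/Clang __get_cpuid() */
#include "cpudetect.h"   /* CpuCaps, gCpuCaps */

/* Illustrative alternative to the hard-coded flags above (not part of the patch). */
static void detect_cpu_caps(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        gCpuCaps.hasMMX   = (edx >> 23) & 1;  /* CPUID.1:EDX bit 23 */
        gCpuCaps.hasSSE   = (edx >> 25) & 1;  /* CPUID.1:EDX bit 25 */
        gCpuCaps.hasSSE2  = (edx >> 26) & 1;  /* CPUID.1:EDX bit 26 */
        gCpuCaps.hasSSSE3 = (ecx >>  9) & 1;  /* CPUID.1:ECX bit 9  */
        /* The "MMX2" extensions (prefetchnta, movntq, ...) come with SSE. */
        gCpuCaps.hasMMX2  = gCpuCaps.hasSSE;
    }

    /* 3DNow! is reported in the extended leaf 0x80000001 on AMD CPUs. */
    if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
        gCpuCaps.has3DNow    = (edx >> 31) & 1;
        gCpuCaps.has3DNowExt = (edx >> 30) & 1;
    }
}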