Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/asmjit/asmjit.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/test
diff options
context:
space:
mode:
author: Petr Kobalicek <kobalicek.petr@gmail.com>, 2017-01-26 17:55:03 +0300
committer: GitHub <noreply@github.com>, 2017-01-26 17:55:03 +0300
commit: b7f6d1e369b4b87006851ded9017d3f864ee9d4b (patch)
tree: ba63553fe9045310e9d39210152ee70d31299071 /test
parent: fb9f82cb61df36aa513d054e748dc6769045f33e (diff)
Merged asmjit:next branch (#149)
Diffstat (limited to 'test')
-rw-r--r-- test/asmjit.h 1
-rw-r--r-- test/asmjit_bench_x86.cpp 146
-rw-r--r-- test/asmjit_test_misc.h 176
-rw-r--r-- test/asmjit_test_opcode.cpp 92
-rw-r--r-- test/asmjit_test_opcode.h 6046
-rw-r--r-- test/asmjit_test_unit.cpp 261
-rw-r--r-- test/asmjit_test_x86_asm.cpp 95
-rw-r--r-- test/asmjit_test_x86_cc.cpp 3429
-rw-r--r-- test/broken.cpp 278
-rw-r--r-- test/broken.h 145
10 files changed, 10669 insertions, 0 deletions
diff --git a/test/asmjit.h b/test/asmjit.h
new file mode 100644
index 0000000..36a7588
--- /dev/null
+++ b/test/asmjit.h
@@ -0,0 +1 @@
+#include "../src/asmjit/asmjit.h"
diff --git a/test/asmjit_bench_x86.cpp b/test/asmjit_bench_x86.cpp
new file mode 100644
index 0000000..6f52e8e
--- /dev/null
+++ b/test/asmjit_bench_x86.cpp
@@ -0,0 +1,146 @@
+// [AsmJit]
+// Complete x86/x64 JIT and Remote Assembler for C++.
+//
+// [License]
+// Zlib - See LICENSE.md file in the package.
+
+// [Dependencies]
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./asmjit.h"
+#include "./asmjit_test_misc.h"
+#include "./asmjit_test_opcode.h"
+
+using namespace asmjit;
+
+// ============================================================================
+// [Configuration]
+// ============================================================================
+
+static const uint32_t kNumRepeats = 10; // Timed runs per benchmark; the fastest run is the one reported.
+static const uint32_t kNumIterations = 5000; // Code-generation passes executed inside every timed run.
+
+// ============================================================================
+// [Performance]
+// ============================================================================
+
+struct Performance {
+  // Time source for every measurement: the tick counter provided by OSUtils.
+  static inline uint32_t now() { return OSUtils::getTickCount(); }
+
+  // Drops the last sample; `best` restarts at the largest uint32_t so that
+  // any smaller elapsed time recorded by end() replaces it.
+  inline void reset() { tick = 0; best = 0xFFFFFFFF; }
+
+  // start() stores the current tick, diff() reports the ticks elapsed since.
+  inline uint32_t start() { tick = now(); return tick; }
+  inline uint32_t diff() const { return now() - tick; }
+
+  // Closes a run: `tick` now holds the elapsed time (not a timestamp) and
+  // `best` is lowered when this run was faster.
+  inline uint32_t end() {
+    const uint32_t elapsed = diff();
+    tick = elapsed;
+    if (elapsed < best) best = elapsed;
+    return elapsed;
+  }
+  uint32_t tick; // start timestamp while running, elapsed time after end()
+  uint32_t best; // smallest elapsed time seen since reset()
+};
+
+static double mbps(uint32_t time, size_t outputSize) {
+  // MB/s (1 MB = 1024 * 1024 bytes) for `outputSize` bytes produced in `time`
+  // milliseconds (callers print it in a "[ms]" column); no elapsed time, no rate.
+  if (time == 0)
+    return 0.0;
+  return (static_cast<double>(outputSize) * 1000) / (static_cast<double>(time) * (1024 * 1024));
+}
+
+// ============================================================================
+// [Main]
+// ============================================================================
+
+#if defined(ASMJIT_BUILD_X86)
+static void benchX86(uint32_t archType) { // Times X86Assembler and X86Compiler code generation for `archType` and prints one result line for each.
+ CodeHolder code;
+ Performance perf;
+
+ X86Assembler a;
+ X86Compiler cc;
+
+ uint32_t r, i;
+ const char* archName = archType == ArchInfo::kTypeX86 ? "X86" : "X64"; // Every type other than kTypeX86 is labelled "X64".
+
+ // --------------------------------------------------------------------------
+ // [Bench - Assembler]
+ // --------------------------------------------------------------------------
+
+ size_t asmOutputSize = 0;
+ size_t cmpOutputSize = 0;
+
+ perf.reset();
+ for (r = 0; r < kNumRepeats; r++) {
+ asmOutputSize = 0; // Restarted on every run, so the total printed below covers a single run.
+ perf.start();
+ for (i = 0; i < kNumIterations; i++) {
+ code.init(CodeInfo(archType));
+ code.attach(&a);
+
+ asmtest::generateOpcodes(a);
+ asmOutputSize += code.getCodeSize();
+
+ code.reset(false); // Detaches `a`.
+ }
+ perf.end(); // Ends the timed run; `perf.best` keeps the shortest run seen so far.
+ }
+
+ printf("%-12s (%s) | Time: %-6u [ms] | Speed: %7.3f [MB/s]\n",
+ "X86Assembler", archName, perf.best, mbps(perf.best, asmOutputSize));
+
+ // --------------------------------------------------------------------------
+ // [Bench - CodeBuilder]
+ // --------------------------------------------------------------------------
+
+ // TODO: No CodeBuilder benchmark is implemented here yet.
+
+ // --------------------------------------------------------------------------
+ // [Bench - CodeCompiler]
+ // --------------------------------------------------------------------------
+
+ perf.reset();
+ for (r = 0; r < kNumRepeats; r++) {
+ cmpOutputSize = 0; // Restarted on every run, same as asmOutputSize above.
+ perf.start();
+ for (i = 0; i < kNumIterations; i++) {
+ // NOTE: Since we don't have JitRuntime we don't know anything about
+ // function calling conventions, which is required by generateAlphaBlend.
+ // So we must setup this manually.
+ CodeInfo ci(archType);
+ ci.setCdeclCallConv(archType == ArchInfo::kTypeX86 ? CallConv::kIdX86CDecl : CallConv::kIdX86SysV64);
+
+ code.init(ci);
+ code.attach(&cc);
+
+ asmtest::generateAlphaBlend(cc);
+ cc.finalize(); // Runs inside the timed region, before the code size is read.
+ cmpOutputSize += code.getCodeSize();
+
+ code.reset(false); // Detaches `cc`.
+ }
+ perf.end(); // Ends the timed run; `perf.best` keeps the shortest run seen so far.
+ }
+
+ printf("%-12s (%s) | Time: %-6u [ms] | Speed: %7.3f [MB/s]\n",
+ "X86Compiler", archName, perf.best, mbps(perf.best, cmpOutputSize));
+}
+#endif
+
+int main(int argc, char* argv[]) {
+#if defined(ASMJIT_BUILD_X86)
+  // Benchmark the X86 target first, then X64.
+  const uint32_t archTypes[] = { ArchInfo::kTypeX86, ArchInfo::kTypeX64 };
+  for (size_t n = 0; n < sizeof(archTypes) / sizeof(archTypes[0]); n++) benchX86(archTypes[n]);
+#endif // ASMJIT_BUILD_X86
+  return 0;
+}
diff --git a/test/asmjit_test_misc.h b/test/asmjit_test_misc.h
new file mode 100644
index 0000000..b460b50
--- /dev/null
+++ b/test/asmjit_test_misc.h
@@ -0,0 +1,176 @@
+// [AsmJit]
+// Complete x86/x64 JIT and Remote Assembler for C++.
+//
+// [License]
+// Zlib - See LICENSE.md file in the package.
+
+// [Guard]
+#ifndef _ASMJIT_TEST_MISC_H
+#define _ASMJIT_TEST_MISC_H
+
+// [Dependencies]
+#include "./asmjit.h"
+
+namespace asmtest {
+
+// Generate a typical alpha blend function using SSE2 instruction set. Used
+// for benchmarking and also in test86. The generated code should be stable
+// and fully functional. Emitted signature: void(void* dst, const void* src, size_t i).
+static void generateAlphaBlend(asmjit::X86Compiler& cc) {
+ using namespace asmjit;
+ using namespace asmjit::x86;
+
+ X86Gp dst = cc.newIntPtr("dst");
+ X86Gp src = cc.newIntPtr("src");
+
+ X86Gp i = cc.newIntPtr("i"); // Third argument: count of 4-byte pixels still to process.
+ X86Gp j = cc.newIntPtr("j"); // Pixel counter of the one-pixel-per-iteration loop.
+ X86Gp t = cc.newIntPtr("t"); // Receives the address of L_DataPool.
+
+ X86Xmm x0 = cc.newXmm("x0");
+ X86Xmm x1 = cc.newXmm("x1");
+ X86Xmm y0 = cc.newXmm("y0");
+ X86Xmm a0 = cc.newXmm("a0");
+ X86Xmm a1 = cc.newXmm("a1");
+
+ X86Xmm cZero = cc.newXmm("cZero"); // Cleared by xorps below; second operand of the punpcklbw/punpckhbw unpacks.
+ X86Xmm cMul255A = cc.newXmm("cMul255A"); // Loaded from the data pool at offset 0.
+ X86Xmm cMul255M = cc.newXmm("cMul255M"); // Loaded from the data pool at offset 16.
+
+ Label L_SmallLoop = cc.newLabel();
+ Label L_SmallEnd = cc.newLabel();
+ Label L_LargeLoop = cc.newLabel();
+ Label L_LargeEnd = cc.newLabel();
+ Label L_DataPool = cc.newLabel();
+
+ cc.addFunc(FuncSignature3<void, void*, const void*, size_t>(cc.getCodeInfo().getCdeclCallConv())); // Calling convention comes from the attached CodeInfo.
+
+ cc.setArg(0, dst);
+ cc.setArg(1, src);
+ cc.setArg(2, i);
+
+ cc.alloc(dst);
+ cc.alloc(src);
+ cc.alloc(i);
+
+ // How many pixels have to be processed to make the loop aligned.
+ cc.lea(t, ptr(L_DataPool));
+ cc.xor_(j, j);
+ cc.xorps(cZero, cZero);
+
+ cc.sub(j, dst); // j = 0 - dst.
+ cc.movaps(cMul255A, ptr(t, 0));
+
+ cc.and_(j, 15);
+ cc.movaps(cMul255M, ptr(t, 16));
+
+ cc.shr(j, 2); // j = ((0 - dst) & 15) / 4, i.e. a byte distance turned into 4-byte pixels.
+ cc.jz(L_SmallEnd); // j == 0: skip the one-pixel loop.
+
+ // j = min(i, j).
+ cc.cmp(j, i);
+ cc.cmovg(j, i);
+
+ // i -= j.
+ cc.sub(i, j);
+
+ // Small loop.
+ cc.bind(L_SmallLoop); // One 4-byte pixel per iteration (movd loads/stores, pointers advance by 4).
+
+ cc.pcmpeqb(a0, a0);
+ cc.movd(y0, ptr(src));
+
+ cc.pxor(a0, y0);
+ cc.movd(x0, ptr(dst));
+
+ cc.psrlw(a0, 8);
+ cc.punpcklbw(x0, cZero);
+
+ cc.pshuflw(a0, a0, x86::shufImm(1, 1, 1, 1));
+ cc.punpcklbw(y0, cZero);
+
+ cc.pmullw(x0, a0);
+ cc.paddsw(x0, cMul255A);
+ cc.pmulhuw(x0, cMul255M);
+
+ cc.paddw(x0, y0);
+ cc.packuswb(x0, x0);
+
+ cc.movd(ptr(dst), x0);
+
+ cc.add(dst, 4);
+ cc.add(src, 4);
+
+ cc.dec(j);
+ cc.jnz(L_SmallLoop);
+
+ // Second section, prepare for an aligned loop.
+ cc.bind(L_SmallEnd);
+
+ cc.test(i, i);
+ cc.mov(j, i);
+ cc.jz(cc.getFunc()->getExitLabel()); // Branches on the flags of test(i, i): nothing left, leave the function.
+
+ cc.and_(j, 3); // j = pixels left over after the 4-pixel groups.
+ cc.shr(i, 2); // i = number of 4-pixel (16-byte) groups.
+ cc.jz(L_LargeEnd);
+
+ // Aligned loop.
+ cc.bind(L_LargeLoop); // 16 bytes per iteration: src is read with movups, dst is read/written with movaps.
+
+ cc.movups(y0, ptr(src));
+ cc.pcmpeqb(a0, a0);
+ cc.movaps(x0, ptr(dst));
+
+ cc.xorps(a0, y0);
+ cc.movaps(x1, x0);
+
+ cc.psrlw(a0, 8);
+ cc.punpcklbw(x0, cZero);
+
+ cc.movaps(a1, a0);
+ cc.punpcklwd(a0, a0);
+
+ cc.punpckhbw(x1, cZero);
+ cc.punpckhwd(a1, a1);
+
+ cc.pshufd(a0, a0, x86::shufImm(3, 3, 1, 1));
+ cc.pshufd(a1, a1, x86::shufImm(3, 3, 1, 1));
+
+ cc.pmullw(x0, a0);
+ cc.pmullw(x1, a1);
+
+ cc.paddsw(x0, cMul255A);
+ cc.paddsw(x1, cMul255A);
+
+ cc.pmulhuw(x0, cMul255M);
+ cc.pmulhuw(x1, cMul255M);
+
+ cc.add(src, 16);
+ cc.packuswb(x0, x1);
+
+ cc.paddw(x0, y0);
+ cc.movaps(ptr(dst), x0);
+
+ cc.add(dst, 16);
+
+ cc.dec(i);
+ cc.jnz(L_LargeLoop);
+
+ cc.bind(L_LargeEnd);
+ cc.test(j, j);
+ cc.jnz(L_SmallLoop); // Leftover pixels are handled by re-entering the one-pixel loop.
+
+ cc.endFunc();
+
+ // Data.
+ cc.align(kAlignData, 16);
+ cc.bind(L_DataPool);
+ cc.dxmm(Data128::fromI16(0x0080)); // First pool entry: read into cMul255A via ptr(t, 0).
+ cc.dxmm(Data128::fromI16(0x0101)); // Second pool entry: read into cMul255M via ptr(t, 16).
+}
+
+} // asmtest namespace
+
+// [Guard]
+#endif // _ASMJIT_TEST_MISC_H
diff --git a/test/asmjit_test_opcode.cpp b/test/asmjit_test_opcode.cpp
new file mode 100644
index 0000000..86cfba1
--- /dev/null
+++ b/test/asmjit_test_opcode.cpp
@@ -0,0 +1,92 @@
+// [AsmJit]
+// Complete x86/x64 JIT and Remote Assembler for C++.
+//
+// [License]
+// Zlib - See LICENSE.md file in the package.
+
+// This file is used to test opcodes generated by AsmJit. Output can be
+// disassembled in your IDE or by your favorite disassembler. Instructions
+// are grouped by category and then sorted alphabetically.
+
+// [Dependencies]
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./asmjit.h"
+#include "./asmjit_test_opcode.h"
+
+using namespace asmjit;
+
+struct OpcodeDumpInfo { // One opcode-dump configuration; main() runs generateOpcodes() once per entry.
+ uint32_t archType; // ArchInfo type id used to initialize the CodeHolder.
+ bool useRex1; // Forwarded to generateOpcodes() as `useRex1`.
+ bool useRex2; // Forwarded to generateOpcodes() as `useRex2`.
+};
+
+static const char* archTypeToString(uint32_t archType) {
+  // Printable name for an ArchInfo type id. The ids are tested one by one, in
+  // the order None, X86, X64, A32, A64.
+  if (archType == ArchInfo::kTypeNone) return "None";
+  if (archType == ArchInfo::kTypeX86) return "X86";
+  if (archType == ArchInfo::kTypeX64) return "X64";
+  if (archType == ArchInfo::kTypeA32) return "A32";
+  if (archType == ArchInfo::kTypeA64) return "A64";
+
+  // Every other id falls through to the placeholder.
+  return "<unknown>";
+}
+
+struct TestErrorHandler : ErrorHandler { // Writes every reported error to stdout.
+  virtual bool handleError(Error err, const char* message, CodeEmitter* origin) {
+    fprintf(stdout, "ERROR 0x%08X: %s\n", err, message); // Error code in hex, then its message.
+    return true;
+  }
+};
+
+typedef void (*VoidFunc)(void); // Pointer type main() uses to call the code returned by JitRuntime::add(); no arguments, no result.
+
+int main(int argc, char* argv[]) {
+  TestErrorHandler eh;
+  // Dump configurations: X86 once, then X64 with every useRex1/useRex2 pairing.
+  OpcodeDumpInfo infoList[] = {
+    { ArchInfo::kTypeX86, false, false },
+    { ArchInfo::kTypeX64, false, false },
+    { ArchInfo::kTypeX64, false, true  },
+    { ArchInfo::kTypeX64, true , false },
+    { ArchInfo::kTypeX64, true , true  }
+  };
+  // `i` is size_t so no signed int is compared with the array size (presumably size_t).
+  for (size_t i = 0; i < ASMJIT_ARRAY_SIZE(infoList); i++) {
+    const OpcodeDumpInfo& info = infoList[i];
+
+    printf("Opcodes [ARCH=%s REX1=%s REX2=%s]\n",
+      archTypeToString(info.archType),
+      info.useRex1 ? "true" : "false",
+      info.useRex2 ? "true" : "false");
+    // A fresh CodeHolder per configuration; errors are reported through `eh`.
+    CodeHolder code;
+    code.init(CodeInfo(info.archType));
+    code.setErrorHandler(&eh);
+
+#if !defined(ASMJIT_DISABLE_LOGGING)
+    FileLogger logger(stdout);
+    logger.addOptions(Logger::kOptionBinaryForm);
+    code.setLogger(&logger);
+#endif // ASMJIT_DISABLE_LOGGING
+
+    X86Assembler a(&code);
+    asmtest::generateOpcodes(a, info.useRex1, info.useRex2);
+
+    // If this is the host architecture the code generated can be executed
+    // for debugging purposes (the first instruction is ret anyway).
+    if (code.getArchType() == ArchInfo::kTypeHost) {
+      JitRuntime runtime;
+      VoidFunc p;
+      Error err = runtime.add(&p, &code);
+      if (err == kErrorOk) p(); // `p` is only called when add() reported success.
+    }
+  }
+
+  return 0;
+}
diff --git a/test/asmjit_test_opcode.h b/test/asmjit_test_opcode.h
new file mode 100644
index 0000000..63be387
--- /dev/null
+++ b/test/asmjit_test_opcode.h
@@ -0,0 +1,6046 @@
+// [AsmJit]
+// Complete x86/x64 JIT and Remote Assembler for C++.
+//
+// [License]
+// Zlib - See LICENSE.md file in the package.
+
+// [Guard]
+#ifndef _ASMJIT_TEST_OPCODE_H
+#define _ASMJIT_TEST_OPCODE_H
+
+// [Dependencies]
+#include "./asmjit.h"
+
+namespace asmtest {
+
+// Generate all instructions asmjit can emit.
+static void generateOpcodes(asmjit::X86Assembler& a, bool useRex1 = false, bool useRex2 = false) {
+ using namespace asmjit;
+ using namespace asmjit::x86;
+
+ bool isX64 = a.is64Bit();
+
+ /*
+ // TODO: Finalize implicit vs explicit.
+ a.cmpxchg8b(ptr_gpC);
+ a.cmpxchg8b(ptr_gpC, x86::edx, x86::eax, x86::ecx, x86::ebx);
+
+ if (isX64) a.cmpxchg16b(ptr_gpC);
+ if (isX64) a.cmpxchg16b(ptr_gpC, x86::rdx, x86::rax, x86::rcx, x86::rbx);
+ */
+
+ // Prevent crash when the generated function is called to see the disassembly.
+ a.ret();
+
+ // All instructions use the following register that can be changed to see if
+ // the `X86Assembler` is properly encoding all possible combinations. If the
+ // `useRexRegs` argument is true the `A` version will in most cases contain
+ // a register having index 8 (if encodable).
+ X86Gp gLoA = useRex1 ? r8b : al;
+ X86Gp gLoB = useRex2 ? r9b : bl;
+
+ X86Gp gHiA = ah;
+ X86Gp gHiB = bh;
+
+ X86Gp gwA = useRex1 ? r8w : ax;
+ X86Gp gwB = useRex2 ? r9w : bx;
+
+ X86Gp gdA = useRex1 ? r8d : eax;
+ X86Gp gdB = useRex2 ? r9d : ebx;
+ X86Gp gdC = useRex2 ? r10d : ecx;
+
+ X86Gp gzA = useRex1 ? r8 : a.zax();
+ X86Gp gzB = useRex2 ? r9 : a.zbx();
+ X86Gp gzC = useRex2 ? r10 : a.zcx();
+ X86Gp gzD = useRex2 ? r11 : a.zdx();
+
+ X86KReg kA = k1;
+ X86KReg kB = k2;
+ X86KReg kC = k3;
+
+ X86Mem anyptr_gpA = ptr(gzA);
+ X86Mem anyptr_gpB = ptr(gzB);
+ X86Mem anyptr_gpC = ptr(gzC);
+ X86Mem anyptr_gpD = ptr(gzD);
+
+ X86Mem intptr_gpA = a.intptr_ptr(gzA);
+ X86Mem intptr_gpB = a.intptr_ptr(gzB);
+
+ X86Fp fpA = fp0;
+ X86Fp fpB = fp7;
+
+ X86Mm mmA = mm0;
+ X86Mm mmB = mm1;
+
+ X86Xmm xmmA = useRex1 ? xmm8 : xmm0;
+ X86Xmm xmmB = useRex2 ? xmm9 : xmm1;
+ X86Xmm xmmC = useRex2 ? xmm10 : xmm2;
+ X86Xmm xmmD = useRex2 ? xmm11 : xmm3;
+
+ X86Ymm ymmA = useRex1 ? ymm8 : ymm0;
+ X86Ymm ymmB = useRex2 ? ymm9 : ymm1;
+ X86Ymm ymmC = useRex2 ? ymm10 : ymm2;
+ X86Ymm ymmD = useRex2 ? ymm11 : ymm3;
+
+ X86Zmm zmmA = useRex1 ? zmm8 : zmm0;
+ X86Zmm zmmB = useRex2 ? zmm9 : zmm1;
+ X86Zmm zmmC = useRex2 ? zmm10 : zmm2;
+ X86Zmm zmmD = useRex2 ? zmm11 : zmm3;
+
+ X86Mem vx_ptr = ptr(gzB, xmmB);
+ X86Mem vy_ptr = ptr(gzB, ymmB);
+ X86Mem vz_ptr = ptr(gzB, zmmB);
+
+ Label L;
+
+ // Base.
+ a.adc(gLoA, 1);
+ a.adc(gLoB, 1);
+ a.adc(gHiA, 1);
+ a.adc(gHiB, 1);
+ a.adc(gwA, 1);
+ a.adc(gwB, 1);
+ a.adc(gdA, 1);
+ a.adc(gdB, 1);
+ a.adc(gzA, 1);
+ a.adc(gzA, gzB);
+ a.adc(gzA, intptr_gpB);
+ a.adc(intptr_gpA, 1);
+ a.adc(intptr_gpA, gzB);
+ a.add(gLoA, 1);
+ a.add(gLoB, 1);
+ a.add(gHiA, 1);
+ a.add(gHiB, 1);
+ a.add(gwA, 1);
+ a.add(gwB, 1);
+ a.add(gdA, 1);
+ a.add(gdB, 1);
+ a.add(gzA, 1);
+ a.add(gzA, gzB);
+ a.add(gzA, intptr_gpB);
+ a.add(intptr_gpA, 1);
+ a.add(intptr_gpA, gzB);
+ a.and_(gLoA, 1);
+ a.and_(gLoB, 1);
+ a.and_(gHiA, 1);
+ a.and_(gHiB, 1);
+ a.and_(gwA, 1);
+ a.and_(gwB, 1);
+ a.and_(gdA, 1);
+ a.and_(gdB, 1);
+ a.and_(gzA, 1);
+ a.and_(gzA, gzB);
+ a.and_(gzA, intptr_gpB);
+ a.and_(intptr_gpA, 1);
+ a.and_(intptr_gpA, gzB);
+ a.bswap(gzA);
+ a.bt(gdA, 1);
+ a.bt(gzA, 1);
+ a.bt(gdA, gdB);
+ a.bt(gzA, gzB);
+ a.bt(intptr_gpA, 1);
+ a.bt(anyptr_gpA, gdB);
+ a.bt(intptr_gpA, gzB);
+ a.btc(gdA, 1);
+ a.btc(gzA, 1);
+ a.btc(gdA, gdB);
+ a.btc(gzA, gzB);
+ a.btc(intptr_gpA, 1);
+ a.btc(anyptr_gpA, gdB);
+ a.btc(intptr_gpA, gzB);
+ a.btr(gdA, 1);
+ a.btr(gzA, 1);
+ a.btr(gdA, gdB);
+ a.btr(gzA, gzB);
+ a.btr(intptr_gpA, 1);
+ a.btr(anyptr_gpA, gdB);
+ a.btr(intptr_gpA, gzB);
+ a.bts(gdA, 1);
+ a.bts(gzA, 1);
+ a.bts(gdA, gdB);
+ a.bts(gzA, gzB);
+ a.bts(intptr_gpA, 1);
+ a.bts(anyptr_gpA, gdB);
+ a.bts(intptr_gpA, gzB);
+ a.call(gzA);
+ a.call(intptr_gpA);
+ a.cbw(); // Implicit AX <- Sign Extend AL.
+ a.cbw(ax); // Explicit AX <- Sign Extend AL.
+ a.cdq(); // Implicit EDX:EAX <- Sign Extend EAX.
+ a.cdq(edx, eax); // Explicit EDX:EAX <- Sign Extend EAX.
+ if (isX64) a.cdqe(); // Implicit RAX <- Sign Extend EAX.
+ if (isX64) a.cdqe(eax); // Explicit RAX <- Sign Extend EAX.
+ a.cwd(); // Implicit DX:AX <- Sign Extend AX.
+ a.cwd(dx, ax); // Explicit DX:AX <- Sign Extend AX.
+ a.cwde(); // Implicit EAX <- Sign Extend AX.
+ a.cwde(eax); // Explicit EAX <- Sign Extend AX.
+ if (isX64) a.cqo(); // Implicit RDX:RAX <- Sign Extend RAX.
+ if (isX64) a.cqo(rdx, rax); // Explicit RDX:RAX <- Sign Extend RAX.
+ a.clc();
+ a.cld();
+ a.cmc();
+ a.cmp(gLoA, 1);
+ a.cmp(gLoB, 1);
+ a.cmp(gHiA, 1);
+ a.cmp(gHiB, 1);
+ a.cmp(gwA, 1);
+ a.cmp(gwB, 1);
+ a.cmp(gdA, 1);
+ a.cmp(gdB, 1);
+ a.cmp(gzA, 1);
+ a.cmp(gLoA, gLoB);
+ a.cmp(gHiA, gHiB);
+ a.cmp(gwA, gwB);
+ a.cmp(gdA, gdB);
+ a.cmp(gzA, gzB);
+ a.cmp(gdA, anyptr_gpB);
+ a.cmp(gzA, intptr_gpB);
+ a.cmp(intptr_gpA, 1);
+ a.cmp(anyptr_gpA, gdB);
+ a.cmp(intptr_gpA, gzB);
+ a.cmpxchg(gdA, gdB); // Implicit regA, regB, <EAX>
+ a.cmpxchg(gzA, gzB); // Implicit regA, regB, <ZAX>
+ a.cmpxchg(gdA, gdB, eax); // Explicit regA, regB, <EAX>
+ a.cmpxchg(gzA, gzB, a.zax()); // Explicit regA, regB, <ZAX>
+ a.cmpxchg(anyptr_gpA, gdB); // Implicit mem , regB, <EAX>
+ a.cmpxchg(anyptr_gpA, gzB); // Implicit mem , regB, <ZAX>
+ a.cmpxchg(anyptr_gpA, gdB, eax); // Explicit mem , regB, <EAX>
+ a.cmpxchg(anyptr_gpA, gzB, a.zax()); // Explicit mem , regB, <ZAX>
+ a.cmpxchg8b(anyptr_gpA); // Implicit mem , <EDX>, <EAX>, <ECX>, <EBX>
+ if (isX64) a.cmpxchg16b(anyptr_gpA); // Implicit mem , <RDX>, <RAX>, <RCX>, <RBX>
+ a.cpuid(); // Implicit <EAX>, <EBX>, <ECX>, <EDX>
+ a.cpuid(eax, ebx, ecx, edx); // Explicit <EAX>, <EBX>, <ECX>, <EDX>
+ a.crc32(gdA, byte_ptr(gzB));
+ a.crc32(gdA, word_ptr(gzB));
+ a.crc32(gdA, dword_ptr(gzB));
+ if (isX64) a.crc32(gdA, qword_ptr(gzB));
+ if (isX64) a.crc32(gzA, qword_ptr(gzB));
+ a.dec(gLoA);
+ a.dec(gHiA);
+ a.dec(gwA);
+ a.dec(gdA);
+ a.dec(gzA);
+ a.dec(intptr_gpA);
+ a.inc(gLoA);
+ a.inc(gwA);
+ a.inc(gdA);
+ a.inc(gzA);
+ a.inc(intptr_gpA);
+ a.int_(13);
+ a.int3();
+ a.into();
+ a.lea(gzA, intptr_gpB);
+ a.mov(gLoA, 1);
+ a.mov(gHiA, 1);
+ a.mov(gwA, 1);
+ a.mov(gdA, 1);
+ a.mov(gzA, 1);
+ a.mov(gLoA, gLoB);
+ a.mov(gHiA, gHiB);
+ a.mov(gwA, gwB);
+ a.mov(gdA, gdB);
+ a.mov(gzA, gzB);
+ a.mov(gLoA, anyptr_gpB);
+ a.mov(gwA, anyptr_gpB);
+ a.mov(gdA, anyptr_gpB);
+ a.mov(gzA, intptr_gpB);
+ a.mov(anyptr_gpA, gLoB);
+ a.mov(anyptr_gpA, gwB);
+ a.mov(anyptr_gpA, gdB);
+ a.mov(intptr_gpA, 1);
+ a.mov(intptr_gpA, gzB);
+ a.movsx(gzA, gLoB);
+ a.movsx(gzA, byte_ptr(gzB));
+ a.movzx(gzA, gLoB);
+ a.movzx(gzA, byte_ptr(gzB));
+ a.movbe(gzA, anyptr_gpB);
+ a.movbe(anyptr_gpA, gzB);
+ a.neg(gzA);
+ a.neg(intptr_gpA);
+ a.nop();
+ a.not_(gzA);
+ a.not_(intptr_gpA);
+ a.or_(gLoA, 1);
+ a.or_(gLoB, 1);
+ a.or_(gHiA, 1);
+ a.or_(gHiB, 1);
+ a.or_(gwA, 1);
+ a.or_(gwB, 1);
+ a.or_(gdA, 1);
+ a.or_(gdB, 1);
+ a.or_(gzA, 1);
+ a.or_(gzA, gzB);
+ a.or_(gzA, intptr_gpB);
+ a.or_(intptr_gpA, 1);
+ a.or_(intptr_gpA, gzB);
+ a.pop(gzA);
+ a.pop(intptr_gpA);
+ if (!isX64) a.popa();
+ if (!isX64) a.popad();
+ a.popf();
+ if (!isX64) a.popfd();
+ if ( isX64) a.popfq();
+ a.push(gzA);
+ a.push(intptr_gpA);
+ a.push(0);
+ if (!isX64) a.pusha();
+ if (!isX64) a.pushad();
+ a.pushf();
+ if (!isX64) a.pushfd();
+ if ( isX64) a.pushfq();
+ a.rcl(gdA, 0);
+ a.rcl(gzA, 0);
+ a.rcl(gdA, 1);
+ a.rcl(gzA, 1);
+ a.rcl(gdA, cl);
+ a.rcl(gzA, cl);
+ a.rcl(intptr_gpA, 0);
+ a.rcl(intptr_gpA, 1);
+ a.rcl(intptr_gpA, cl);
+ a.rcr(gdA, 0);
+ a.rcr(gzA, 0);
+ a.rcr(gdA, 1);
+ a.rcr(gzA, 1);
+ a.rcr(gdA, cl);
+ a.rcr(gzA, cl);
+ a.rcr(intptr_gpA, 0);
+ a.rcr(intptr_gpA, 1);
+ a.rcr(intptr_gpA, cl);
+ a.rdtsc(); // Implicit <EDX:EAX>
+ a.rdtsc(edx, eax); // Explicit <EDX:EAX>
+ a.rdtscp(); // Implicit <EDX:EAX>, <ECX>
+ a.rdtscp(edx, eax, ecx); // Implicit <EDX:EAX>, <ECX>
+ a.ret();
+ a.ret(0);
+ a.rol(gdA, 0);
+ a.rol(gzA, 0);
+ a.rol(gdA, 1);
+ a.rol(gzA, 1);
+ a.rol(gdA, cl);
+ a.rol(gzA, cl);
+ a.rol(intptr_gpA, 0);
+ a.rol(intptr_gpA, 1);
+ a.rol(intptr_gpA, cl);
+ a.ror(gdA, 0);
+ a.ror(gzA, 0);
+ a.ror(gdA, 1);
+ a.ror(gzA, 1);
+ a.ror(gdA, cl);
+ a.ror(gzA, cl);
+ a.ror(intptr_gpA, 0);
+ a.ror(intptr_gpA, 1);
+ a.ror(intptr_gpA, cl);
+ a.sbb(gLoA, 1);
+ a.sbb(gLoB, 1);
+ a.sbb(gHiA, 1);
+ a.sbb(gHiB, 1);
+ a.sbb(gwA, 1);
+ a.sbb(gwB, 1);
+ a.sbb(gdA, 1);
+ a.sbb(gdB, 1);
+ a.sbb(gzA, 1);
+ a.sbb(gzA, gzB);
+ a.sbb(gzA, intptr_gpB);
+ a.sbb(intptr_gpA, 1);
+ a.sbb(intptr_gpA, gzB);
+ a.sal(gdA, 0);
+ a.sal(gzA, 0);
+ a.sal(gdA, 1);
+ a.sal(gzA, 1);
+ a.sal(gdA, cl);
+ a.sal(gzA, cl);
+ a.sal(intptr_gpA, 0);
+ a.sal(intptr_gpA, 1);
+ a.sal(intptr_gpA, cl);
+ a.sar(gdA, 0);
+ a.sar(gzA, 0);
+ a.sar(gdA, 1);
+ a.sar(gzA, 1);
+ a.sar(gdA, cl);
+ a.sar(gzA, cl);
+ a.sar(intptr_gpA, 0);
+ a.sar(intptr_gpA, 1);
+ a.sar(intptr_gpA, cl);
+ a.shl(gdA, 0);
+ a.shl(gzA, 0);
+ a.shl(gdA, 1);
+ a.shl(gzA, 1);
+ a.shl(gdA, cl);
+ a.shl(gzA, cl);
+ a.shl(intptr_gpA, 0);
+ a.shl(intptr_gpA, 1);
+ a.shl(intptr_gpA, cl);
+ a.shr(gdA, 0);
+ a.shr(gzA, 0);
+ a.shr(gdA, 1);
+ a.shr(gzA, 1);
+ a.shr(gdA, cl);
+ a.shr(gzA, cl);
+ a.shr(intptr_gpA, 0);
+ a.shr(intptr_gpA, 1);
+ a.shr(intptr_gpA, cl);
+ a.shld(gdA, gdB, 0);
+ a.shld(gzA, gzB, 0);
+ a.shld(gdA, gdB, cl);
+ a.shld(gzA, gzB, cl);
+ a.shld(anyptr_gpA, gdB, 0);
+ a.shld(intptr_gpA, gzB, 0);
+ a.shld(anyptr_gpA, gdB, cl);
+ a.shld(intptr_gpA, gzB, cl);
+ a.shrd(gdA, gdB, 0);
+ a.shrd(gzA, gzB, 0);
+ a.shrd(gdA, gdB, cl);
+ a.shrd(gzA, gzB, cl);
+ a.shrd(anyptr_gpA, gdB, 0);
+ a.shrd(intptr_gpA, gzB, 0);
+ a.shrd(anyptr_gpA, gdB, cl);
+ a.shrd(intptr_gpA, gzB, cl);
+ a.stc();
+ a.std();
+ a.sti();
+ a.sub(gLoA, 1);
+ a.sub(gLoB, 1);
+ a.sub(gHiA, 1);
+ a.sub(gHiB, 1);
+ a.sub(gwA, 1);
+ a.sub(gwB, 1);
+ a.sub(gdA, 1);
+ a.sub(gdB, 1);
+ a.sub(gzA, 1);
+ a.sub(gzA, gzB);
+ a.sub(gzA, intptr_gpB);
+ a.sub(intptr_gpA, 1);
+ a.sub(intptr_gpA, gzB);
+ a.swapgs();
+ a.test(gzA, 1);
+ a.test(gzA, gzB);
+ a.test(intptr_gpA, 1);
+ a.test(intptr_gpA, gzB);
+ a.ud2();
+ a.xadd(gzA, gzB);
+ a.xadd(intptr_gpA, gzB);
+ a.xchg(gzA, gzB);
+ a.xchg(intptr_gpA, gzB);
+ a.xchg(gzA, intptr_gpB);
+ a.xor_(gLoA, 1);
+ a.xor_(gLoB, 1);
+ a.xor_(gHiA, 1);
+ a.xor_(gHiB, 1);
+ a.xor_(gwA, 1);
+ a.xor_(gwB, 1);
+ a.xor_(gdA, 1);
+ a.xor_(gdB, 1);
+ a.xor_(gzA, 1);
+ a.xor_(gzA, gzB);
+ a.xor_(gzA, intptr_gpB);
+ a.xor_(intptr_gpA, 1);
+ a.xor_(intptr_gpA, gzB);
+
+ // Special case - div|mul.
+ a.div(cl); // Implicit AH:AL <- AX * r8
+ a.div(byte_ptr(gzA)); // Implicit AH:AL <- AX * m8
+ a.div(ax, cl); // Explicit AH:AL <- AX * r8
+ a.div(ax, anyptr_gpA); // Explicit AH:AL <- AX * m8
+
+ a.div(cx); // Implicit DX:AX <- DX:AX * r16
+ a.div(word_ptr(gzA)); // Implicit DX:AX <- DX:AX * m16
+ a.div(dx, ax, cx); // Explicit DX:AX <- DX:AX * r16
+ a.div(dx, ax, anyptr_gpA); // Explicit DX:AX <- DX:AX * m16
+
+ a.div(ecx); // Implicit EDX:EAX <- EDX:EAX * r32
+ a.div(dword_ptr(gzA)); // Implicit EDX:EAX <- EDX:EAX * m32
+ a.div(edx, eax, ecx); // Explicit EDX:EAX <- EDX:EAX * r32
+ a.div(edx, eax, anyptr_gpA); // Explicit EDX:EAX <- EDX:EAX * m32
+
+ if (isX64) a.div(rcx); // Implicit RDX|RAX <- RDX:RAX * r64
+ if (isX64) a.div(qword_ptr(gzA)); // Implicit RDX|RAX <- RDX:RAX * m64
+ if (isX64) a.div(rdx, rax, rcx); // Explicit RDX|RAX <- RDX:RAX * r64
+ if (isX64) a.div(rdx, rax, anyptr_gpA); // Explicit RDX|RAX <- RDX:RAX * m64
+
+ a.idiv(cl); // Implicit AH:AL <- AX * r8
+ a.idiv(byte_ptr(gzA)); // Implicit AH:AL <- AX * m8
+ a.idiv(ax, cl); // Explicit AH:AL <- AX * r8
+ a.idiv(ax, anyptr_gpA); // Explicit AH:AL <- AX * m8
+
+ a.idiv(cx); // Implicit DX:AX <- DX:AX * r16
+ a.idiv(word_ptr(gzA)); // Implicit DX:AX <- DX:AX * m16
+ a.idiv(dx, ax, cx); // Explicit DX:AX <- DX:AX * r16
+ a.idiv(dx, ax, anyptr_gpA); // Explicit DX:AX <- DX:AX * m16
+
+ a.idiv(ecx); // Implicit EDX:EAX <- EDX:EAX * r32
+ a.idiv(dword_ptr(gzA)); // Implicit EDX:EAX <- EDX:EAX * m32
+ a.idiv(edx, eax, ecx); // Explicit EDX:EAX <- EDX:EAX * r32
+ a.idiv(edx, eax, anyptr_gpA); // Explicit EDX:EAX <- EDX:EAX * m32
+
+ if (isX64) a.idiv(rcx); // Implicit RDX|RAX <- RDX:RAX * r64
+ if (isX64) a.idiv(qword_ptr(gzA)); // Implicit RDX|RAX <- RDX:RAX * m64
+ if (isX64) a.idiv(rdx, rax, rcx); // Explicit RDX|RAX <- RDX:RAX * r64
+ if (isX64) a.idiv(rdx, rax, anyptr_gpA); // Explicit RDX|RAX <- RDX:RAX * m64
+
+ a.mul(cl); // Implicit AX <- AL * r8
+ a.mul(byte_ptr(gzA)); // Implicit AX <- AL * m8
+ a.mul(ax, cl); // Explicit AX <- AL * r8
+ a.mul(ax, anyptr_gpA); // Explicit AX <- AL * m8
+
+ a.mul(cx); // Implicit DX:AX <- AX * r16
+ a.mul(word_ptr(gzA)); // Implicit DX:AX <- AX * m16
+ a.mul(dx, ax, cx); // Explicit DX:AX <- AX * r16
+ a.mul(dx, ax, anyptr_gpA); // Explicit DX:AX <- AX * m16
+
+ a.mul(ecx); // Implicit EDX:EAX <- EAX * r32
+ a.mul(dword_ptr(gzA)); // Implicit EDX:EAX <- EAX * m32
+ a.mul(edx, eax, ecx); // Explicit EDX:EAX <- EAX * r32
+ a.mul(edx, eax, anyptr_gpA); // Explicit EDX:EAX <- EAX * m32
+
+ if (isX64) a.mul(rcx); // Implicit RDX|RAX <- RAX * r64
+ if (isX64) a.mul(qword_ptr(gzA)); // Implicit RDX|RAX <- RAX * m64
+ if (isX64) a.mul(rdx, rax, rcx); // Explicit RDX|RAX <- RAX * r64
+ if (isX64) a.mul(rdx, rax, anyptr_gpA); // Explicit RDX|RAX <- RAX * m64
+
+ a.imul(gdA);
+ a.imul(gzA);
+ a.imul(intptr_gpA);
+ a.imul(gdA, 1);
+ a.imul(gzA, 1);
+ a.imul(gdA, gdB);
+ a.imul(gzA, gzB);
+ a.imul(gdA, gdB, 1);
+ a.imul(gzA, gzB, 1);
+ a.imul(gdA, anyptr_gpB);
+ a.imul(gzA, intptr_gpB);
+ a.imul(gdA, anyptr_gpB, 1);
+ a.imul(gzA, intptr_gpB, 1);
+
+ // Special case - zero-extend 32-bit immediate instead of sign-extend:
+ if (isX64) a.mov(gzA, static_cast<uint32_t>(0xFEEDFEED));
+ if (isX64) a.and_(gzA, static_cast<uint32_t>(0xFEEDFEED));
+
+ // Special case - mov with absolute 32-bit address (X86|X64).
+ a.mov(al , ptr(uint64_t(0x01020304)));
+ a.mov(ax , ptr(uint64_t(0x01020304)));
+ a.mov(eax, ptr(uint64_t(0x01020304)));
+ a.mov(ptr(uint64_t(0x01020304)), al );
+ a.mov(ptr(uint64_t(0x01020304)), ax );
+ a.mov(ptr(uint64_t(0x01020304)), eax);
+
+ // Special case - mov with absolute 64-bit address (X64).
+ if (isX64) a.mov(al , ptr(uint64_t(0x0102030405060708ull)));
+ if (isX64) a.mov(ax , ptr(uint64_t(0x0102030405060708ull)));
+ if (isX64) a.mov(eax, ptr(uint64_t(0x0102030405060708ull)));
+ if (isX64) a.mov(rax, ptr(uint64_t(0x0102030405060708ull)));
+ if (isX64) a.mov(ptr(uint64_t(0x0102030405060708ull)), al );
+ if (isX64) a.mov(ptr(uint64_t(0x0102030405060708ull)), ax );
+ if (isX64) a.mov(ptr(uint64_t(0x0102030405060708ull)), eax);
+ if (isX64) a.mov(ptr(uint64_t(0x0102030405060708ull)), rax);
+
+ // Control registers.
+ a.nop();
+
+ a.mov(gzA, cr0);
+ a.mov(cr0, gzA);
+ if (isX64) a.mov(gzA, cr8);
+ if (isX64) a.mov(cr8, gzA);
+
+ // Debug registers.
+ a.nop();
+
+ a.mov(gzA, dr0);
+ a.mov(dr0, gzA);
+
+ // Segment registers.
+ a.nop();
+
+ if (!isX64) a.mov(es, ax);
+ if (!isX64) a.mov(es, bx);
+ if (!isX64) a.mov(ax, es);
+ if (!isX64) a.mov(bx, es);
+
+ if (!isX64) a.mov(cs, ax);
+ if (!isX64) a.mov(cs, bx);
+ if (!isX64) a.mov(ax, cs);
+ if (!isX64) a.mov(bx, cs);
+
+ if (!isX64) a.mov(ss, ax);
+ if (!isX64) a.mov(ss, bx);
+ if (!isX64) a.mov(ax, ss);
+ if (!isX64) a.mov(bx, ss);
+
+ if (!isX64) a.mov(ds, ax);
+ if (!isX64) a.mov(ds, bx);
+ if (!isX64) a.mov(ax, ds);
+ if (!isX64) a.mov(bx, ds);
+
+ a.mov(fs, ax);
+ a.mov(fs, bx);
+ a.mov(ax, fs);
+ a.mov(bx, fs);
+
+ a.mov(gs, ax);
+ a.mov(gs, bx);
+ a.mov(ax, gs);
+ a.mov(bx, gs);
+
+ // Instructions using REP prefix.
+ a.nop();
+
+ a.in(al, 0);
+ a.in(al, dx);
+ a.in(ax, 0);
+ a.in(ax, dx);
+ a.in(eax, 0);
+ a.in(eax, dx);
+ a.rep().ins(byte_ptr(a.zdi()), dx);
+ a.rep().ins(word_ptr(a.zdi()), dx);
+ a.rep().ins(dword_ptr(a.zdi()), dx);
+
+ a.out(imm(0), al);
+ a.out(dx, al);
+ a.out(imm(0), ax);
+ a.out(dx, ax);
+ a.out(imm(0), eax);
+ a.out(dx, eax);
+ a.rep().outs(dx, byte_ptr(a.zsi()));
+ a.rep().outs(dx, word_ptr(a.zsi()));
+ a.rep().outs(dx, dword_ptr(a.zsi()));
+
+ a.lodsb();
+ a.lodsd();
+ a.lodsw();
+ a.rep().lodsb();
+ a.rep().lodsd();
+ a.rep().lodsw();
+ if (isX64) a.rep().lodsq();
+
+ a.movsb();
+ a.movsd();
+ a.movsw();
+ a.rep().movsb();
+ a.rep().movsd();
+ a.rep().movsw();
+ if (isX64) a.rep().movsq();
+
+ a.stosb();
+ a.stosd();
+ a.stosw();
+ a.rep().stosb();
+ a.rep().stosd();
+ a.rep().stosw();
+ if (isX64) a.rep().stosq();
+
+ a.cmpsb();
+ a.cmpsd();
+ a.cmpsw();
+ a.repz().cmpsb();
+ a.repz().cmpsd();
+ a.repz().cmpsw();
+ if (isX64) a.repz().cmpsq();
+ a.repnz().cmpsb();
+ a.repnz().cmpsd();
+ a.repnz().cmpsw();
+ if (isX64) a.repnz().cmpsq();
+
+ a.scasb();
+ a.scasd();
+ a.scasw();
+ a.repz().scasb();
+ a.repz().scasd();
+ a.repz().scasw();
+ if (isX64) a.repz().scasq();
+ a.repnz().scasb();
+ a.repnz().scasd();
+ a.repnz().scasw();
+ if (isX64) a.repnz().scasq();
+
+ // Label...Jcc/Jecxz/Jmp.
+ a.nop();
+
+ L = a.newLabel();
+ a.bind(L);
+ a.ja(L);
+ a.jae(L);
+ a.jb(L);
+ a.jbe(L);
+ a.jc(L);
+ a.je(L);
+ a.jg(L);
+ a.jge(L);
+ a.jl(L);
+ a.jle(L);
+ a.jna(L);
+ a.jnae(L);
+ a.jnb(L);
+ a.jnbe(L);
+ a.jnc(L);
+ a.jne(L);
+ a.jng(L);
+ a.jnge(L);
+ a.jnl(L);
+ a.jnle(L);
+ a.jno(L);
+ a.jnp(L);
+ a.jns(L);
+ a.jnz(L);
+ a.jo(L);
+ a.jp(L);
+ a.jpe(L);
+ a.jpo(L);
+ a.js(L);
+ a.jz(L);
+ a.jecxz(ecx, L);
+ a.jmp(L);
+
+ // Jcc/Jecxz/Jmp...Label.
+ a.nop();
+
+ L = a.newLabel();
+ a.ja(L);
+ a.jae(L);
+ a.jb(L);
+ a.jbe(L);
+ a.jc(L);
+ a.je(L);
+ a.jg(L);
+ a.jge(L);
+ a.jl(L);
+ a.jle(L);
+ a.jna(L);
+ a.jnae(L);
+ a.jnb(L);
+ a.jnbe(L);
+ a.jnc(L);
+ a.jne(L);
+ a.jng(L);
+ a.jnge(L);
+ a.jnl(L);
+ a.jnle(L);
+ a.jno(L);
+ a.jnp(L);
+ a.jns(L);
+ a.jnz(L);
+ a.jo(L);
+ a.jp(L);
+ a.jpe(L);
+ a.jpo(L);
+ a.js(L);
+ a.jz(L);
+ a.jecxz(ecx, L);
+ a.jmp(L);
+ a.bind(L);
+
+ // FPU.
+ a.nop();
+
+ a.f2xm1();
+ a.fabs();
+ a.fadd(fpA, fpB);
+ a.fadd(fpB, fpA);
+ a.fadd(dword_ptr(gzA));
+ a.fadd(qword_ptr(gzA));
+ a.faddp(fpB);
+ a.faddp();
+ a.fbld(dword_ptr(gzA));
+ a.fbstp(dword_ptr(gzA));
+ a.fchs();
+ a.fclex();
+ a.fcom(fpB);
+ a.fcom();
+ a.fcom(dword_ptr(gzA));
+ a.fcom(qword_ptr(gzA));
+ a.fcomp(fpB);
+ a.fcomp();
+ a.fcomp(dword_ptr(gzA));
+ a.fcomp(qword_ptr(gzA));
+ a.fcompp();
+ a.fcos();
+ a.fdecstp();
+ a.fdiv(fpA, fpB);
+ a.fdiv(fpB, fpA);
+ a.fdiv(dword_ptr(gzA));
+ a.fdiv(qword_ptr(gzA));
+ a.fdivp(fpB);
+ a.fdivp();
+ a.fdivr(fpA, fpB);
+ a.fdivr(fpB, fpA);
+ a.fdivr(dword_ptr(gzA));
+ a.fdivr(qword_ptr(gzA));
+ a.fdivrp(fpB);
+ a.fdivrp();
+ a.fiadd(dword_ptr(gzA));
+ a.ficom(word_ptr(gzA));
+ a.ficom(dword_ptr(gzA));
+ a.ficomp(word_ptr(gzA));
+ a.ficomp(dword_ptr(gzA));
+ a.fidiv(word_ptr(gzA));
+ a.fidiv(dword_ptr(gzA));
+ a.fidivr(word_ptr(gzA));
+ a.fidivr(dword_ptr(gzA));
+ a.fild(word_ptr(gzA));
+ a.fild(dword_ptr(gzA));
+ a.fild(qword_ptr(gzA));
+ a.fimul(word_ptr(gzA));
+ a.fimul(dword_ptr(gzA));
+ a.fincstp();
+ a.finit();
+ a.fninit();
+ a.fisub(word_ptr(gzA));
+ a.fisub(dword_ptr(gzA));
+ a.fisubr(word_ptr(gzA));
+ a.fisubr(dword_ptr(gzA));
+ a.fist(word_ptr(gzA));
+ a.fist(dword_ptr(gzA));
+ a.fistp(word_ptr(gzA));
+ a.fistp(dword_ptr(gzA));
+ a.fistp(qword_ptr(gzA));
+ a.fld(dword_ptr(gzA));
+ a.fld(qword_ptr(gzA));
+ a.fld(tword_ptr(gzA));
+ a.fld1();
+ a.fldl2t();
+ a.fldl2e();
+ a.fldpi();
+ a.fldlg2();
+ a.fldln2();
+ a.fldz();
+ a.fldcw(anyptr_gpA);
+ a.fldenv(anyptr_gpA);
+ a.fmul(fpA, fpB);
+ a.fmul(fpB, fpA);
+ a.fmul(dword_ptr(gzA));
+ a.fmul(qword_ptr(gzA));
+ a.fmulp(fpB);
+ a.fmulp();
+ a.fnclex();
+ a.fnop();
+ a.fnsave(anyptr_gpA);
+ a.fnstenv(anyptr_gpA);
+ a.fnstcw(anyptr_gpA);
+ a.fpatan();
+ a.fprem();
+ a.fprem1();
+ a.fptan();
+ a.frndint();
+ a.frstor(anyptr_gpA);
+ a.fsave(anyptr_gpA);
+ a.fscale();
+ a.fsin();
+ a.fsincos();
+ a.fsqrt();
+ a.fst(dword_ptr(gzA));
+ a.fst(qword_ptr(gzA));
+ a.fstp(dword_ptr(gzA));
+ a.fstp(qword_ptr(gzA));
+ a.fstp(tword_ptr(gzA));
+ a.fstcw(anyptr_gpA);
+ a.fstenv(anyptr_gpA);
+ a.fsub(fpA, fpB);
+ a.fsub(fpB, fpA);
+ a.fsub(dword_ptr(gzA));
+ a.fsub(qword_ptr(gzA));
+ a.fsubp(fpB);
+ a.fsubp();
+ a.fsubr(fpA, fpB);
+ a.fsubr(fpB, fpA);
+ a.fsubr(dword_ptr(gzA));
+ a.fsubr(qword_ptr(gzA));
+ a.fsubrp(fpB);
+ a.fsubrp();
+ a.ftst();
+ a.fucom(fpB);
+ a.fucom();
+ a.fucom(fpB);
+ a.fucomi(fpB);
+ a.fucomip(fpB);
+ a.fucomp(fpB);
+ a.fucompp();
+ a.fxam();
+ a.fxtract();
+ a.fyl2x();
+ a.fyl2xp1();
+
+ // LAHF/SAHF
+ a.lahf(); // Implicit <AH>
+ a.lahf(ah); // Explicit <AH>
+ a.sahf(); // Implicit <AH>
+ a.sahf(ah); // Explicit <AH>
+
+ // FXSR.
+ a.fxrstor(anyptr_gpA);
+ a.fxsave(anyptr_gpA);
+
+ // XSAVE.
+ a.nop();
+
+ a.xgetbv(); // Implicit <EDX:EAX>, <ECX>
+ a.xgetbv(edx, eax, ecx); // Explicit <EDX:EAX>, <ECX>
+
+ a.xsetbv(); // Implicit <EDX:EAX>, <ECX>
+ a.xsetbv(edx, eax, ecx); // Explicit <EDX:EAX>, <ECX>
+
+ a.xrstor(anyptr_gpA); // Implicit <EDX:EAX>
+ a.xrstors(anyptr_gpA); // Implicit <EDX:EAX>
+ a.xsave(anyptr_gpA); // Implicit <EDX:EAX>
+ a.xsavec(anyptr_gpA); // Implicit <EDX:EAX>
+ a.xsaveopt(anyptr_gpA); // Implicit <EDX:EAX>
+ a.xsaves(anyptr_gpA); // Implicit <EDX:EAX>
+
+ if (isX64) a.xrstor64(anyptr_gpA); // Implicit <EDX:EAX>
+ if (isX64) a.xrstors64(anyptr_gpA); // Implicit <EDX:EAX>
+ if (isX64) a.xsave64(anyptr_gpA); // Implicit <EDX:EAX>
+ if (isX64) a.xsavec64(anyptr_gpA); // Implicit <EDX:EAX>
+ if (isX64) a.xsaveopt64(anyptr_gpA); // Implicit <EDX:EAX>
+ if (isX64) a.xsaves64(anyptr_gpA); // Implicit <EDX:EAX>
+
+ // POPCNT.
+ a.nop();
+
+ a.popcnt(gdA, gdB);
+ a.popcnt(gzA, gzB);
+ a.popcnt(gdA, anyptr_gpB);
+ a.popcnt(gzA, anyptr_gpB);
+
+ // LZCNT.
+ a.nop();
+
+ a.lzcnt(gdA, gdB);
+ a.lzcnt(gzA, gzB);
+ a.lzcnt(gdA, anyptr_gpB);
+ a.lzcnt(gzA, anyptr_gpB);
+
+ // BMI.
+ a.nop();
+
+ a.andn(gdA, gdB, gdC);
+ a.andn(gzA, gzB, gzC);
+ a.andn(gdA, gdB, anyptr_gpC);
+ a.andn(gzA, gzB, anyptr_gpC);
+ a.bextr(gdA, gdB, gdC);
+ a.bextr(gzA, gzB, gzC);
+ a.bextr(gdA, anyptr_gpB, gdC);
+ a.bextr(gzA, anyptr_gpB, gzC);
+ a.blsi(gdA, gdB);
+ a.blsi(gzA, gzB);
+ a.blsi(gdA, anyptr_gpB);
+ a.blsi(gzA, anyptr_gpB);
+ a.blsmsk(gdA, gdB);
+ a.blsmsk(gzA, gzB);
+ a.blsmsk(gdA, anyptr_gpB);
+ a.blsmsk(gzA, anyptr_gpB);
+ a.blsr(gdA, gdB);
+ a.blsr(gzA, gzB);
+ a.blsr(gdA, anyptr_gpB);
+ a.blsr(gzA, anyptr_gpB);
+ a.tzcnt(gdA, gdB);
+ a.tzcnt(gzA, gzB);
+ a.tzcnt(gdA, anyptr_gpB);
+ a.tzcnt(gzA, anyptr_gpB);
+
+ // BMI2.
+ a.nop();
+
+ a.bzhi(gdA, gdB, gdC);
+ a.bzhi(gzA, gzB, gzC);
+ a.bzhi(gdA, anyptr_gpB, gdC);
+ a.bzhi(gzA, anyptr_gpB, gzC);
+ a.mulx(gdA, gdB, gdC); // Implicit gpA, gpB, gpC, <EDX>
+ a.mulx(gdA, gdB, gdC, edx); // Explicit gpA, gpB, gpC, <EDX>
+ a.mulx(gzA, gzB, gzC); // Implicit gpA, gpB, gpC, <EDX|RDX>
+ a.mulx(gzA, gzB, gzC, a.zdx()); // Explicit gpA, gpB, gpC, <EDX|RDX>
+ a.mulx(gdA, gdB, anyptr_gpC); // Implicit gpA, gpB, mem, <EDX>
+ a.mulx(gdA, gdB, anyptr_gpC, edx); // Explicit gpA, gpB, mem, <EDX>
+ a.mulx(gzA, gzB, anyptr_gpC); // Implicit gpA, gpB, mem, <EDX|RDX>
+ a.mulx(gzA, gzB, anyptr_gpC, a.zdx()); // Explicit gpA, gpB, mem, <EDX|RDX>
+ a.pdep(gdA, gdB, gdC);
+ a.pdep(gzA, gzB, gzC);
+ a.pdep(gdA, gdB, anyptr_gpC);
+ a.pdep(gzA, gzB, anyptr_gpC);
+ a.pext(gdA, gdB, gdC);
+ a.pext(gzA, gzB, gzC);
+ a.pext(gdA, gdB, anyptr_gpC);
+ a.pext(gzA, gzB, anyptr_gpC);
+ a.rorx(gdA, gdB, 0);
+ a.rorx(gzA, gzB, 0);
+ a.rorx(gdA, anyptr_gpB, 0);
+ a.rorx(gzA, anyptr_gpB, 0);
+ a.sarx(gdA, gdB, gdC);
+ a.sarx(gzA, gzB, gzC);
+ a.sarx(gdA, anyptr_gpB, gdC);
+ a.sarx(gzA, anyptr_gpB, gzC);
+ a.shlx(gdA, gdB, gdC);
+ a.shlx(gzA, gzB, gzC);
+ a.shlx(gdA, anyptr_gpB, gdC);
+ a.shlx(gzA, anyptr_gpB, gzC);
+ a.shrx(gdA, gdB, gdC);
+ a.shrx(gzA, gzB, gzC);
+ a.shrx(gdA, anyptr_gpB, gdC);
+ a.shrx(gzA, anyptr_gpB, gzC);
+
+ // ADX.
+ a.nop();
+
+ a.adcx(gdA, gdB);
+ a.adcx(gzA, gzB);
+ a.adcx(gdA, anyptr_gpB);
+ a.adcx(gzA, anyptr_gpB);
+ a.adox(gdA, gdB);
+ a.adox(gzA, gzB);
+ a.adox(gdA, anyptr_gpB);
+ a.adox(gzA, anyptr_gpB);
+
+ // TBM.
+ a.nop();
+
+ a.blcfill(gdA, gdB);
+ a.blcfill(gzA, gzB);
+ a.blcfill(gdA, anyptr_gpB);
+ a.blcfill(gzA, anyptr_gpB);
+
+ a.blci(gdA, gdB);
+ a.blci(gzA, gzB);
+ a.blci(gdA, anyptr_gpB);
+ a.blci(gzA, anyptr_gpB);
+
+ a.blcic(gdA, gdB);
+ a.blcic(gzA, gzB);
+ a.blcic(gdA, anyptr_gpB);
+ a.blcic(gzA, anyptr_gpB);
+
+ a.blcmsk(gdA, gdB);
+ a.blcmsk(gzA, gzB);
+ a.blcmsk(gdA, anyptr_gpB);
+ a.blcmsk(gzA, anyptr_gpB);
+
+ a.blcs(gdA, gdB);
+ a.blcs(gzA, gzB);
+ a.blcs(gdA, anyptr_gpB);
+ a.blcs(gzA, anyptr_gpB);
+
+ a.blsfill(gdA, gdB);
+ a.blsfill(gzA, gzB);
+ a.blsfill(gdA, anyptr_gpB);
+ a.blsfill(gzA, anyptr_gpB);
+
+ a.blsic(gdA, gdB);
+ a.blsic(gzA, gzB);
+ a.blsic(gdA, anyptr_gpB);
+ a.blsic(gzA, anyptr_gpB);
+
+ a.t1mskc(gdA, gdB);
+ a.t1mskc(gzA, gzB);
+ a.t1mskc(gdA, anyptr_gpB);
+ a.t1mskc(gzA, anyptr_gpB);
+
+ a.tzmsk(gdA, gdB);
+ a.tzmsk(gzA, gzB);
+ a.tzmsk(gdA, anyptr_gpB);
+ a.tzmsk(gzA, anyptr_gpB);
+
+ // CLFLUSH / CLFLUSH_OPT.
+ a.nop();
+ a.clflush(anyptr_gpA);
+ a.clflushopt(anyptr_gpA);
+
+ // CLWB.
+ a.nop();
+ a.clwb(anyptr_gpA);
+
+ // CLZERO.
+ a.nop();
+ a.clzero(); // Implicit <ds:[EAX|RAX]>
+ a.clzero(ptr(a.zax())); // Explicit <ds:[EAX|RAX]>
+
+ // PCOMMIT.
+ a.nop();
+ a.pcommit();
+
+ // PREFETCH / PREFETCHW / PREFETCHWT1.
+ a.nop();
+ a.prefetch(anyptr_gpA); // 3DNOW.
+ a.prefetchnta(anyptr_gpA); // MMX+SSE.
+ a.prefetcht0(anyptr_gpA); // MMX+SSE.
+ a.prefetcht1(anyptr_gpA); // MMX+SSE.
+ a.prefetcht2(anyptr_gpA); // MMX+SSE.
+ a.prefetchw(anyptr_gpA); // PREFETCHW.
+ a.prefetchwt1(anyptr_gpA); // PREFETCHWT1.
+
+ // RDRAND / RDSEED.
+ a.nop();
+
+ a.rdrand(gdA);
+ a.rdrand(gzA);
+ a.rdseed(gdA);
+ a.rdseed(gzA);
+
+ // MMX/MMX-EXT.
+ a.nop();
+
+ a.movd(anyptr_gpA, mmB);
+ a.movd(gdA, mmB);
+ a.movd(mmA, anyptr_gpB);
+ a.movd(mmA, gdB);
+ a.movq(mmA, mmB);
+ a.movq(anyptr_gpA, mmB);
+ a.movq(mmA, anyptr_gpB);
+ a.packuswb(mmA, mmB);
+ a.packuswb(mmA, anyptr_gpB);
+ a.paddb(mmA, mmB);
+ a.paddb(mmA, anyptr_gpB);
+ a.paddw(mmA, mmB);
+ a.paddw(mmA, anyptr_gpB);
+ a.paddd(mmA, mmB);
+ a.paddd(mmA, anyptr_gpB);
+ a.paddsb(mmA, mmB);
+ a.paddsb(mmA, anyptr_gpB);
+ a.paddsw(mmA, mmB);
+ a.paddsw(mmA, anyptr_gpB);
+ a.paddusb(mmA, mmB);
+ a.paddusb(mmA, anyptr_gpB);
+ a.paddusw(mmA, mmB);
+ a.paddusw(mmA, anyptr_gpB);
+ a.pand(mmA, mmB);
+ a.pand(mmA, anyptr_gpB);
+ a.pandn(mmA, mmB);
+ a.pandn(mmA, anyptr_gpB);
+ a.pcmpeqb(mmA, mmB);
+ a.pcmpeqb(mmA, anyptr_gpB);
+ a.pcmpeqw(mmA, mmB);
+ a.pcmpeqw(mmA, anyptr_gpB);
+ a.pcmpeqd(mmA, mmB);
+ a.pcmpeqd(mmA, anyptr_gpB);
+ a.pcmpgtb(mmA, mmB);
+ a.pcmpgtb(mmA, anyptr_gpB);
+ a.pcmpgtw(mmA, mmB);
+ a.pcmpgtw(mmA, anyptr_gpB);
+ a.pcmpgtd(mmA, mmB);
+ a.pcmpgtd(mmA, anyptr_gpB);
+ a.pmulhw(mmA, mmB);
+ a.pmulhw(mmA, anyptr_gpB);
+ a.pmullw(mmA, mmB);
+ a.pmullw(mmA, anyptr_gpB);
+ a.por(mmA, mmB);
+ a.por(mmA, anyptr_gpB);
+ a.pmaddwd(mmA, mmB);
+ a.pmaddwd(mmA, anyptr_gpB);
+ a.pslld(mmA, mmB);
+ a.pslld(mmA, anyptr_gpB);
+ a.pslld(mmA, 0);
+ a.psllq(mmA, mmB);
+ a.psllq(mmA, anyptr_gpB);
+ a.psllq(mmA, 0);
+ a.psllw(mmA, mmB);
+ a.psllw(mmA, anyptr_gpB);
+ a.psllw(mmA, 0);
+ a.psrad(mmA, mmB);
+ a.psrad(mmA, anyptr_gpB);
+ a.psrad(mmA, 0);
+ a.psraw(mmA, mmB);
+ a.psraw(mmA, anyptr_gpB);
+ a.psraw(mmA, 0);
+ a.psrld(mmA, mmB);
+ a.psrld(mmA, anyptr_gpB);
+ a.psrld(mmA, 0);
+ a.psrlq(mmA, mmB);
+ a.psrlq(mmA, anyptr_gpB);
+ a.psrlq(mmA, 0);
+ a.psrlw(mmA, mmB);
+ a.psrlw(mmA, anyptr_gpB);
+ a.psrlw(mmA, 0);
+ a.psubb(mmA, mmB);
+ a.psubb(mmA, anyptr_gpB);
+ a.psubw(mmA, mmB);
+ a.psubw(mmA, anyptr_gpB);
+ a.psubd(mmA, mmB);
+ a.psubd(mmA, anyptr_gpB);
+ a.psubsb(mmA, mmB);
+ a.psubsb(mmA, anyptr_gpB);
+ a.psubsw(mmA, mmB);
+ a.psubsw(mmA, anyptr_gpB);
+ a.psubusb(mmA, mmB);
+ a.psubusb(mmA, anyptr_gpB);
+ a.psubusw(mmA, mmB);
+ a.psubusw(mmA, anyptr_gpB);
+ a.punpckhbw(mmA, mmB);
+ a.punpckhbw(mmA, anyptr_gpB);
+ a.punpckhwd(mmA, mmB);
+ a.punpckhwd(mmA, anyptr_gpB);
+ a.punpckhdq(mmA, mmB);
+ a.punpckhdq(mmA, anyptr_gpB);
+ a.punpcklbw(mmA, mmB);
+ a.punpcklbw(mmA, anyptr_gpB);
+ a.punpcklwd(mmA, mmB);
+ a.punpcklwd(mmA, anyptr_gpB);
+ a.punpckldq(mmA, mmB);
+ a.punpckldq(mmA, anyptr_gpB);
+ a.pxor(mmA, mmB);
+ a.pxor(mmA, anyptr_gpB);
+ a.emms();
+
+ // 3DNOW.
+ a.nop();
+
+ a.pavgusb(mmA, mmB);
+ a.pavgusb(mmA, anyptr_gpB);
+ a.pf2id(mmA, mmB);
+ a.pf2id(mmA, anyptr_gpB);
+ a.pf2iw(mmA, mmB);
+ a.pf2iw(mmA, anyptr_gpB);
+ a.pfacc(mmA, mmB);
+ a.pfacc(mmA, anyptr_gpB);
+ a.pfadd(mmA, mmB);
+ a.pfadd(mmA, anyptr_gpB);
+ a.pfcmpeq(mmA, mmB);
+ a.pfcmpeq(mmA, anyptr_gpB);
+ a.pfcmpge(mmA, mmB);
+ a.pfcmpge(mmA, anyptr_gpB);
+ a.pfcmpgt(mmA, mmB);
+ a.pfcmpgt(mmA, anyptr_gpB);
+ a.pfmax(mmA, mmB);
+ a.pfmax(mmA, anyptr_gpB);
+ a.pfmin(mmA, mmB);
+ a.pfmin(mmA, anyptr_gpB);
+ a.pfmul(mmA, mmB);
+ a.pfmul(mmA, anyptr_gpB);
+ a.pfnacc(mmA, mmB);
+ a.pfnacc(mmA, anyptr_gpB);
+ a.pfpnacc(mmA, mmB);
+ a.pfpnacc(mmA, anyptr_gpB);
+ a.pfrcp(mmA, mmB);
+ a.pfrcp(mmA, anyptr_gpB);
+ a.pfrcpit1(mmA, mmB);
+ a.pfrcpit1(mmA, anyptr_gpB);
+ a.pfrcpit2(mmA, mmB);
+ a.pfrcpit2(mmA, anyptr_gpB);
+ a.pfrcpv(mmA, mmB);
+ a.pfrcpv(mmA, anyptr_gpB);
+ a.pfrsqit1(mmA, mmB);
+ a.pfrsqit1(mmA, anyptr_gpB);
+ a.pfrsqrt(mmA, mmB);
+ a.pfrsqrt(mmA, anyptr_gpB);
+ a.pfrsqrtv(mmA, mmB);
+ a.pfrsqrtv(mmA, anyptr_gpB);
+ a.pfsub(mmA, mmB);
+ a.pfsub(mmA, anyptr_gpB);
+ a.pfsubr(mmA, mmB);
+ a.pfsubr(mmA, anyptr_gpB);
+ a.pi2fd(mmA, mmB);
+ a.pi2fd(mmA, anyptr_gpB);
+ a.pi2fw(mmA, mmB);
+ a.pi2fw(mmA, anyptr_gpB);
+ a.pmulhrw(mmA, mmB);
+ a.pmulhrw(mmA, anyptr_gpB);
+ a.pswapd(mmA, mmB);
+ a.pswapd(mmA, anyptr_gpB);
+ a.femms();
+
+ // SSE.
+ a.nop();
+
+ a.addps(xmmA, xmmB);
+ a.addps(xmmA, anyptr_gpB);
+ a.addss(xmmA, xmmB);
+ a.addss(xmmA, anyptr_gpB);
+ a.andnps(xmmA, xmmB);
+ a.andnps(xmmA, anyptr_gpB);
+ a.andps(xmmA, xmmB);
+ a.andps(xmmA, anyptr_gpB);
+ a.cmpps(xmmA, xmmB, 0);
+ a.cmpps(xmmA, anyptr_gpB, 0);
+ a.cmpss(xmmA, xmmB, 0);
+ a.cmpss(xmmA, anyptr_gpB, 0);
+ a.comiss(xmmA, xmmB);
+ a.comiss(xmmA, anyptr_gpB);
+ a.cvtpi2ps(xmmA, mmB);
+ a.cvtpi2ps(xmmA, anyptr_gpB);
+ a.cvtps2pi(mmA, xmmB);
+ a.cvtps2pi(mmA, anyptr_gpB);
+ a.cvtsi2ss(xmmA, gdB);
+ a.cvtsi2ss(xmmA, gzB);
+ a.cvtsi2ss(xmmA, anyptr_gpB);
+ a.cvtss2si(gdA, xmmB);
+ a.cvtss2si(gzA, xmmB);
+ a.cvtss2si(gdA, anyptr_gpB);
+ a.cvtss2si(gzA, anyptr_gpB);
+ a.cvttps2pi(mmA, xmmB);
+ a.cvttps2pi(mmA, anyptr_gpB);
+ a.cvttss2si(gdA, xmmB);
+ a.cvttss2si(gzA, xmmB);
+ a.cvttss2si(gdA, anyptr_gpB);
+ a.cvttss2si(gzA, anyptr_gpB);
+ a.divps(xmmA, xmmB);
+ a.divps(xmmA, anyptr_gpB);
+ a.divss(xmmA, xmmB);
+ a.divss(xmmA, anyptr_gpB);
+ a.ldmxcsr(anyptr_gpA);
+ a.maskmovq(mmA, mmB); // Implicit mmA, mmB, <ds:[EDI|RDI]>
+ a.maskmovq(mmA, mmB, ptr(a.zdi())); // Explicit mmA, mmB, <ds:[EDI|RDI]>
+ a.maxps(xmmA, xmmB);
+ a.maxps(xmmA, anyptr_gpB);
+ a.maxss(xmmA, xmmB);
+ a.maxss(xmmA, anyptr_gpB);
+ a.minps(xmmA, xmmB);
+ a.minps(xmmA, anyptr_gpB);
+ a.minss(xmmA, xmmB);
+ a.minss(xmmA, anyptr_gpB);
+ a.movaps(xmmA, xmmB);
+ a.movaps(xmmA, anyptr_gpB);
+ a.movaps(anyptr_gpA, xmmB);
+ a.movd(anyptr_gpA, xmmB);
+ a.movd(gdA, xmmB);
+ a.movd(gzA, xmmB);
+ a.movd(xmmA, anyptr_gpB);
+ a.movd(xmmA, gdB);
+ a.movd(xmmA, gzB);
+ a.movq(mmA, mmB);
+ a.movq(xmmA, xmmB);
+ a.movq(anyptr_gpA, xmmB);
+ a.movq(xmmA, anyptr_gpB);
+ a.movntq(anyptr_gpA, mmB);
+ a.movhlps(xmmA, xmmB);
+ a.movhps(xmmA, anyptr_gpB);
+ a.movhps(anyptr_gpA, xmmB);
+ a.movlhps(xmmA, xmmB);
+ a.movlps(xmmA, anyptr_gpB);
+ a.movlps(anyptr_gpA, xmmB);
+ a.movntps(anyptr_gpA, xmmB);
+ a.movss(xmmA, anyptr_gpB);
+ a.movss(anyptr_gpA, xmmB);
+ a.movups(xmmA, xmmB);
+ a.movups(xmmA, anyptr_gpB);
+ a.movups(anyptr_gpA, xmmB);
+ a.mulps(xmmA, xmmB);
+ a.mulps(xmmA, anyptr_gpB);
+ a.mulss(xmmA, xmmB);
+ a.mulss(xmmA, anyptr_gpB);
+ a.orps(xmmA, xmmB);
+ a.orps(xmmA, anyptr_gpB);
+ a.pavgb(mmA, mmB);
+ a.pavgb(mmA, anyptr_gpB);
+ a.pavgw(mmA, mmB);
+ a.pavgw(mmA, anyptr_gpB);
+ a.pextrw(gdA, mmB, 0);
+ a.pextrw(gzA, mmB, 0);
+ a.pinsrw(mmA, gdB, 0);
+ a.pinsrw(mmA, gzB, 0);
+ a.pinsrw(mmA, anyptr_gpB, 0);
+ a.pmaxsw(mmA, mmB);
+ a.pmaxsw(mmA, anyptr_gpB);
+ a.pmaxub(mmA, mmB);
+ a.pmaxub(mmA, anyptr_gpB);
+ a.pminsw(mmA, mmB);
+ a.pminsw(mmA, anyptr_gpB);
+ a.pminub(mmA, mmB);
+ a.pminub(mmA, anyptr_gpB);
+ a.pmovmskb(gdA, mmB);
+ a.pmovmskb(gzA, mmB);
+ a.pmulhuw(mmA, mmB);
+ a.pmulhuw(mmA, anyptr_gpB);
+ a.psadbw(mmA, mmB);
+ a.psadbw(mmA, anyptr_gpB);
+ a.pshufw(mmA, mmB, 0);
+ a.pshufw(mmA, anyptr_gpB, 0);
+ a.rcpps(xmmA, xmmB);
+ a.rcpps(xmmA, anyptr_gpB);
+ a.rcpss(xmmA, xmmB);
+ a.rcpss(xmmA, anyptr_gpB);
+ a.psadbw(xmmA, xmmB);
+ a.psadbw(xmmA, anyptr_gpB);
+ a.rsqrtps(xmmA, xmmB);
+ a.rsqrtps(xmmA, anyptr_gpB);
+ a.rsqrtss(xmmA, xmmB);
+ a.rsqrtss(xmmA, anyptr_gpB);
+ a.sfence();
+ a.shufps(xmmA, xmmB, 0);
+ a.shufps(xmmA, anyptr_gpB, 0);
+ a.sqrtps(xmmA, xmmB);
+ a.sqrtps(xmmA, anyptr_gpB);
+ a.sqrtss(xmmA, xmmB);
+ a.sqrtss(xmmA, anyptr_gpB);
+ a.stmxcsr(anyptr_gpA);
+ a.subps(xmmA, xmmB);
+ a.subps(xmmA, anyptr_gpB);
+ a.subss(xmmA, xmmB);
+ a.subss(xmmA, anyptr_gpB);
+ a.ucomiss(xmmA, xmmB);
+ a.ucomiss(xmmA, anyptr_gpB);
+ a.unpckhps(xmmA, xmmB);
+ a.unpckhps(xmmA, anyptr_gpB);
+ a.unpcklps(xmmA, xmmB);
+ a.unpcklps(xmmA, anyptr_gpB);
+ a.xorps(xmmA, xmmB);
+ a.xorps(xmmA, anyptr_gpB);
+
+ // SSE2.
+ a.nop();
+
+ a.addpd(xmmA, xmmB);
+ a.addpd(xmmA, anyptr_gpB);
+ a.addsd(xmmA, xmmB);
+ a.addsd(xmmA, anyptr_gpB);
+ a.andnpd(xmmA, xmmB);
+ a.andnpd(xmmA, anyptr_gpB);
+ a.andpd(xmmA, xmmB);
+ a.andpd(xmmA, anyptr_gpB);
+ a.cmppd(xmmA, xmmB, 0);
+ a.cmppd(xmmA, anyptr_gpB, 0);
+ a.cmpsd(xmmA, xmmB, 0);
+ a.cmpsd(xmmA, anyptr_gpB, 0);
+ a.comisd(xmmA, xmmB);
+ a.comisd(xmmA, anyptr_gpB);
+ a.cvtdq2pd(xmmA, xmmB);
+ a.cvtdq2pd(xmmA, anyptr_gpB);
+ a.cvtdq2ps(xmmA, xmmB);
+ a.cvtdq2ps(xmmA, anyptr_gpB);
+ a.cvtpd2dq(xmmA, xmmB);
+ a.cvtpd2dq(xmmA, anyptr_gpB);
+ a.cvtpd2pi(mmA, xmmB);
+ a.cvtpd2pi(mmA, anyptr_gpB);
+ a.cvtpd2ps(xmmA, xmmB);
+ a.cvtpd2ps(xmmA, anyptr_gpB);
+ a.cvtpi2pd(xmmA, mmB);
+ a.cvtpi2pd(xmmA, anyptr_gpB);
+ a.cvtps2dq(xmmA, xmmB);
+ a.cvtps2dq(xmmA, anyptr_gpB);
+ a.cvtps2pd(xmmA, xmmB);
+ a.cvtps2pd(xmmA, anyptr_gpB);
+ a.cvtsd2si(gdA, xmmB);
+ a.cvtsd2si(gzA, xmmB);
+ a.cvtsd2si(gdA, anyptr_gpB);
+ a.cvtsd2si(gzA, anyptr_gpB);
+ a.cvtsd2ss(xmmA, xmmB);
+ a.cvtsd2ss(xmmA, anyptr_gpB);
+ a.cvtsi2sd(xmmA, gdB);
+ a.cvtsi2sd(xmmA, gzB);
+ a.cvtsi2sd(xmmA, anyptr_gpB);
+ a.cvtss2sd(xmmA, xmmB);
+ a.cvtss2sd(xmmA, anyptr_gpB);
+ a.cvtss2si(gdA, xmmB);
+ a.cvtss2si(gzA, xmmB);
+ a.cvtss2si(gdA, anyptr_gpB);
+ a.cvtss2si(gzA, anyptr_gpB);
+ a.cvttpd2pi(mmA, xmmB);
+ a.cvttpd2pi(mmA, anyptr_gpB);
+ a.cvttpd2dq(xmmA, xmmB);
+ a.cvttpd2dq(xmmA, anyptr_gpB);
+ a.cvttps2dq(xmmA, xmmB);
+ a.cvttps2dq(xmmA, anyptr_gpB);
+ a.cvttsd2si(gdA, xmmB);
+ a.cvttsd2si(gzA, xmmB);
+ a.cvttsd2si(gdA, anyptr_gpB);
+ a.cvttsd2si(gzA, anyptr_gpB);
+ a.divpd(xmmA, xmmB);
+ a.divpd(xmmA, anyptr_gpB);
+ a.divsd(xmmA, xmmB);
+ a.divsd(xmmA, anyptr_gpB);
+ a.lfence();
+ a.maskmovdqu(xmmA, xmmB); // Implicit xmmA, xmmB, <ds:[EDI|RDI]>
+ a.maskmovdqu(xmmA, xmmB, ptr(a.zdi())); // Explicit xmmA, xmmB, <ds:[EDI|RDI]>
+ a.maxpd(xmmA, xmmB);
+ a.maxpd(xmmA, anyptr_gpB);
+ a.maxsd(xmmA, xmmB);
+ a.maxsd(xmmA, anyptr_gpB);
+ a.mfence();
+ a.minpd(xmmA, xmmB);
+ a.minpd(xmmA, anyptr_gpB);
+ a.minsd(xmmA, xmmB);
+ a.minsd(xmmA, anyptr_gpB);
+ a.movdqa(xmmA, xmmB);
+ a.movdqa(xmmA, anyptr_gpB);
+ a.movdqa(anyptr_gpA, xmmB);
+ a.movdqu(xmmA, xmmB);
+ a.movdqu(xmmA, anyptr_gpB);
+ a.movdqu(anyptr_gpA, xmmB);
+ a.movmskps(gdA, xmmB);
+ a.movmskps(gzA, xmmB);
+ a.movmskpd(gdA, xmmB);
+ a.movmskpd(gzA, xmmB);
+ a.movsd(xmmA, xmmB);
+ a.movsd(xmmA, anyptr_gpB);
+ a.movsd(anyptr_gpA, xmmB);
+ a.movapd(xmmA, anyptr_gpB);
+ a.movapd(anyptr_gpA, xmmB);
+ a.movdq2q(mmA, xmmB);
+ a.movq2dq(xmmA, mmB);
+ a.movhpd(xmmA, anyptr_gpB);
+ a.movhpd(anyptr_gpA, xmmB);
+ a.movlpd(xmmA, anyptr_gpB);
+ a.movlpd(anyptr_gpA, xmmB);
+ a.movntdq(anyptr_gpA, xmmB);
+ a.movnti(anyptr_gpA, gdB);
+ a.movnti(anyptr_gpA, gzB);
+ a.movntpd(anyptr_gpA, xmmB);
+ a.movupd(xmmA, anyptr_gpB);
+ a.movupd(anyptr_gpA, xmmB);
+ a.mulpd(xmmA, xmmB);
+ a.mulpd(xmmA, anyptr_gpB);
+ a.mulsd(xmmA, xmmB);
+ a.mulsd(xmmA, anyptr_gpB);
+ a.orpd(xmmA, xmmB);
+ a.orpd(xmmA, anyptr_gpB);
+ a.packsswb(xmmA, xmmB);
+ a.packsswb(xmmA, anyptr_gpB);
+ a.packssdw(xmmA, xmmB);
+ a.packssdw(xmmA, anyptr_gpB);
+ a.packuswb(xmmA, xmmB);
+ a.packuswb(xmmA, anyptr_gpB);
+ a.paddb(xmmA, xmmB);
+ a.paddb(xmmA, anyptr_gpB);
+ a.paddw(xmmA, xmmB);
+ a.paddw(xmmA, anyptr_gpB);
+ a.paddd(xmmA, xmmB);
+ a.paddd(xmmA, anyptr_gpB);
+ a.paddq(mmA, mmB);
+ a.paddq(mmA, anyptr_gpB);
+ a.paddq(xmmA, xmmB);
+ a.paddq(xmmA, anyptr_gpB);
+ a.paddsb(xmmA, xmmB);
+ a.paddsb(xmmA, anyptr_gpB);
+ a.paddsw(xmmA, xmmB);
+ a.paddsw(xmmA, anyptr_gpB);
+ a.paddusb(xmmA, xmmB);
+ a.paddusb(xmmA, anyptr_gpB);
+ a.paddusw(xmmA, xmmB);
+ a.paddusw(xmmA, anyptr_gpB);
+ a.pand(xmmA, xmmB);
+ a.pand(xmmA, anyptr_gpB);
+ a.pandn(xmmA, xmmB);
+ a.pandn(xmmA, anyptr_gpB);
+ a.pause();
+ a.pavgb(xmmA, xmmB);
+ a.pavgb(xmmA, anyptr_gpB);
+ a.pavgw(xmmA, xmmB);
+ a.pavgw(xmmA, anyptr_gpB);
+ a.pcmpeqb(xmmA, xmmB);
+ a.pcmpeqb(xmmA, anyptr_gpB);
+ a.pcmpeqw(xmmA, xmmB);
+ a.pcmpeqw(xmmA, anyptr_gpB);
+ a.pcmpeqd(xmmA, xmmB);
+ a.pcmpeqd(xmmA, anyptr_gpB);
+ a.pcmpgtb(xmmA, xmmB);
+ a.pcmpgtb(xmmA, anyptr_gpB);
+ a.pcmpgtw(xmmA, xmmB);
+ a.pcmpgtw(xmmA, anyptr_gpB);
+ a.pcmpgtd(xmmA, xmmB);
+ a.pcmpgtd(xmmA, anyptr_gpB);
+ a.pmaxsw(xmmA, xmmB);
+ a.pmaxsw(xmmA, anyptr_gpB);
+ a.pmaxub(xmmA, xmmB);
+ a.pmaxub(xmmA, anyptr_gpB);
+ a.pminsw(xmmA, xmmB);
+ a.pminsw(xmmA, anyptr_gpB);
+ a.pminub(xmmA, xmmB);
+ a.pminub(xmmA, anyptr_gpB);
+ a.pmovmskb(gdA, xmmB);
+ a.pmovmskb(gzA, xmmB);
+ a.pmulhw(xmmA, xmmB);
+ a.pmulhw(xmmA, anyptr_gpB);
+ a.pmulhuw(xmmA, xmmB);
+ a.pmulhuw(xmmA, anyptr_gpB);
+ a.pmullw(xmmA, xmmB);
+ a.pmullw(xmmA, anyptr_gpB);
+ a.pmuludq(mmA, mmB);
+ a.pmuludq(mmA, anyptr_gpB);
+ a.pmuludq(xmmA, xmmB);
+ a.pmuludq(xmmA, anyptr_gpB);
+ a.por(xmmA, xmmB);
+ a.por(xmmA, anyptr_gpB);
+ a.pslld(xmmA, xmmB);
+ a.pslld(xmmA, anyptr_gpB);
+ a.pslld(xmmA, 0);
+ a.psllq(xmmA, xmmB);
+ a.psllq(xmmA, anyptr_gpB);
+ a.psllq(xmmA, 0);
+ a.psllw(xmmA, xmmB);
+ a.psllw(xmmA, anyptr_gpB);
+ a.psllw(xmmA, 0);
+ a.pslldq(xmmA, 0);
+ a.psrad(xmmA, xmmB);
+ a.psrad(xmmA, anyptr_gpB);
+ a.psrad(xmmA, 0);
+ a.psraw(xmmA, xmmB);
+ a.psraw(xmmA, anyptr_gpB);
+ a.psraw(xmmA, 0);
+ a.psubb(xmmA, xmmB);
+ a.psubb(xmmA, anyptr_gpB);
+ a.psubw(xmmA, xmmB);
+ a.psubw(xmmA, anyptr_gpB);
+ a.psubd(xmmA, xmmB);
+ a.psubd(xmmA, anyptr_gpB);
+ a.psubq(mmA, mmB);
+ a.psubq(mmA, anyptr_gpB);
+ a.psubq(xmmA, xmmB);
+ a.psubq(xmmA, anyptr_gpB);
+ a.pmaddwd(xmmA, xmmB);
+ a.pmaddwd(xmmA, anyptr_gpB);
+ a.pshufd(xmmA, xmmB, 0);
+ a.pshufd(xmmA, anyptr_gpB, 0);
+ a.pshufhw(xmmA, xmmB, 0);
+ a.pshufhw(xmmA, anyptr_gpB, 0);
+ a.pshuflw(xmmA, xmmB, 0);
+ a.pshuflw(xmmA, anyptr_gpB, 0);
+ a.psrld(xmmA, xmmB);
+ a.psrld(xmmA, anyptr_gpB);
+ a.psrld(xmmA, 0);
+ a.psrlq(xmmA, xmmB);
+ a.psrlq(xmmA, anyptr_gpB);
+ a.psrlq(xmmA, 0);
+ a.psrldq(xmmA, 0);
+ a.psrlw(xmmA, xmmB);
+ a.psrlw(xmmA, anyptr_gpB);
+ a.psrlw(xmmA, 0);
+ a.psubsb(xmmA, xmmB);
+ a.psubsb(xmmA, anyptr_gpB);
+ a.psubsw(xmmA, xmmB);
+ a.psubsw(xmmA, anyptr_gpB);
+ a.psubusb(xmmA, xmmB);
+ a.psubusb(xmmA, anyptr_gpB);
+ a.psubusw(xmmA, xmmB);
+ a.psubusw(xmmA, anyptr_gpB);
+ a.punpckhbw(xmmA, xmmB);
+ a.punpckhbw(xmmA, anyptr_gpB);
+ a.punpckhwd(xmmA, xmmB);
+ a.punpckhwd(xmmA, anyptr_gpB);
+ a.punpckhdq(xmmA, xmmB);
+ a.punpckhdq(xmmA, anyptr_gpB);
+ a.punpckhqdq(xmmA, xmmB);
+ a.punpckhqdq(xmmA, anyptr_gpB);
+ a.punpcklbw(xmmA, xmmB);
+ a.punpcklbw(xmmA, anyptr_gpB);
+ a.punpcklwd(xmmA, xmmB);
+ a.punpcklwd(xmmA, anyptr_gpB);
+ a.punpckldq(xmmA, xmmB);
+ a.punpckldq(xmmA, anyptr_gpB);
+ a.punpcklqdq(xmmA, xmmB);
+ a.punpcklqdq(xmmA, anyptr_gpB);
+ a.pxor(xmmA, xmmB);
+ a.pxor(xmmA, anyptr_gpB);
+ a.sqrtpd(xmmA, xmmB);
+ a.sqrtpd(xmmA, anyptr_gpB);
+ a.sqrtsd(xmmA, xmmB);
+ a.sqrtsd(xmmA, anyptr_gpB);
+ a.subpd(xmmA, xmmB);
+ a.subpd(xmmA, anyptr_gpB);
+ a.subsd(xmmA, xmmB);
+ a.subsd(xmmA, anyptr_gpB);
+ a.ucomisd(xmmA, xmmB);
+ a.ucomisd(xmmA, anyptr_gpB);
+ a.unpckhpd(xmmA, xmmB);
+ a.unpckhpd(xmmA, anyptr_gpB);
+ a.unpcklpd(xmmA, xmmB);
+ a.unpcklpd(xmmA, anyptr_gpB);
+ a.xorpd(xmmA, xmmB);
+ a.xorpd(xmmA, anyptr_gpB);
+
+ // SSE3.
+ a.nop();
+
+ a.addsubpd(xmmA, xmmB);
+ a.addsubpd(xmmA, anyptr_gpB);
+ a.addsubps(xmmA, xmmB);
+ a.addsubps(xmmA, anyptr_gpB);
+ a.fisttp(dword_ptr(gzA));
+ a.haddpd(xmmA, xmmB);
+ a.haddpd(xmmA, anyptr_gpB);
+ a.haddps(xmmA, xmmB);
+ a.haddps(xmmA, anyptr_gpB);
+ a.hsubpd(xmmA, xmmB);
+ a.hsubpd(xmmA, anyptr_gpB);
+ a.hsubps(xmmA, xmmB);
+ a.hsubps(xmmA, anyptr_gpB);
+ a.lddqu(xmmA, anyptr_gpB);
+ a.monitor();
+ a.movddup(xmmA, xmmB);
+ a.movddup(xmmA, anyptr_gpB);
+ a.movshdup(xmmA, xmmB);
+ a.movshdup(xmmA, anyptr_gpB);
+ a.movsldup(xmmA, xmmB);
+ a.movsldup(xmmA, anyptr_gpB);
+ a.mwait();
+
+ // SSSE3.
+ a.nop();
+
+ a.psignb(mmA, mmB);
+ a.psignb(mmA, anyptr_gpB);
+ a.psignb(xmmA, xmmB);
+ a.psignb(xmmA, anyptr_gpB);
+ a.psignw(mmA, mmB);
+ a.psignw(mmA, anyptr_gpB);
+ a.psignw(xmmA, xmmB);
+ a.psignw(xmmA, anyptr_gpB);
+ a.psignd(mmA, mmB);
+ a.psignd(mmA, anyptr_gpB);
+ a.psignd(xmmA, xmmB);
+ a.psignd(xmmA, anyptr_gpB);
+ a.phaddw(mmA, mmB);
+ a.phaddw(mmA, anyptr_gpB);
+ a.phaddw(xmmA, xmmB);
+ a.phaddw(xmmA, anyptr_gpB);
+ a.phaddd(mmA, mmB);
+ a.phaddd(mmA, anyptr_gpB);
+ a.phaddd(xmmA, xmmB);
+ a.phaddd(xmmA, anyptr_gpB);
+ a.phaddsw(mmA, mmB);
+ a.phaddsw(mmA, anyptr_gpB);
+ a.phaddsw(xmmA, xmmB);
+ a.phaddsw(xmmA, anyptr_gpB);
+ a.phsubw(mmA, mmB);
+ a.phsubw(mmA, anyptr_gpB);
+ a.phsubw(xmmA, xmmB);
+ a.phsubw(xmmA, anyptr_gpB);
+ a.phsubd(mmA, mmB);
+ a.phsubd(mmA, anyptr_gpB);
+ a.phsubd(xmmA, xmmB);
+ a.phsubd(xmmA, anyptr_gpB);
+ a.phsubsw(mmA, mmB);
+ a.phsubsw(mmA, anyptr_gpB);
+ a.phsubsw(xmmA, xmmB);
+ a.phsubsw(xmmA, anyptr_gpB);
+ a.pmaddubsw(mmA, mmB);
+ a.pmaddubsw(mmA, anyptr_gpB);
+ a.pmaddubsw(xmmA, xmmB);
+ a.pmaddubsw(xmmA, anyptr_gpB);
+ a.pabsb(mmA, mmB);
+ a.pabsb(mmA, anyptr_gpB);
+ a.pabsb(xmmA, xmmB);
+ a.pabsb(xmmA, anyptr_gpB);
+ a.pabsw(mmA, mmB);
+ a.pabsw(mmA, anyptr_gpB);
+ a.pabsw(xmmA, xmmB);
+ a.pabsw(xmmA, anyptr_gpB);
+ a.pabsd(mmA, mmB);
+ a.pabsd(mmA, anyptr_gpB);
+ a.pabsd(xmmA, xmmB);
+ a.pabsd(xmmA, anyptr_gpB);
+ a.pmulhrsw(mmA, mmB);
+ a.pmulhrsw(mmA, anyptr_gpB);
+ a.pmulhrsw(xmmA, xmmB);
+ a.pmulhrsw(xmmA, anyptr_gpB);
+ a.pshufb(mmA, mmB);
+ a.pshufb(mmA, anyptr_gpB);
+ a.pshufb(xmmA, xmmB);
+ a.pshufb(xmmA, anyptr_gpB);
+ a.palignr(mmA, mmB, 0);
+ a.palignr(mmA, anyptr_gpB, 0);
+ a.palignr(xmmA, xmmB, 0);
+ a.palignr(xmmA, anyptr_gpB, 0);
+
+ // SSE4.1.
+ a.nop();
+
+ a.blendpd(xmmA, xmmB, 0);
+ a.blendpd(xmmA, anyptr_gpB, 0);
+ a.blendps(xmmA, xmmB, 0);
+ a.blendps(xmmA, anyptr_gpB, 0);
+ a.blendvpd(xmmA, xmmB); // Implicit xmmA, xmmB, <XMM0>
+ a.blendvpd(xmmA, xmmB, xmm0); // Explicit xmmA, xmmB, <XMM0>
+ a.blendvpd(xmmA, anyptr_gpB); // Implicit xmmA, mem , <XMM0>
+ a.blendvpd(xmmA, anyptr_gpB, xmm0); // Explicit xmmA, mem , <XMM0>
+ a.blendvps(xmmA, xmmB); // Implicit xmmA, xmmB, <XMM0>
+ a.blendvps(xmmA, xmmB, xmm0); // Explicit xmmA, xmmB, <XMM0>
+ a.blendvps(xmmA, anyptr_gpB); // Implicit xmmA, mem , <XMM0>
+ a.blendvps(xmmA, anyptr_gpB, xmm0); // Explicit xmmA, mem , <XMM0>
+
+ a.dppd(xmmA, xmmB, 0);
+ a.dppd(xmmA, anyptr_gpB, 0);
+ a.dpps(xmmA, xmmB, 0);
+ a.dpps(xmmA, anyptr_gpB, 0);
+ a.extractps(gdA, xmmB, 0);
+ a.extractps(gzA, xmmB, 0);
+ a.extractps(anyptr_gpA, xmmB, 0);
+ a.insertps(xmmA, xmmB, 0);
+ a.insertps(xmmA, anyptr_gpB, 0);
+ a.movntdqa(xmmA, anyptr_gpB);
+ a.mpsadbw(xmmA, xmmB, 0);
+ a.mpsadbw(xmmA, anyptr_gpB, 0);
+ a.packusdw(xmmA, xmmB);
+ a.packusdw(xmmA, anyptr_gpB);
+ a.pblendvb(xmmA, xmmB); // Implicit xmmA, xmmB, <XMM0>
+ a.pblendvb(xmmA, xmmB, xmm0); // Explicit xmmA, xmmB, <XMM0>
+ a.pblendvb(xmmA, anyptr_gpB); // Implicit xmmA, mem, <XMM0>
+ a.pblendvb(xmmA, anyptr_gpB, xmm0); // Explicit xmmA, mem, <XMM0>
+ a.pblendw(xmmA, xmmB, 0);
+ a.pblendw(xmmA, anyptr_gpB, 0);
+ a.pcmpeqq(xmmA, xmmB);
+ a.pcmpeqq(xmmA, anyptr_gpB);
+ a.pextrb(gdA, xmmB, 0);
+ a.pextrb(gzA, xmmB, 0);
+ a.pextrb(anyptr_gpA, xmmB, 0);
+ a.pextrd(gdA, xmmB, 0);
+ a.pextrd(gzA, xmmB, 0);
+ a.pextrd(anyptr_gpA, xmmB, 0);
+ if (isX64) a.pextrq(gzA, xmmB, 0);
+ if (isX64) a.pextrq(anyptr_gpA, xmmB, 0);
+ a.pextrw(gdA, xmmB, 0);
+ a.pextrw(gzA, xmmB, 0);
+ a.pextrw(anyptr_gpA, xmmB, 0);
+ a.phminposuw(xmmA, xmmB);
+ a.phminposuw(xmmA, anyptr_gpB);
+ a.pinsrb(xmmA, gdB, 0);
+ a.pinsrb(xmmA, gzB, 0);
+ a.pinsrb(xmmA, anyptr_gpB, 0);
+ a.pinsrd(xmmA, gdB, 0);
+ a.pinsrd(xmmA, gzB, 0);
+ a.pinsrd(xmmA, anyptr_gpB, 0);
+ a.pinsrw(xmmA, gdB, 0);
+ a.pinsrw(xmmA, gzB, 0);
+ a.pinsrw(xmmA, anyptr_gpB, 0);
+ a.pmaxuw(xmmA, xmmB);
+ a.pmaxuw(xmmA, anyptr_gpB);
+ a.pmaxsb(xmmA, xmmB);
+ a.pmaxsb(xmmA, anyptr_gpB);
+ a.pmaxsd(xmmA, xmmB);
+ a.pmaxsd(xmmA, anyptr_gpB);
+ a.pmaxud(xmmA, xmmB);
+ a.pmaxud(xmmA, anyptr_gpB);
+ a.pminsb(xmmA, xmmB);
+ a.pminsb(xmmA, anyptr_gpB);
+ a.pminuw(xmmA, xmmB);
+ a.pminuw(xmmA, anyptr_gpB);
+ a.pminud(xmmA, xmmB);
+ a.pminud(xmmA, anyptr_gpB);
+ a.pminsd(xmmA, xmmB);
+ a.pminsd(xmmA, anyptr_gpB);
+ a.pmovsxbw(xmmA, xmmB);
+ a.pmovsxbw(xmmA, anyptr_gpB);
+ a.pmovsxbd(xmmA, xmmB);
+ a.pmovsxbd(xmmA, anyptr_gpB);
+ a.pmovsxbq(xmmA, xmmB);
+ a.pmovsxbq(xmmA, anyptr_gpB);
+ a.pmovsxwd(xmmA, xmmB);
+ a.pmovsxwd(xmmA, anyptr_gpB);
+ a.pmovsxwq(xmmA, xmmB);
+ a.pmovsxwq(xmmA, anyptr_gpB);
+ a.pmovsxdq(xmmA, xmmB);
+ a.pmovsxdq(xmmA, anyptr_gpB);
+ a.pmovzxbw(xmmA, xmmB);
+ a.pmovzxbw(xmmA, anyptr_gpB);
+ a.pmovzxbd(xmmA, xmmB);
+ a.pmovzxbd(xmmA, anyptr_gpB);
+ a.pmovzxbq(xmmA, xmmB);
+ a.pmovzxbq(xmmA, anyptr_gpB);
+ a.pmovzxwd(xmmA, xmmB);
+ a.pmovzxwd(xmmA, anyptr_gpB);
+ a.pmovzxwq(xmmA, xmmB);
+ a.pmovzxwq(xmmA, anyptr_gpB);
+ a.pmovzxdq(xmmA, xmmB);
+ a.pmovzxdq(xmmA, anyptr_gpB);
+ a.pmuldq(xmmA, xmmB);
+ a.pmuldq(xmmA, anyptr_gpB);
+ a.pmulld(xmmA, xmmB);
+ a.pmulld(xmmA, anyptr_gpB);
+ a.ptest(xmmA, xmmB);
+ a.ptest(xmmA, anyptr_gpB);
+ a.roundps(xmmA, xmmB, 0);
+ a.roundps(xmmA, anyptr_gpB, 0);
+ a.roundss(xmmA, xmmB, 0);
+ a.roundss(xmmA, anyptr_gpB, 0);
+ a.roundpd(xmmA, xmmB, 0);
+ a.roundpd(xmmA, anyptr_gpB, 0);
+ a.roundsd(xmmA, xmmB, 0);
+ a.roundsd(xmmA, anyptr_gpB, 0);
+
+ // SSE4.2.
+ a.nop();
+
+ a.pcmpestri(xmmA, xmmB , imm(0)); // Implicit xmmA, xmmB, imm, <ECX>, <EAX>, <EDX>
+ a.pcmpestri(xmmA, xmmB , imm(0), ecx, eax, edx); // Explicit xmmA, xmmB, imm, <ECX>, <EAX>, <EDX>
+ a.pcmpestri(xmmA, anyptr_gpB, imm(0)); // Implicit xmmA, mem , imm, <ECX>, <EAX>, <EDX>
+ a.pcmpestri(xmmA, anyptr_gpB, imm(0), ecx, eax, edx); // Explicit xmmA, mem , imm, <ECX>, <EAX>, <EDX>
+ a.pcmpestrm(xmmA, xmmB , imm(0)); // Implicit xmmA, xmmB, imm, <XMM0>, <EAX>, <EDX>
+ a.pcmpestrm(xmmA, xmmB , imm(0), xmm0, eax, edx); // Explicit xmmA, xmmB, imm, <XMM0>, <EAX>, <EDX>
+ a.pcmpestrm(xmmA, anyptr_gpB, imm(0)); // Implicit xmmA, mem , imm, <XMM0>, <EAX>, <EDX>
+ a.pcmpestrm(xmmA, anyptr_gpB, imm(0), xmm0, eax, edx); // Explicit xmmA, mem , imm, <XMM0>, <EAX>, <EDX>
+ a.pcmpistri(xmmA, xmmB , imm(0)); // Implicit xmmA, xmmB, imm, <ECX>
+ a.pcmpistri(xmmA, xmmB , imm(0), ecx); // Explicit xmmA, xmmB, imm, <ECX>
+ a.pcmpistri(xmmA, anyptr_gpB, imm(0)); // Implicit xmmA, mem , imm, <ECX>
+ a.pcmpistri(xmmA, anyptr_gpB, imm(0), ecx); // Explicit xmmA, mem , imm, <ECX>
+ a.pcmpistrm(xmmA, xmmB , imm(0)); // Implicit xmmA, xmmB, imm, <XMM0>
+ a.pcmpistrm(xmmA, xmmB , imm(0), xmm0); // Explicit xmmA, xmmB, imm, <XMM0>
+ a.pcmpistrm(xmmA, anyptr_gpB, imm(0)); // Implicit xmmA, mem , imm, <XMM0>
+ a.pcmpistrm(xmmA, anyptr_gpB, imm(0), xmm0); // Explicit xmmA, mem , imm, <XMM0>
+
+ a.pcmpgtq(xmmA, xmmB);
+ a.pcmpgtq(xmmA, anyptr_gpB);
+
+ // SSE4A.
+ a.nop();
+
+ a.extrq(xmmA, xmmB);
+ a.extrq(xmmA, 0x1, 0x2);
+ a.extrq(xmmB, 0x1, 0x2);
+ a.insertq(xmmA, xmmB);
+ a.insertq(xmmA, xmmB, 0x1, 0x2);
+ a.movntsd(anyptr_gpA, xmmB);
+ a.movntss(anyptr_gpA, xmmB);
+
+ // AESNI.
+ a.nop();
+
+ a.aesdec(xmmA, xmmB);
+ a.aesdec(xmmA, anyptr_gpB);
+ a.aesdeclast(xmmA, xmmB);
+ a.aesdeclast(xmmA, anyptr_gpB);
+ a.aesenc(xmmA, xmmB);
+ a.aesenc(xmmA, anyptr_gpB);
+ a.aesenclast(xmmA, xmmB);
+ a.aesenclast(xmmA, anyptr_gpB);
+ a.aesimc(xmmA, xmmB);
+ a.aesimc(xmmA, anyptr_gpB);
+ a.aeskeygenassist(xmmA, xmmB, 0);
+ a.aeskeygenassist(xmmA, anyptr_gpB, 0);
+
+ // SHA.
+ a.nop();
+
+ a.sha1msg1(xmmA, xmmB);
+ a.sha1msg1(xmmA, anyptr_gpB);
+ a.sha1msg2(xmmA, xmmB);
+ a.sha1msg2(xmmA, anyptr_gpB);
+ a.sha1nexte(xmmA, xmmB);
+ a.sha1nexte(xmmA, anyptr_gpB);
+ a.sha1rnds4(xmmA, xmmB, 0);
+ a.sha1rnds4(xmmA, anyptr_gpB, 0);
+ a.sha256msg1(xmmA, xmmB);
+ a.sha256msg1(xmmA, anyptr_gpB);
+ a.sha256msg2(xmmA, xmmB);
+ a.sha256msg2(xmmA, anyptr_gpB);
+ a.sha256rnds2(xmmA, xmmB); // Implicit xmmA, xmmB, <XMM0>
+ a.sha256rnds2(xmmA, xmmB, xmm0); // Explicit xmmA, xmmB, <XMM0>
+ a.sha256rnds2(xmmA, anyptr_gpB); // Implicit xmmA, mem, <XMM0>
+ a.sha256rnds2(xmmA, anyptr_gpB, xmm0); // Explicit xmmA, mem, <XMM0>
+
+ // PCLMULQDQ.
+ a.nop();
+
+ a.pclmulqdq(xmmA, xmmB, 0);
+ a.pclmulqdq(xmmA, anyptr_gpB, 0);
+
+ // AVX.
+ a.nop();
+
+ a.vaddpd(xmmA, xmmB, xmmC);
+ a.vaddpd(xmmA, xmmB, anyptr_gpC);
+ a.vaddpd(ymmA, ymmB, ymmC);
+ a.vaddpd(ymmA, ymmB, anyptr_gpC);
+ a.vaddps(xmmA, xmmB, xmmC);
+ a.vaddps(xmmA, xmmB, anyptr_gpC);
+ a.vaddps(ymmA, ymmB, ymmC);
+ a.vaddps(ymmA, ymmB, anyptr_gpC);
+ a.vaddsd(xmmA, xmmB, xmmC);
+ a.vaddsd(xmmA, xmmB, anyptr_gpC);
+ a.vaddss(xmmA, xmmB, xmmC);
+ a.vaddss(xmmA, xmmB, anyptr_gpC);
+ a.vaddsubpd(xmmA, xmmB, xmmC);
+ a.vaddsubpd(xmmA, xmmB, anyptr_gpC);
+ a.vaddsubpd(ymmA, ymmB, ymmC);
+ a.vaddsubpd(ymmA, ymmB, anyptr_gpC);
+ a.vaddsubps(xmmA, xmmB, xmmC);
+ a.vaddsubps(xmmA, xmmB, anyptr_gpC);
+ a.vaddsubps(ymmA, ymmB, ymmC);
+ a.vaddsubps(ymmA, ymmB, anyptr_gpC);
+ a.vandpd(xmmA, xmmB, xmmC);
+ a.vandpd(xmmA, xmmB, anyptr_gpC);
+ a.vandpd(ymmA, ymmB, ymmC);
+ a.vandpd(ymmA, ymmB, anyptr_gpC);
+ a.vandps(xmmA, xmmB, xmmC);
+ a.vandps(xmmA, xmmB, anyptr_gpC);
+ a.vandps(ymmA, ymmB, ymmC);
+ a.vandps(ymmA, ymmB, anyptr_gpC);
+ a.vandnpd(xmmA, xmmB, xmmC);
+ a.vandnpd(xmmA, xmmB, anyptr_gpC);
+ a.vandnpd(ymmA, ymmB, ymmC);
+ a.vandnpd(ymmA, ymmB, anyptr_gpC);
+ a.vandnps(xmmA, xmmB, xmmC);
+ a.vandnps(xmmA, xmmB, anyptr_gpC);
+ a.vandnps(ymmA, ymmB, ymmC);
+ a.vandnps(ymmA, ymmB, anyptr_gpC);
+ a.vblendpd(xmmA, xmmB, xmmC, 0);
+ a.vblendpd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vblendpd(ymmA, ymmB, ymmC, 0);
+ a.vblendpd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vblendps(xmmA, xmmB, xmmC, 0);
+ a.vblendps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vblendps(ymmA, ymmB, ymmC, 0);
+ a.vblendps(ymmA, ymmB, anyptr_gpC, 0);
+ a.vblendvpd(xmmA, xmmB, xmmC, xmmD);
+ a.vblendvpd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vblendvpd(ymmA, ymmB, ymmC, ymmD);
+ a.vblendvpd(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vbroadcastf128(ymmA, anyptr_gpB);
+ a.vbroadcastsd(ymmA, anyptr_gpB);
+ a.vbroadcastss(xmmA, anyptr_gpB);
+ a.vbroadcastss(ymmA, anyptr_gpB);
+ a.vcmppd(xmmA, xmmB, xmmC, 0);
+ a.vcmppd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vcmppd(ymmA, ymmB, ymmC, 0);
+ a.vcmppd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vcmpps(xmmA, xmmB, xmmC, 0);
+ a.vcmpps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vcmpps(ymmA, ymmB, ymmC, 0);
+ a.vcmpps(ymmA, ymmB, anyptr_gpC, 0);
+ a.vcmpsd(xmmA, xmmB, xmmC, 0);
+ a.vcmpsd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vcmpss(xmmA, xmmB, xmmC, 0);
+ a.vcmpss(xmmA, xmmB, anyptr_gpC, 0);
+ a.vcomisd(xmmA, xmmB);
+ a.vcomisd(xmmA, anyptr_gpB);
+ a.vcomiss(xmmA, xmmB);
+ a.vcomiss(xmmA, anyptr_gpB);
+ a.vcvtdq2pd(xmmA, xmmB);
+ a.vcvtdq2pd(xmmA, anyptr_gpB);
+ a.vcvtdq2pd(ymmA, xmmB);
+ a.vcvtdq2pd(ymmA, anyptr_gpB);
+ a.vcvtdq2ps(xmmA, xmmB);
+ a.vcvtdq2ps(xmmA, anyptr_gpB);
+ a.vcvtdq2ps(ymmA, ymmB);
+ a.vcvtdq2ps(ymmA, anyptr_gpB);
+ a.vcvtpd2dq(xmmA, xmmB);
+ a.vcvtpd2dq(xmmA, ymmB);
+ a.vcvtpd2dq(xmmA, anyptr_gpB);
+ a.vcvtpd2ps(xmmA, xmmB);
+ a.vcvtpd2ps(xmmA, ymmB);
+ a.vcvtpd2ps(xmmA, anyptr_gpB);
+ a.vcvtps2dq(xmmA, xmmB);
+ a.vcvtps2dq(xmmA, anyptr_gpB);
+ a.vcvtps2dq(ymmA, ymmB);
+ a.vcvtps2dq(ymmA, anyptr_gpB);
+ a.vcvtps2pd(xmmA, xmmB);
+ a.vcvtps2pd(xmmA, anyptr_gpB);
+ a.vcvtps2pd(ymmA, xmmB);
+ a.vcvtps2pd(ymmA, anyptr_gpB);
+ a.vcvtsd2si(gzA, xmmB);
+ a.vcvtsd2si(gzA, anyptr_gpB);
+ a.vcvtsd2ss(xmmA, xmmB, xmmC);
+ a.vcvtsd2ss(xmmA, xmmB, anyptr_gpC);
+ a.vcvtsi2sd(xmmA, xmmB, gzC);
+ a.vcvtsi2sd(xmmA, xmmB, anyptr_gpC);
+ a.vcvtsi2ss(xmmA, xmmB, gzC);
+ a.vcvtsi2ss(xmmA, xmmB, anyptr_gpC);
+ a.vcvtss2sd(xmmA, xmmB, xmmC);
+ a.vcvtss2sd(xmmA, xmmB, anyptr_gpC);
+ a.vcvtss2si(gzA, xmmB);
+ a.vcvtss2si(gzA, anyptr_gpB);
+ a.vcvttpd2dq(xmmA, xmmB);
+ a.vcvttpd2dq(xmmA, ymmB);
+ a.vcvttpd2dq(xmmA, anyptr_gpB);
+ a.vcvttps2dq(xmmA, xmmB);
+ a.vcvttps2dq(xmmA, anyptr_gpB);
+ a.vcvttps2dq(ymmA, ymmB);
+ a.vcvttps2dq(ymmA, anyptr_gpB);
+ a.vcvttsd2si(gzA, xmmB);
+ a.vcvttsd2si(gzA, anyptr_gpB);
+ a.vcvttss2si(gzA, xmmB);
+ a.vcvttss2si(gzA, anyptr_gpB);
+ a.vdivpd(xmmA, xmmB, xmmC);
+ a.vdivpd(xmmA, xmmB, anyptr_gpC);
+ a.vdivpd(ymmA, ymmB, ymmC);
+ a.vdivpd(ymmA, ymmB, anyptr_gpC);
+ a.vdivps(xmmA, xmmB, xmmC);
+ a.vdivps(xmmA, xmmB, anyptr_gpC);
+ a.vdivps(ymmA, ymmB, ymmC);
+ a.vdivps(ymmA, ymmB, anyptr_gpC);
+ a.vdivsd(xmmA, xmmB, xmmC);
+ a.vdivsd(xmmA, xmmB, anyptr_gpC);
+ a.vdivss(xmmA, xmmB, xmmC);
+ a.vdivss(xmmA, xmmB, anyptr_gpC);
+ a.vdppd(xmmA, xmmB, xmmC, 0);
+ a.vdppd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vdpps(xmmA, xmmB, xmmC, 0);
+ a.vdpps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vdpps(ymmA, ymmB, ymmC, 0);
+ a.vdpps(ymmA, ymmB, anyptr_gpC, 0);
+ a.vextractf128(xmmA, ymmB, 0);
+ a.vextractf128(anyptr_gpA, ymmB, 0);
+ a.vextractps(gzA, xmmB, 0);
+ a.vextractps(anyptr_gpA, xmmB, 0);
+ a.vhaddpd(xmmA, xmmB, xmmC);
+ a.vhaddpd(xmmA, xmmB, anyptr_gpC);
+ a.vhaddpd(ymmA, ymmB, ymmC);
+ a.vhaddpd(ymmA, ymmB, anyptr_gpC);
+ a.vhaddps(xmmA, xmmB, xmmC);
+ a.vhaddps(xmmA, xmmB, anyptr_gpC);
+ a.vhaddps(ymmA, ymmB, ymmC);
+ a.vhaddps(ymmA, ymmB, anyptr_gpC);
+ a.vhsubpd(xmmA, xmmB, xmmC);
+ a.vhsubpd(xmmA, xmmB, anyptr_gpC);
+ a.vhsubpd(ymmA, ymmB, ymmC);
+ a.vhsubpd(ymmA, ymmB, anyptr_gpC);
+ a.vhsubps(xmmA, xmmB, xmmC);
+ a.vhsubps(xmmA, xmmB, anyptr_gpC);
+ a.vhsubps(ymmA, ymmB, ymmC);
+ a.vhsubps(ymmA, ymmB, anyptr_gpC);
+ a.vinsertf128(ymmA, ymmB, xmmC, 0);
+ a.vinsertf128(ymmA, ymmB, anyptr_gpC, 0);
+ a.vinsertps(xmmA, xmmB, xmmC, 0);
+ a.vinsertps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vlddqu(xmmA, anyptr_gpB);
+ a.vlddqu(ymmA, anyptr_gpB);
+ a.vldmxcsr(anyptr_gpA);
+ a.vmaskmovdqu(xmmA, xmmB); // Implicit xmmA, xmmB, <ds:[EDI|RDI]>
+ a.vmaskmovdqu(xmmA, xmmB, ptr(a.zdi())); // Explicit xmmA, xmmB, <ds:[EDI|RDI]>
+ a.vmaskmovps(xmmA, xmmB, anyptr_gpC);
+ a.vmaskmovps(ymmA, ymmB, anyptr_gpC);
+ a.vmaskmovps(anyptr_gpA, xmmB, xmmC);
+ a.vmaskmovps(anyptr_gpA, ymmB, ymmC);
+ a.vmaskmovpd(xmmA, xmmB, anyptr_gpC);
+ a.vmaskmovpd(ymmA, ymmB, anyptr_gpC);
+ a.vmaskmovpd(anyptr_gpA, xmmB, xmmC);
+ a.vmaskmovpd(anyptr_gpA, ymmB, ymmC);
+ a.vmaxpd(xmmA, xmmB, xmmC);
+ a.vmaxpd(xmmA, xmmB, anyptr_gpC);
+ a.vmaxpd(ymmA, ymmB, ymmC);
+ a.vmaxpd(ymmA, ymmB, anyptr_gpC);
+ a.vmaxps(xmmA, xmmB, xmmC);
+ a.vmaxps(xmmA, xmmB, anyptr_gpC);
+ a.vmaxps(ymmA, ymmB, ymmC);
+ a.vmaxps(ymmA, ymmB, anyptr_gpC);
+ a.vmaxsd(xmmA, xmmB, xmmC);
+ a.vmaxsd(xmmA, xmmB, anyptr_gpC);
+ a.vmaxss(xmmA, xmmB, xmmC);
+ a.vmaxss(xmmA, xmmB, anyptr_gpC);
+ a.vminpd(xmmA, xmmB, xmmC);
+ a.vminpd(xmmA, xmmB, anyptr_gpC);
+ a.vminpd(ymmA, ymmB, ymmC);
+ a.vminpd(ymmA, ymmB, anyptr_gpC);
+ a.vminps(xmmA, xmmB, xmmC);
+ a.vminps(xmmA, xmmB, anyptr_gpC);
+ a.vminps(ymmA, ymmB, ymmC);
+ a.vminps(ymmA, ymmB, anyptr_gpC);
+ a.vminsd(xmmA, xmmB, xmmC);
+ a.vminsd(xmmA, xmmB, anyptr_gpC);
+ a.vminss(xmmA, xmmB, xmmC);
+ a.vminss(xmmA, xmmB, anyptr_gpC);
+ a.vmovapd(xmmA, xmmB);
+ a.vmovapd(xmmA, anyptr_gpB);
+ a.vmovapd(anyptr_gpA, xmmB);
+ a.vmovapd(ymmA, ymmB);
+ a.vmovapd(ymmA, anyptr_gpB);
+ a.vmovapd(anyptr_gpA, ymmB);
+ a.vmovaps(xmmA, xmmB);
+ a.vmovaps(xmmA, anyptr_gpB);
+ a.vmovaps(anyptr_gpA, xmmB);
+ a.vmovaps(ymmA, ymmB);
+ a.vmovaps(ymmA, anyptr_gpB);
+ a.vmovaps(anyptr_gpA, ymmB);
+ a.vmovd(xmmA, gzB);
+ a.vmovd(xmmA, anyptr_gpB);
+ a.vmovd(gzA, xmmB);
+ a.vmovd(anyptr_gpA, xmmB);
+ a.vmovddup(xmmA, xmmB);
+ a.vmovddup(xmmA, anyptr_gpB);
+ a.vmovddup(ymmA, ymmB);
+ a.vmovddup(ymmA, anyptr_gpB);
+ a.vmovdqa(xmmA, xmmB);
+ a.vmovdqa(xmmA, anyptr_gpB);
+ a.vmovdqa(anyptr_gpA, xmmB);
+ a.vmovdqa(ymmA, ymmB);
+ a.vmovdqa(ymmA, anyptr_gpB);
+ a.vmovdqa(anyptr_gpA, ymmB);
+ a.vmovdqu(xmmA, xmmB);
+ a.vmovdqu(xmmA, anyptr_gpB);
+ a.vmovdqu(anyptr_gpA, xmmB);
+ a.vmovdqu(ymmA, ymmB);
+ a.vmovdqu(ymmA, anyptr_gpB);
+ a.vmovdqu(anyptr_gpA, ymmB);
+ a.vmovhlps(xmmA, xmmB, xmmC);
+ a.vmovhpd(xmmA, xmmB, anyptr_gpC);
+ a.vmovhpd(anyptr_gpA, xmmB);
+ a.vmovhps(xmmA, xmmB, anyptr_gpC);
+ a.vmovhps(anyptr_gpA, xmmB);
+ a.vmovlhps(xmmA, xmmB, xmmC);
+ a.vmovlpd(xmmA, xmmB, anyptr_gpC);
+ a.vmovlpd(anyptr_gpA, xmmB);
+ a.vmovlps(xmmA, xmmB, anyptr_gpC);
+ a.vmovlps(anyptr_gpA, xmmB);
+ a.vmovmskpd(gzA, xmmB);
+ a.vmovmskpd(gzA, ymmB);
+ a.vmovmskps(gzA, xmmB);
+ a.vmovmskps(gzA, ymmB);
+ a.vmovntdq(anyptr_gpA, xmmB);
+ a.vmovntdq(anyptr_gpA, ymmB);
+ a.vmovntdqa(xmmA, anyptr_gpB);
+ a.vmovntpd(anyptr_gpA, xmmB);
+ a.vmovntpd(anyptr_gpA, ymmB);
+ a.vmovntps(anyptr_gpA, xmmB);
+ a.vmovntps(anyptr_gpA, ymmB);
+ a.vmovsd(xmmA, xmmB, xmmC);
+ a.vmovsd(xmmA, anyptr_gpB);
+ a.vmovsd(anyptr_gpA, xmmB);
+ a.vmovshdup(xmmA, xmmB);
+ a.vmovshdup(xmmA, anyptr_gpB);
+ a.vmovshdup(ymmA, ymmB);
+ a.vmovshdup(ymmA, anyptr_gpB);
+ a.vmovsldup(xmmA, xmmB);
+ a.vmovsldup(xmmA, anyptr_gpB);
+ a.vmovsldup(ymmA, ymmB);
+ a.vmovsldup(ymmA, anyptr_gpB);
+ a.vmovss(xmmA, xmmB, xmmC);
+ a.vmovss(xmmA, anyptr_gpB);
+ a.vmovss(anyptr_gpA, xmmB);
+ a.vmovupd(xmmA, xmmB);
+ a.vmovupd(xmmA, anyptr_gpB);
+ a.vmovupd(anyptr_gpA, xmmB);
+ a.vmovupd(ymmA, ymmB);
+ a.vmovupd(ymmA, anyptr_gpB);
+ a.vmovupd(anyptr_gpA, ymmB);
+ a.vmovups(xmmA, xmmB);
+ a.vmovups(xmmA, anyptr_gpB);
+ a.vmovups(anyptr_gpA, xmmB);
+ a.vmovups(ymmA, ymmB);
+ a.vmovups(ymmA, anyptr_gpB);
+ a.vmovups(anyptr_gpA, ymmB);
+ a.vmpsadbw(xmmA, xmmB, xmmC, 0);
+ a.vmpsadbw(xmmA, xmmB, anyptr_gpC, 0);
+ a.vmulpd(xmmA, xmmB, xmmC);
+ a.vmulpd(xmmA, xmmB, anyptr_gpC);
+ a.vmulpd(ymmA, ymmB, ymmC);
+ a.vmulpd(ymmA, ymmB, anyptr_gpC);
+ a.vmulps(xmmA, xmmB, xmmC);
+ a.vmulps(xmmA, xmmB, anyptr_gpC);
+ a.vmulps(ymmA, ymmB, ymmC);
+ a.vmulps(ymmA, ymmB, anyptr_gpC);
+ a.vmulsd(xmmA, xmmB, xmmC);
+ a.vmulsd(xmmA, xmmB, anyptr_gpC);
+ a.vmulss(xmmA, xmmB, xmmC);
+ a.vmulss(xmmA, xmmB, anyptr_gpC);
+ a.vorpd(xmmA, xmmB, xmmC);
+ a.vorpd(xmmA, xmmB, anyptr_gpC);
+ a.vorpd(ymmA, ymmB, ymmC);
+ a.vorpd(ymmA, ymmB, anyptr_gpC);
+ a.vorps(xmmA, xmmB, xmmC);
+ a.vorps(xmmA, xmmB, anyptr_gpC);
+ a.vorps(ymmA, ymmB, ymmC);
+ a.vorps(ymmA, ymmB, anyptr_gpC);
+ a.vpabsb(xmmA, xmmB);
+ a.vpabsb(xmmA, anyptr_gpB);
+ a.vpabsd(xmmA, xmmB);
+ a.vpabsd(xmmA, anyptr_gpB);
+ a.vpabsw(xmmA, xmmB);
+ a.vpabsw(xmmA, anyptr_gpB);
+ a.vpackssdw(xmmA, xmmB, xmmC);
+ a.vpackssdw(xmmA, xmmB, anyptr_gpC);
+ a.vpacksswb(xmmA, xmmB, xmmC);
+ a.vpacksswb(xmmA, xmmB, anyptr_gpC);
+ a.vpackusdw(xmmA, xmmB, xmmC);
+ a.vpackusdw(xmmA, xmmB, anyptr_gpC);
+ a.vpackuswb(xmmA, xmmB, xmmC);
+ a.vpackuswb(xmmA, xmmB, anyptr_gpC);
+ a.vpaddb(xmmA, xmmB, xmmC);
+ a.vpaddb(xmmA, xmmB, anyptr_gpC);
+ a.vpaddd(xmmA, xmmB, xmmC);
+ a.vpaddd(xmmA, xmmB, anyptr_gpC);
+ a.vpaddq(xmmA, xmmB, xmmC);
+ a.vpaddq(xmmA, xmmB, anyptr_gpC);
+ a.vpaddw(xmmA, xmmB, xmmC);
+ a.vpaddw(xmmA, xmmB, anyptr_gpC);
+ a.vpaddsb(xmmA, xmmB, xmmC);
+ a.vpaddsb(xmmA, xmmB, anyptr_gpC);
+ a.vpaddsw(xmmA, xmmB, xmmC);
+ a.vpaddsw(xmmA, xmmB, anyptr_gpC);
+ a.vpaddusb(xmmA, xmmB, xmmC);
+ a.vpaddusb(xmmA, xmmB, anyptr_gpC);
+ a.vpaddusw(xmmA, xmmB, xmmC);
+ a.vpaddusw(xmmA, xmmB, anyptr_gpC);
+ a.vpalignr(xmmA, xmmB, xmmC, 0);
+ a.vpalignr(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpand(xmmA, xmmB, xmmC);
+ a.vpand(xmmA, xmmB, anyptr_gpC);
+ a.vpandn(xmmA, xmmB, xmmC);
+ a.vpandn(xmmA, xmmB, anyptr_gpC);
+ a.vpavgb(xmmA, xmmB, xmmC);
+ a.vpavgb(xmmA, xmmB, anyptr_gpC);
+ a.vpavgw(xmmA, xmmB, xmmC);
+ a.vpavgw(xmmA, xmmB, anyptr_gpC);
+ a.vpblendvb(xmmA, xmmB, xmmC, xmmD);
+ a.vpblendvb(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpblendw(xmmA, xmmB, xmmC, 0);
+ a.vpblendw(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcmpeqb(xmmA, xmmB, xmmC);
+ a.vpcmpeqb(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpeqd(xmmA, xmmB, xmmC);
+ a.vpcmpeqd(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpeqq(xmmA, xmmB, xmmC);
+ a.vpcmpeqq(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpeqw(xmmA, xmmB, xmmC);
+ a.vpcmpeqw(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpgtb(xmmA, xmmB, xmmC);
+ a.vpcmpgtb(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpgtd(xmmA, xmmB, xmmC);
+ a.vpcmpgtd(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpgtq(xmmA, xmmB, xmmC);
+ a.vpcmpgtq(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpgtw(xmmA, xmmB, xmmC);
+ a.vpcmpgtw(xmmA, xmmB, anyptr_gpC);
+ a.vpcmpestri(xmmA, xmmB, 0);
+ a.vpcmpestri(xmmA, anyptr_gpB, 0);
+ a.vpcmpestrm(xmmA, xmmB, 0);
+ a.vpcmpestrm(xmmA, anyptr_gpB, 0);
+ a.vpcmpistri(xmmA, xmmB, 0);
+ a.vpcmpistri(xmmA, anyptr_gpB, 0);
+ a.vpcmpistrm(xmmA, xmmB, 0);
+ a.vpcmpistrm(xmmA, anyptr_gpB, 0);
+ a.vpermilpd(xmmA, xmmB, xmmC);
+ a.vpermilpd(xmmA, xmmB, anyptr_gpC);
+ a.vpermilpd(ymmA, ymmB, ymmC);
+ a.vpermilpd(ymmA, ymmB, anyptr_gpC);
+ a.vpermilpd(xmmA, xmmB, 0);
+ a.vpermilpd(xmmA, anyptr_gpB, 0);
+ a.vpermilpd(ymmA, ymmB, 0);
+ a.vpermilpd(ymmA, anyptr_gpB, 0);
+ a.vpermilps(xmmA, xmmB, xmmC);
+ a.vpermilps(xmmA, xmmB, anyptr_gpC);
+ a.vpermilps(ymmA, ymmB, ymmC);
+ a.vpermilps(ymmA, ymmB, anyptr_gpC);
+ a.vpermilps(xmmA, xmmB, 0);
+ a.vpermilps(xmmA, anyptr_gpB, 0);
+ a.vpermilps(ymmA, ymmB, 0);
+ a.vpermilps(ymmA, anyptr_gpB, 0);
+ a.vperm2f128(ymmA, ymmB, ymmC, 0);
+ a.vperm2f128(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpextrb(gzA, xmmB, 0);
+ a.vpextrb(anyptr_gpA, xmmB, 0);
+ a.vpextrd(gzA, xmmB, 0);
+ a.vpextrd(anyptr_gpA, xmmB, 0);
+ if (isX64) a.vpextrq(gzA, xmmB, 0);
+ if (isX64) a.vpextrq(anyptr_gpA, xmmB, 0);
+ a.vpextrw(gzA, xmmB, 0);
+ a.vpextrw(anyptr_gpA, xmmB, 0);
+ a.vphaddd(xmmA, xmmB, xmmC);
+ a.vphaddd(xmmA, xmmB, anyptr_gpC);
+ a.vphaddsw(xmmA, xmmB, xmmC);
+ a.vphaddsw(xmmA, xmmB, anyptr_gpC);
+ a.vphaddw(xmmA, xmmB, xmmC);
+ a.vphaddw(xmmA, xmmB, anyptr_gpC);
+ a.vphminposuw(xmmA, xmmB);
+ a.vphminposuw(xmmA, anyptr_gpB);
+ a.vphsubd(xmmA, xmmB, xmmC);
+ a.vphsubd(xmmA, xmmB, anyptr_gpC);
+ a.vphsubsw(xmmA, xmmB, xmmC);
+ a.vphsubsw(xmmA, xmmB, anyptr_gpC);
+ a.vphsubw(xmmA, xmmB, xmmC);
+ a.vphsubw(xmmA, xmmB, anyptr_gpC);
+ a.vpinsrb(xmmA, xmmB, gzC, 0);
+ a.vpinsrb(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpinsrd(xmmA, xmmB, gzC, 0);
+ a.vpinsrd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpinsrw(xmmA, xmmB, gzC, 0);
+ a.vpinsrw(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpmaddubsw(xmmA, xmmB, xmmC);
+ a.vpmaddubsw(xmmA, xmmB, anyptr_gpC);
+ a.vpmaddwd(xmmA, xmmB, xmmC);
+ a.vpmaddwd(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxsb(xmmA, xmmB, xmmC);
+ a.vpmaxsb(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxsd(xmmA, xmmB, xmmC);
+ a.vpmaxsd(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxsw(xmmA, xmmB, xmmC);
+ a.vpmaxsw(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxub(xmmA, xmmB, xmmC);
+ a.vpmaxub(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxud(xmmA, xmmB, xmmC);
+ a.vpmaxud(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxuw(xmmA, xmmB, xmmC);
+ a.vpmaxuw(xmmA, xmmB, anyptr_gpC);
+ a.vpminsb(xmmA, xmmB, xmmC);
+ a.vpminsb(xmmA, xmmB, anyptr_gpC);
+ a.vpminsd(xmmA, xmmB, xmmC);
+ a.vpminsd(xmmA, xmmB, anyptr_gpC);
+ a.vpminsw(xmmA, xmmB, xmmC);
+ a.vpminsw(xmmA, xmmB, anyptr_gpC);
+ a.vpminub(xmmA, xmmB, xmmC);
+ a.vpminub(xmmA, xmmB, anyptr_gpC);
+ a.vpminud(xmmA, xmmB, xmmC);
+ a.vpminud(xmmA, xmmB, anyptr_gpC);
+ a.vpminuw(xmmA, xmmB, xmmC);
+ a.vpminuw(xmmA, xmmB, anyptr_gpC);
+ a.vpmovmskb(gzA, xmmB);
+ a.vpmovsxbd(xmmA, xmmB);
+ a.vpmovsxbd(xmmA, anyptr_gpB);
+ a.vpmovsxbq(xmmA, xmmB);
+ a.vpmovsxbq(xmmA, anyptr_gpB);
+ a.vpmovsxbw(xmmA, xmmB);
+ a.vpmovsxbw(xmmA, anyptr_gpB);
+ a.vpmovsxdq(xmmA, xmmB);
+ a.vpmovsxdq(xmmA, anyptr_gpB);
+ a.vpmovsxwd(xmmA, xmmB);
+ a.vpmovsxwd(xmmA, anyptr_gpB);
+ a.vpmovsxwq(xmmA, xmmB);
+ a.vpmovsxwq(xmmA, anyptr_gpB);
+ a.vpmovzxbd(xmmA, xmmB);
+ a.vpmovzxbd(xmmA, anyptr_gpB);
+ a.vpmovzxbq(xmmA, xmmB);
+ a.vpmovzxbq(xmmA, anyptr_gpB);
+ a.vpmovzxbw(xmmA, xmmB);
+ a.vpmovzxbw(xmmA, anyptr_gpB);
+ a.vpmovzxdq(xmmA, xmmB);
+ a.vpmovzxdq(xmmA, anyptr_gpB);
+ a.vpmovzxwd(xmmA, xmmB);
+ a.vpmovzxwd(xmmA, anyptr_gpB);
+ a.vpmovzxwq(xmmA, xmmB);
+ a.vpmovzxwq(xmmA, anyptr_gpB);
+ a.vpmuldq(xmmA, xmmB, xmmC);
+ a.vpmuldq(xmmA, xmmB, anyptr_gpC);
+ a.vpmulhrsw(xmmA, xmmB, xmmC);
+ a.vpmulhrsw(xmmA, xmmB, anyptr_gpC);
+ a.vpmulhuw(xmmA, xmmB, xmmC);
+ a.vpmulhuw(xmmA, xmmB, anyptr_gpC);
+ a.vpmulhw(xmmA, xmmB, xmmC);
+ a.vpmulhw(xmmA, xmmB, anyptr_gpC);
+ a.vpmulld(xmmA, xmmB, xmmC);
+ a.vpmulld(xmmA, xmmB, anyptr_gpC);
+ a.vpmullw(xmmA, xmmB, xmmC);
+ a.vpmullw(xmmA, xmmB, anyptr_gpC);
+ a.vpmuludq(xmmA, xmmB, xmmC);
+ a.vpmuludq(xmmA, xmmB, anyptr_gpC);
+ a.vpor(xmmA, xmmB, xmmC);
+ a.vpor(xmmA, xmmB, anyptr_gpC);
+ a.vpsadbw(xmmA, xmmB, xmmC);
+ a.vpsadbw(xmmA, xmmB, anyptr_gpC);
+ a.vpshufb(xmmA, xmmB, xmmC);
+ a.vpshufb(xmmA, xmmB, anyptr_gpC);
+ a.vpshufd(xmmA, xmmB, 0);
+ a.vpshufd(xmmA, anyptr_gpB, 0);
+ a.vpshufhw(xmmA, xmmB, 0);
+ a.vpshufhw(xmmA, anyptr_gpB, 0);
+ a.vpshuflw(xmmA, xmmB, 0);
+ a.vpshuflw(xmmA, anyptr_gpB, 0);
+ a.vpsignb(xmmA, xmmB, xmmC);
+ a.vpsignb(xmmA, xmmB, anyptr_gpC);
+ a.vpsignd(xmmA, xmmB, xmmC);
+ a.vpsignd(xmmA, xmmB, anyptr_gpC);
+ a.vpsignw(xmmA, xmmB, xmmC);
+ a.vpsignw(xmmA, xmmB, anyptr_gpC);
+ a.vpslld(xmmA, xmmB, xmmC);
+ a.vpslld(xmmA, xmmB, anyptr_gpC);
+ a.vpslld(xmmA, xmmB, 0);
+ a.vpslldq(xmmA, xmmB, 0);
+ a.vpsllq(xmmA, xmmB, xmmC);
+ a.vpsllq(xmmA, xmmB, anyptr_gpC);
+ a.vpsllq(xmmA, xmmB, 0);
+ a.vpsllw(xmmA, xmmB, xmmC);
+ a.vpsllw(xmmA, xmmB, anyptr_gpC);
+ a.vpsllw(xmmA, xmmB, 0);
+ a.vpsrad(xmmA, xmmB, xmmC);
+ a.vpsrad(xmmA, xmmB, anyptr_gpC);
+ a.vpsrad(xmmA, xmmB, 0);
+ a.vpsraw(xmmA, xmmB, xmmC);
+ a.vpsraw(xmmA, xmmB, anyptr_gpC);
+ a.vpsraw(xmmA, xmmB, 0);
+ a.vpsrld(xmmA, xmmB, xmmC);
+ a.vpsrld(xmmA, xmmB, anyptr_gpC);
+ a.vpsrld(xmmA, xmmB, 0);
+ a.vpsrldq(xmmA, xmmB, 0);
+ a.vpsrlq(xmmA, xmmB, xmmC);
+ a.vpsrlq(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlq(xmmA, xmmB, 0);
+ a.vpsrlw(xmmA, xmmB, xmmC);
+ a.vpsrlw(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlw(xmmA, xmmB, 0);
+ a.vpsubb(xmmA, xmmB, xmmC);
+ a.vpsubb(xmmA, xmmB, anyptr_gpC);
+ a.vpsubd(xmmA, xmmB, xmmC);
+ a.vpsubd(xmmA, xmmB, anyptr_gpC);
+ a.vpsubq(xmmA, xmmB, xmmC);
+ a.vpsubq(xmmA, xmmB, anyptr_gpC);
+ a.vpsubw(xmmA, xmmB, xmmC);
+ a.vpsubw(xmmA, xmmB, anyptr_gpC);
+ a.vpsubsb(xmmA, xmmB, xmmC);
+ a.vpsubsb(xmmA, xmmB, anyptr_gpC);
+ a.vpsubsw(xmmA, xmmB, xmmC);
+ a.vpsubsw(xmmA, xmmB, anyptr_gpC);
+ a.vpsubusb(xmmA, xmmB, xmmC);
+ a.vpsubusb(xmmA, xmmB, anyptr_gpC);
+ a.vpsubusw(xmmA, xmmB, xmmC);
+ a.vpsubusw(xmmA, xmmB, anyptr_gpC);
+ a.vptest(xmmA, xmmB);
+ a.vptest(xmmA, anyptr_gpB);
+ a.vptest(ymmA, ymmB);
+ a.vptest(ymmA, anyptr_gpB);
+ a.vpunpckhbw(xmmA, xmmB, xmmC);
+ a.vpunpckhbw(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckhdq(xmmA, xmmB, xmmC);
+ a.vpunpckhdq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckhqdq(xmmA, xmmB, xmmC);
+ a.vpunpckhqdq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckhwd(xmmA, xmmB, xmmC);
+ a.vpunpckhwd(xmmA, xmmB, anyptr_gpC);
+ a.vpunpcklbw(xmmA, xmmB, xmmC);
+ a.vpunpcklbw(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckldq(xmmA, xmmB, xmmC);
+ a.vpunpckldq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpcklqdq(xmmA, xmmB, xmmC);
+ a.vpunpcklqdq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpcklwd(xmmA, xmmB, xmmC);
+ a.vpunpcklwd(xmmA, xmmB, anyptr_gpC);
+ a.vpxor(xmmA, xmmB, xmmC);
+ a.vpxor(xmmA, xmmB, anyptr_gpC);
+ a.vrcpps(xmmA, xmmB);
+ a.vrcpps(xmmA, anyptr_gpB);
+ a.vrcpps(ymmA, ymmB);
+ a.vrcpps(ymmA, anyptr_gpB);
+ a.vrcpss(xmmA, xmmB, xmmC);
+ a.vrcpss(xmmA, xmmB, anyptr_gpC);
+ a.vrsqrtps(xmmA, xmmB);
+ a.vrsqrtps(xmmA, anyptr_gpB);
+ a.vrsqrtps(ymmA, ymmB);
+ a.vrsqrtps(ymmA, anyptr_gpB);
+ a.vrsqrtss(xmmA, xmmB, xmmC);
+ a.vrsqrtss(xmmA, xmmB, anyptr_gpC);
+ a.vroundpd(xmmA, xmmB, 0);
+ a.vroundpd(xmmA, anyptr_gpB, 0);
+ a.vroundpd(ymmA, ymmB, 0);
+ a.vroundpd(ymmA, anyptr_gpB, 0);
+ a.vroundps(xmmA, xmmB, 0);
+ a.vroundps(xmmA, anyptr_gpB, 0);
+ a.vroundps(ymmA, ymmB, 0);
+ a.vroundps(ymmA, anyptr_gpB, 0);
+ a.vroundsd(xmmA, xmmB, xmmC, 0);
+ a.vroundsd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vroundss(xmmA, xmmB, xmmC, 0);
+ a.vroundss(xmmA, xmmB, anyptr_gpC, 0);
+ a.vshufpd(xmmA, xmmB, xmmC, 0);
+ a.vshufpd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vshufpd(ymmA, ymmB, ymmC, 0);
+ a.vshufpd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vshufps(xmmA, xmmB, xmmC, 0);
+ a.vshufps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vshufps(ymmA, ymmB, ymmC, 0);
+ a.vshufps(ymmA, ymmB, anyptr_gpC, 0);
+ a.vsqrtpd(xmmA, xmmB);
+ a.vsqrtpd(xmmA, anyptr_gpB);
+ a.vsqrtpd(ymmA, ymmB);
+ a.vsqrtpd(ymmA, anyptr_gpB);
+ a.vsqrtps(xmmA, xmmB);
+ a.vsqrtps(xmmA, anyptr_gpB);
+ a.vsqrtps(ymmA, ymmB);
+ a.vsqrtps(ymmA, anyptr_gpB);
+ a.vsqrtsd(xmmA, xmmB, xmmC);
+ a.vsqrtsd(xmmA, xmmB, anyptr_gpC);
+ a.vsqrtss(xmmA, xmmB, xmmC);
+ a.vsqrtss(xmmA, xmmB, anyptr_gpC);
+ a.vstmxcsr(anyptr_gpA);
+ a.vsubpd(xmmA, xmmB, xmmC);
+ a.vsubpd(xmmA, xmmB, anyptr_gpC);
+ a.vsubpd(ymmA, ymmB, ymmC);
+ a.vsubpd(ymmA, ymmB, anyptr_gpC);
+ a.vsubps(xmmA, xmmB, xmmC);
+ a.vsubps(xmmA, xmmB, anyptr_gpC);
+ a.vsubps(ymmA, ymmB, ymmC);
+ a.vsubps(ymmA, ymmB, anyptr_gpC);
+ a.vsubsd(xmmA, xmmB, xmmC);
+ a.vsubsd(xmmA, xmmB, anyptr_gpC);
+ a.vsubss(xmmA, xmmB, xmmC);
+ a.vsubss(xmmA, xmmB, anyptr_gpC);
+ a.vtestps(xmmA, xmmB);
+ a.vtestps(xmmA, anyptr_gpB);
+ a.vtestps(ymmA, ymmB);
+ a.vtestps(ymmA, anyptr_gpB);
+ a.vtestpd(xmmA, xmmB);
+ a.vtestpd(xmmA, anyptr_gpB);
+ a.vtestpd(ymmA, ymmB);
+ a.vtestpd(ymmA, anyptr_gpB);
+ a.vucomisd(xmmA, xmmB);
+ a.vucomisd(xmmA, anyptr_gpB);
+ a.vucomiss(xmmA, xmmB);
+ a.vucomiss(xmmA, anyptr_gpB);
+ a.vunpckhpd(xmmA, xmmB, xmmC);
+ a.vunpckhpd(xmmA, xmmB, anyptr_gpC);
+ a.vunpckhpd(ymmA, ymmB, ymmC);
+ a.vunpckhpd(ymmA, ymmB, anyptr_gpC);
+ a.vunpckhps(xmmA, xmmB, xmmC);
+ a.vunpckhps(xmmA, xmmB, anyptr_gpC);
+ a.vunpckhps(ymmA, ymmB, ymmC);
+ a.vunpckhps(ymmA, ymmB, anyptr_gpC);
+ a.vunpcklpd(xmmA, xmmB, xmmC);
+ a.vunpcklpd(xmmA, xmmB, anyptr_gpC);
+ a.vunpcklpd(ymmA, ymmB, ymmC);
+ a.vunpcklpd(ymmA, ymmB, anyptr_gpC);
+ a.vunpcklps(xmmA, xmmB, xmmC);
+ a.vunpcklps(xmmA, xmmB, anyptr_gpC);
+ a.vunpcklps(ymmA, ymmB, ymmC);
+ a.vunpcklps(ymmA, ymmB, anyptr_gpC);
+ a.vxorpd(xmmA, xmmB, xmmC);
+ a.vxorpd(xmmA, xmmB, anyptr_gpC);
+ a.vxorpd(ymmA, ymmB, ymmC);
+ a.vxorpd(ymmA, ymmB, anyptr_gpC);
+ a.vxorps(xmmA, xmmB, xmmC);
+ a.vxorps(xmmA, xmmB, anyptr_gpC);
+ a.vxorps(ymmA, ymmB, ymmC);
+ a.vxorps(ymmA, ymmB, anyptr_gpC);
+ a.vzeroall();
+ a.vex3().vzeroall();
+ a.vzeroupper();
+ a.vex3().vzeroupper();
+
+ // AVX+AESNI.
+ a.nop();
+
+ a.vaesdec(xmmA, xmmB, xmmC);
+ a.vaesdec(xmmA, xmmB, anyptr_gpC);
+ a.vaesdeclast(xmmA, xmmB, xmmC);
+ a.vaesdeclast(xmmA, xmmB, anyptr_gpC);
+ a.vaesenc(xmmA, xmmB, xmmC);
+ a.vaesenc(xmmA, xmmB, anyptr_gpC);
+ a.vaesenclast(xmmA, xmmB, xmmC);
+ a.vaesenclast(xmmA, xmmB, anyptr_gpC);
+ a.vaesimc(xmmA, xmmB);
+ a.vaesimc(xmmA, anyptr_gpB);
+ a.vaeskeygenassist(xmmA, xmmB, 0);
+ a.vaeskeygenassist(xmmA, anyptr_gpB, 0);
+
+ // AVX+PCLMULQDQ.
+ a.nop();
+
+ a.vpclmulqdq(xmmA, xmmB, xmmC, 0);
+ a.vpclmulqdq(xmmA, xmmB, anyptr_gpC, 0);
+
+ // AVX2.
+ a.nop();
+
+ a.vbroadcasti128(ymmA, anyptr_gpB);
+ a.vbroadcastsd(ymmA, xmmB);
+ a.vbroadcastss(xmmA, xmmB);
+ a.vbroadcastss(ymmA, xmmB);
+ a.vextracti128(xmmA, ymmB, 0);
+ a.vextracti128(anyptr_gpA, ymmB, 0);
+ a.vgatherdpd(xmmA, vx_ptr, xmmC);
+ a.vgatherdpd(ymmA, vx_ptr, ymmC);
+ a.vgatherdps(xmmA, vx_ptr, xmmC);
+ a.vgatherdps(ymmA, vy_ptr, ymmC);
+ a.vgatherqpd(xmmA, vx_ptr, xmmC);
+ a.vgatherqpd(ymmA, vy_ptr, ymmC);
+ a.vgatherqps(xmmA, vx_ptr, xmmC);
+ a.vgatherqps(xmmA, vy_ptr, xmmC);
+ a.vinserti128(ymmA, ymmB, xmmC, 0);
+ a.vinserti128(ymmA, ymmB, anyptr_gpC, 0);
+ a.vmovntdqa(ymmA, anyptr_gpB);
+ a.vmpsadbw(ymmA, ymmB, ymmC, 0);
+ a.vmpsadbw(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpabsb(ymmA, ymmB);
+ a.vpabsb(ymmA, anyptr_gpB);
+ a.vpabsd(ymmA, ymmB);
+ a.vpabsd(ymmA, anyptr_gpB);
+ a.vpabsw(ymmA, ymmB);
+ a.vpabsw(ymmA, anyptr_gpB);
+ a.vpackssdw(ymmA, ymmB, ymmC);
+ a.vpackssdw(ymmA, ymmB, anyptr_gpC);
+ a.vpacksswb(ymmA, ymmB, ymmC);
+ a.vpacksswb(ymmA, ymmB, anyptr_gpC);
+ a.vpackusdw(ymmA, ymmB, ymmC);
+ a.vpackusdw(ymmA, ymmB, anyptr_gpC);
+ a.vpackuswb(ymmA, ymmB, ymmC);
+ a.vpackuswb(ymmA, ymmB, anyptr_gpC);
+ a.vpaddb(ymmA, ymmB, ymmC);
+ a.vpaddb(ymmA, ymmB, anyptr_gpC);
+ a.vpaddd(ymmA, ymmB, ymmC);
+ a.vpaddd(ymmA, ymmB, anyptr_gpC);
+ a.vpaddq(ymmA, ymmB, ymmC);
+ a.vpaddq(ymmA, ymmB, anyptr_gpC);
+ a.vpaddw(ymmA, ymmB, ymmC);
+ a.vpaddw(ymmA, ymmB, anyptr_gpC);
+ a.vpaddsb(ymmA, ymmB, ymmC);
+ a.vpaddsb(ymmA, ymmB, anyptr_gpC);
+ a.vpaddsw(ymmA, ymmB, ymmC);
+ a.vpaddsw(ymmA, ymmB, anyptr_gpC);
+ a.vpaddusb(ymmA, ymmB, ymmC);
+ a.vpaddusb(ymmA, ymmB, anyptr_gpC);
+ a.vpaddusw(ymmA, ymmB, ymmC);
+ a.vpaddusw(ymmA, ymmB, anyptr_gpC);
+ a.vpalignr(ymmA, ymmB, ymmC, 0);
+ a.vpalignr(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpand(ymmA, ymmB, ymmC);
+ a.vpand(ymmA, ymmB, anyptr_gpC);
+ a.vpandn(ymmA, ymmB, ymmC);
+ a.vpandn(ymmA, ymmB, anyptr_gpC);
+ a.vpavgb(ymmA, ymmB, ymmC);
+ a.vpavgb(ymmA, ymmB, anyptr_gpC);
+ a.vpavgw(ymmA, ymmB, ymmC);
+ a.vpavgw(ymmA, ymmB, anyptr_gpC);
+ a.vpblendd(xmmA, xmmB, xmmC, 0);
+ a.vpblendd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpblendd(ymmA, ymmB, ymmC, 0);
+ a.vpblendd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpblendvb(ymmA, ymmB, ymmC, ymmD);
+ a.vpblendvb(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vpblendw(ymmA, ymmB, ymmC, 0);
+ a.vpblendw(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpbroadcastb(xmmA, xmmB);
+ a.vpbroadcastb(xmmA, anyptr_gpB);
+ a.vpbroadcastb(ymmA, xmmB);
+ a.vpbroadcastb(ymmA, anyptr_gpB);
+ a.vpbroadcastd(xmmA, xmmB);
+ a.vpbroadcastd(xmmA, anyptr_gpB);
+ a.vpbroadcastd(ymmA, xmmB);
+ a.vpbroadcastd(ymmA, anyptr_gpB);
+ a.vpbroadcastq(xmmA, xmmB);
+ a.vpbroadcastq(xmmA, anyptr_gpB);
+ a.vpbroadcastq(ymmA, xmmB);
+ a.vpbroadcastq(ymmA, anyptr_gpB);
+ a.vpbroadcastw(xmmA, xmmB);
+ a.vpbroadcastw(xmmA, anyptr_gpB);
+ a.vpbroadcastw(ymmA, xmmB);
+ a.vpbroadcastw(ymmA, anyptr_gpB);
+ a.vpcmpeqb(ymmA, ymmB, ymmC);
+ a.vpcmpeqb(ymmA, ymmB, anyptr_gpC);
+ a.vpcmpeqd(ymmA, ymmB, ymmC);
+ a.vpcmpeqd(ymmA, ymmB, anyptr_gpC);
+ a.vpcmpeqq(ymmA, ymmB, ymmC);
+ a.vpcmpeqq(ymmA, ymmB, anyptr_gpC);
+ a.vpcmpeqw(ymmA, ymmB, ymmC);
+ a.vpcmpeqw(ymmA, ymmB, anyptr_gpC);
+ a.vpcmpgtb(ymmA, ymmB, ymmC);
+ a.vpcmpgtb(ymmA, ymmB, anyptr_gpC);
+ a.vpcmpgtd(ymmA, ymmB, ymmC);
+ a.vpcmpgtd(ymmA, ymmB, anyptr_gpC);
+ a.vpcmpgtq(ymmA, ymmB, ymmC);
+ a.vpcmpgtq(ymmA, ymmB, anyptr_gpC);
+ a.vpcmpgtw(ymmA, ymmB, ymmC);
+ a.vpcmpgtw(ymmA, ymmB, anyptr_gpC);
+ a.vperm2i128(ymmA, ymmB, ymmC, 0);
+ a.vperm2i128(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpermd(ymmA, ymmB, ymmC);
+ a.vpermd(ymmA, ymmB, anyptr_gpC);
+ a.vpermps(ymmA, ymmB, ymmC);
+ a.vpermps(ymmA, ymmB, anyptr_gpC);
+ a.vpermpd(ymmA, ymmB, 0);
+ a.vpermpd(ymmA, anyptr_gpB, 0);
+ a.vpermq(ymmA, ymmB, 0);
+ a.vpermq(ymmA, anyptr_gpB, 0);
+ a.vpgatherdd(xmmA, vx_ptr, xmmC);
+ a.vpgatherdd(ymmA, vy_ptr, ymmC);
+ a.vpgatherdq(xmmA, vx_ptr, xmmC);
+ a.vpgatherdq(ymmA, vx_ptr, ymmC);
+ a.vpgatherqd(xmmA, vx_ptr, xmmC);
+ a.vpgatherqd(xmmA, vy_ptr, xmmC);
+ a.vpgatherqq(xmmA, vx_ptr, xmmC);
+ a.vpgatherqq(ymmA, vy_ptr, ymmC);
+ a.vpmovmskb(gzA, ymmB);
+ a.vpmovsxbd(ymmA, anyptr_gpB);
+ a.vpmovsxbd(ymmA, xmmB);
+ a.vpmovsxbq(ymmA, anyptr_gpB);
+ a.vpmovsxbq(ymmA, xmmB);
+ a.vpmovsxbw(ymmA, anyptr_gpB);
+ a.vpmovsxbw(ymmA, xmmB);
+ a.vpmovsxdq(ymmA, anyptr_gpB);
+ a.vpmovsxdq(ymmA, xmmB);
+ a.vpmovsxwd(ymmA, anyptr_gpB);
+ a.vpmovsxwd(ymmA, xmmB);
+ a.vpmovsxwq(ymmA, anyptr_gpB);
+ a.vpmovsxwq(ymmA, xmmB);
+ a.vpmovzxbd(ymmA, anyptr_gpB);
+ a.vpmovzxbd(ymmA, xmmB);
+ a.vpmovzxbq(ymmA, anyptr_gpB);
+ a.vpmovzxbq(ymmA, xmmB);
+ a.vpmovzxbw(ymmA, anyptr_gpB);
+ a.vpmovzxbw(ymmA, xmmB);
+ a.vpmovzxdq(ymmA, anyptr_gpB);
+ a.vpmovzxdq(ymmA, xmmB);
+ a.vpmovzxwd(ymmA, anyptr_gpB);
+ a.vpmovzxwd(ymmA, xmmB);
+ a.vpmovzxwq(ymmA, anyptr_gpB);
+ a.vpmovzxwq(ymmA, xmmB);
+ a.vpshufd(ymmA, anyptr_gpB, 0);
+ a.vpshufd(ymmA, ymmB, 0);
+ a.vpshufhw(ymmA, anyptr_gpB, 0);
+ a.vpshufhw(ymmA, ymmB, 0);
+ a.vpshuflw(ymmA, anyptr_gpB, 0);
+ a.vpshuflw(ymmA, ymmB, 0);
+ a.vpslld(ymmA, ymmB, 0);
+ a.vpslldq(ymmA, ymmB, 0);
+ a.vpsllq(ymmA, ymmB, 0);
+ a.vpsllw(ymmA, ymmB, 0);
+ a.vpsrad(ymmA, ymmB, 0);
+ a.vpsraw(ymmA, ymmB, 0);
+ a.vpsrld(ymmA, ymmB, 0);
+ a.vpsrldq(ymmA, ymmB, 0);
+ a.vpsrlq(ymmA, ymmB, 0);
+ a.vpsrlw(ymmA, ymmB, 0);
+ a.vphaddd(ymmA, ymmB, anyptr_gpC);
+ a.vphaddd(ymmA, ymmB, ymmC);
+ a.vphaddsw(ymmA, ymmB, anyptr_gpC);
+ a.vphaddsw(ymmA, ymmB, ymmC);
+ a.vphaddw(ymmA, ymmB, anyptr_gpC);
+ a.vphaddw(ymmA, ymmB, ymmC);
+ a.vphsubd(ymmA, ymmB, anyptr_gpC);
+ a.vphsubd(ymmA, ymmB, ymmC);
+ a.vphsubsw(ymmA, ymmB, anyptr_gpC);
+ a.vphsubsw(ymmA, ymmB, ymmC);
+ a.vphsubw(ymmA, ymmB, anyptr_gpC);
+ a.vphsubw(ymmA, ymmB, ymmC);
+ a.vpmaddubsw(ymmA, ymmB, anyptr_gpC);
+ a.vpmaddubsw(ymmA, ymmB, ymmC);
+ a.vpmaddwd(ymmA, ymmB, anyptr_gpC);
+ a.vpmaddwd(ymmA, ymmB, ymmC);
+ a.vpmaskmovd(anyptr_gpA, xmmB, xmmC);
+ a.vpmaskmovd(anyptr_gpA, ymmB, ymmC);
+ a.vpmaskmovd(xmmA, xmmB, anyptr_gpC);
+ a.vpmaskmovd(ymmA, ymmB, anyptr_gpC);
+ a.vpmaskmovq(anyptr_gpA, xmmB, xmmC);
+ a.vpmaskmovq(anyptr_gpA, ymmB, ymmC);
+ a.vpmaskmovq(xmmA, xmmB, anyptr_gpC);
+ a.vpmaskmovq(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsb(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsb(ymmA, ymmB, ymmC);
+ a.vpmaxsd(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsd(ymmA, ymmB, ymmC);
+ a.vpmaxsw(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsw(ymmA, ymmB, ymmC);
+ a.vpmaxub(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxub(ymmA, ymmB, ymmC);
+ a.vpmaxud(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxud(ymmA, ymmB, ymmC);
+ a.vpmaxuw(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxuw(ymmA, ymmB, ymmC);
+ a.vpminsb(ymmA, ymmB, anyptr_gpC);
+ a.vpminsb(ymmA, ymmB, ymmC);
+ a.vpminsd(ymmA, ymmB, anyptr_gpC);
+ a.vpminsd(ymmA, ymmB, ymmC);
+ a.vpminsw(ymmA, ymmB, anyptr_gpC);
+ a.vpminsw(ymmA, ymmB, ymmC);
+ a.vpminub(ymmA, ymmB, anyptr_gpC);
+ a.vpminub(ymmA, ymmB, ymmC);
+ a.vpminud(ymmA, ymmB, anyptr_gpC);
+ a.vpminud(ymmA, ymmB, ymmC);
+ a.vpminuw(ymmA, ymmB, anyptr_gpC);
+ a.vpminuw(ymmA, ymmB, ymmC);
+ a.vpmuldq(ymmA, ymmB, anyptr_gpC);
+ a.vpmuldq(ymmA, ymmB, ymmC);
+ a.vpmulhrsw(ymmA, ymmB, anyptr_gpC);
+ a.vpmulhrsw(ymmA, ymmB, ymmC);
+ a.vpmulhuw(ymmA, ymmB, anyptr_gpC);
+ a.vpmulhuw(ymmA, ymmB, ymmC);
+ a.vpmulhw(ymmA, ymmB, anyptr_gpC);
+ a.vpmulhw(ymmA, ymmB, ymmC);
+ a.vpmulld(ymmA, ymmB, anyptr_gpC);
+ a.vpmulld(ymmA, ymmB, ymmC);
+ a.vpmullw(ymmA, ymmB, anyptr_gpC);
+ a.vpmullw(ymmA, ymmB, ymmC);
+ a.vpmuludq(ymmA, ymmB, anyptr_gpC);
+ a.vpmuludq(ymmA, ymmB, ymmC);
+ a.vpor(ymmA, ymmB, anyptr_gpC);
+ a.vpor(ymmA, ymmB, ymmC);
+ a.vpsadbw(ymmA, ymmB, anyptr_gpC);
+ a.vpsadbw(ymmA, ymmB, ymmC);
+ a.vpshufb(ymmA, ymmB, anyptr_gpC);
+ a.vpshufb(ymmA, ymmB, ymmC);
+ a.vpsignb(ymmA, ymmB, anyptr_gpC);
+ a.vpsignb(ymmA, ymmB, ymmC);
+ a.vpsignd(ymmA, ymmB, anyptr_gpC);
+ a.vpsignd(ymmA, ymmB, ymmC);
+ a.vpsignw(ymmA, ymmB, anyptr_gpC);
+ a.vpsignw(ymmA, ymmB, ymmC);
+ a.vpslld(ymmA, ymmB, anyptr_gpC);
+ a.vpslld(ymmA, ymmB, xmmC);
+ a.vpsllq(ymmA, ymmB, anyptr_gpC);
+ a.vpsllq(ymmA, ymmB, xmmC);
+ a.vpsllvd(xmmA, xmmB, anyptr_gpC);
+ a.vpsllvd(xmmA, xmmB, xmmC);
+ a.vpsllvd(ymmA, ymmB, anyptr_gpC);
+ a.vpsllvd(ymmA, ymmB, ymmC);
+ a.vpsllvq(xmmA, xmmB, anyptr_gpC);
+ a.vpsllvq(xmmA, xmmB, xmmC);
+ a.vpsllvq(ymmA, ymmB, anyptr_gpC);
+ a.vpsllvq(ymmA, ymmB, ymmC);
+ a.vpsllw(ymmA, ymmB, anyptr_gpC);
+ a.vpsllw(ymmA, ymmB, xmmC);
+ a.vpsrad(ymmA, ymmB, anyptr_gpC);
+ a.vpsrad(ymmA, ymmB, xmmC);
+ a.vpsravd(xmmA, xmmB, anyptr_gpC);
+ a.vpsravd(xmmA, xmmB, xmmC);
+ a.vpsravd(ymmA, ymmB, anyptr_gpC);
+ a.vpsravd(ymmA, ymmB, ymmC);
+ a.vpsraw(ymmA, ymmB, anyptr_gpC);
+ a.vpsraw(ymmA, ymmB, xmmC);
+ a.vpsrld(ymmA, ymmB, anyptr_gpC);
+ a.vpsrld(ymmA, ymmB, xmmC);
+ a.vpsrlq(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlq(ymmA, ymmB, xmmC);
+ a.vpsrlvd(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlvd(xmmA, xmmB, xmmC);
+ a.vpsrlvd(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlvd(ymmA, ymmB, ymmC);
+ a.vpsrlvq(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlvq(xmmA, xmmB, xmmC);
+ a.vpsrlvq(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlvq(ymmA, ymmB, ymmC);
+ a.vpsrlw(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlw(ymmA, ymmB, xmmC);
+ a.vpsubb(ymmA, ymmB, anyptr_gpC);
+ a.vpsubb(ymmA, ymmB, ymmC);
+ a.vpsubd(ymmA, ymmB, anyptr_gpC);
+ a.vpsubd(ymmA, ymmB, ymmC);
+ a.vpsubq(ymmA, ymmB, anyptr_gpC);
+ a.vpsubq(ymmA, ymmB, ymmC);
+ a.vpsubsb(ymmA, ymmB, anyptr_gpC);
+ a.vpsubsb(ymmA, ymmB, ymmC);
+ a.vpsubsw(ymmA, ymmB, anyptr_gpC);
+ a.vpsubsw(ymmA, ymmB, ymmC);
+ a.vpsubusb(ymmA, ymmB, anyptr_gpC);
+ a.vpsubusb(ymmA, ymmB, ymmC);
+ a.vpsubusw(ymmA, ymmB, anyptr_gpC);
+ a.vpsubusw(ymmA, ymmB, ymmC);
+ a.vpsubw(ymmA, ymmB, anyptr_gpC);
+ a.vpsubw(ymmA, ymmB, ymmC);
+ a.vpunpckhbw(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhbw(ymmA, ymmB, ymmC);
+ a.vpunpckhdq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhdq(ymmA, ymmB, ymmC);
+ a.vpunpckhqdq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhqdq(ymmA, ymmB, ymmC);
+ a.vpunpckhwd(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhwd(ymmA, ymmB, ymmC);
+ a.vpunpcklbw(ymmA, ymmB, anyptr_gpC);
+ a.vpunpcklbw(ymmA, ymmB, ymmC);
+ a.vpunpckldq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckldq(ymmA, ymmB, ymmC);
+ a.vpunpcklqdq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpcklqdq(ymmA, ymmB, ymmC);
+ a.vpunpcklwd(ymmA, ymmB, anyptr_gpC);
+ a.vpunpcklwd(ymmA, ymmB, ymmC);
+ a.vpxor(ymmA, ymmB, anyptr_gpC);
+ a.vpxor(ymmA, ymmB, ymmC);
+
+ // FMA3.
+ a.nop();
+
+ a.vfmadd132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd132pd(xmmA, xmmB, xmmC);
+ a.vfmadd132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd132pd(ymmA, ymmB, ymmC);
+ a.vfmadd132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd132ps(xmmA, xmmB, xmmC);
+ a.vfmadd132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd132ps(ymmA, ymmB, ymmC);
+ a.vfmadd132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd132sd(xmmA, xmmB, xmmC);
+ a.vfmadd132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd132ss(xmmA, xmmB, xmmC);
+ a.vfmadd213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213pd(xmmA, xmmB, xmmC);
+ a.vfmadd213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd213pd(ymmA, ymmB, ymmC);
+ a.vfmadd213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213ps(xmmA, xmmB, xmmC);
+ a.vfmadd213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd213ps(ymmA, ymmB, ymmC);
+ a.vfmadd213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213sd(xmmA, xmmB, xmmC);
+ a.vfmadd213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213ss(xmmA, xmmB, xmmC);
+ a.vfmadd231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231pd(xmmA, xmmB, xmmC);
+ a.vfmadd231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd231pd(ymmA, ymmB, ymmC);
+ a.vfmadd231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231ps(xmmA, xmmB, xmmC);
+ a.vfmadd231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd231ps(ymmA, ymmB, ymmC);
+ a.vfmadd231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231sd(xmmA, xmmB, xmmC);
+ a.vfmadd231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231ss(xmmA, xmmB, xmmC);
+ a.vfmaddsub132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub132pd(xmmA, xmmB, xmmC);
+ a.vfmaddsub132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub132pd(ymmA, ymmB, ymmC);
+ a.vfmaddsub132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub132ps(xmmA, xmmB, xmmC);
+ a.vfmaddsub132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub132ps(ymmA, ymmB, ymmC);
+ a.vfmaddsub213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub213pd(xmmA, xmmB, xmmC);
+ a.vfmaddsub213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub213pd(ymmA, ymmB, ymmC);
+ a.vfmaddsub213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub213ps(xmmA, xmmB, xmmC);
+ a.vfmaddsub213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub213ps(ymmA, ymmB, ymmC);
+ a.vfmaddsub231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub231pd(xmmA, xmmB, xmmC);
+ a.vfmaddsub231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub231pd(ymmA, ymmB, ymmC);
+ a.vfmaddsub231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub231ps(xmmA, xmmB, xmmC);
+ a.vfmaddsub231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub231ps(ymmA, ymmB, ymmC);
+ a.vfmsub132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub132pd(xmmA, xmmB, xmmC);
+ a.vfmsub132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub132pd(ymmA, ymmB, ymmC);
+ a.vfmsub132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub132ps(xmmA, xmmB, xmmC);
+ a.vfmsub132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub132ps(ymmA, ymmB, ymmC);
+ a.vfmsub132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub132sd(xmmA, xmmB, xmmC);
+ a.vfmsub132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub132ss(xmmA, xmmB, xmmC);
+ a.vfmsub213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213pd(xmmA, xmmB, xmmC);
+ a.vfmsub213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub213pd(ymmA, ymmB, ymmC);
+ a.vfmsub213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213ps(xmmA, xmmB, xmmC);
+ a.vfmsub213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub213ps(ymmA, ymmB, ymmC);
+ a.vfmsub213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213sd(xmmA, xmmB, xmmC);
+ a.vfmsub213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213ss(xmmA, xmmB, xmmC);
+ a.vfmsub231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231pd(xmmA, xmmB, xmmC);
+ a.vfmsub231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub231pd(ymmA, ymmB, ymmC);
+ a.vfmsub231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231ps(xmmA, xmmB, xmmC);
+ a.vfmsub231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub231ps(ymmA, ymmB, ymmC);
+ a.vfmsub231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231sd(xmmA, xmmB, xmmC);
+ a.vfmsub231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231ss(xmmA, xmmB, xmmC);
+ a.vfmsubadd132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd132pd(xmmA, xmmB, xmmC);
+ a.vfmsubadd132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd132pd(ymmA, ymmB, ymmC);
+ a.vfmsubadd132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd132ps(xmmA, xmmB, xmmC);
+ a.vfmsubadd132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd132ps(ymmA, ymmB, ymmC);
+ a.vfmsubadd213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd213pd(xmmA, xmmB, xmmC);
+ a.vfmsubadd213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd213pd(ymmA, ymmB, ymmC);
+ a.vfmsubadd213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd213ps(xmmA, xmmB, xmmC);
+ a.vfmsubadd213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd213ps(ymmA, ymmB, ymmC);
+ a.vfmsubadd231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd231pd(xmmA, xmmB, xmmC);
+ a.vfmsubadd231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd231pd(ymmA, ymmB, ymmC);
+ a.vfmsubadd231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd231ps(xmmA, xmmB, xmmC);
+ a.vfmsubadd231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd231ps(ymmA, ymmB, ymmC);
+ a.vfnmadd132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd132pd(xmmA, xmmB, xmmC);
+ a.vfnmadd132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd132pd(ymmA, ymmB, ymmC);
+ a.vfnmadd132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd132ps(xmmA, xmmB, xmmC);
+ a.vfnmadd132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd132ps(ymmA, ymmB, ymmC);
+ a.vfnmadd132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd132sd(xmmA, xmmB, xmmC);
+ a.vfnmadd132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd132ss(xmmA, xmmB, xmmC);
+ a.vfnmadd213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213pd(xmmA, xmmB, xmmC);
+ a.vfnmadd213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd213pd(ymmA, ymmB, ymmC);
+ a.vfnmadd213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213ps(xmmA, xmmB, xmmC);
+ a.vfnmadd213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd213ps(ymmA, ymmB, ymmC);
+ a.vfnmadd213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213sd(xmmA, xmmB, xmmC);
+ a.vfnmadd213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213ss(xmmA, xmmB, xmmC);
+ a.vfnmadd231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231pd(xmmA, xmmB, xmmC);
+ a.vfnmadd231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd231pd(ymmA, ymmB, ymmC);
+ a.vfnmadd231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231ps(xmmA, xmmB, xmmC);
+ a.vfnmadd231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd231ps(ymmA, ymmB, ymmC);
+ a.vfnmadd231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231sd(xmmA, xmmB, xmmC);
+ a.vfnmadd231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231ss(xmmA, xmmB, xmmC);
+ a.vfnmsub132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132pd(xmmA, xmmB, xmmC);
+ a.vfnmsub132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub132pd(ymmA, ymmB, ymmC);
+ a.vfnmsub132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132ps(xmmA, xmmB, xmmC);
+ a.vfnmsub132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub132ps(ymmA, ymmB, ymmC);
+ a.vfnmsub132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132sd(xmmA, xmmB, xmmC);
+ a.vfnmsub132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132ss(xmmA, xmmB, xmmC);
+ a.vfnmsub213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213pd(xmmA, xmmB, xmmC);
+ a.vfnmsub213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub213pd(ymmA, ymmB, ymmC);
+ a.vfnmsub213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213ps(xmmA, xmmB, xmmC);
+ a.vfnmsub213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub213ps(ymmA, ymmB, ymmC);
+ a.vfnmsub213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213sd(xmmA, xmmB, xmmC);
+ a.vfnmsub213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213ss(xmmA, xmmB, xmmC);
+ a.vfnmsub231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231pd(xmmA, xmmB, xmmC);
+ a.vfnmsub231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub231pd(ymmA, ymmB, ymmC);
+ a.vfnmsub231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231ps(xmmA, xmmB, xmmC);
+ a.vfnmsub231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub231ps(ymmA, ymmB, ymmC);
+ a.vfnmsub231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231sd(xmmA, xmmB, xmmC);
+ a.vfnmsub231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231ss(xmmA, xmmB, xmmC);
+
+ // FMA4.
+ a.nop();
+
+ a.vfmaddpd(xmmA, xmmB, xmmC, xmmD);
+ a.vfmaddpd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmaddpd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmaddpd(ymmA, ymmB, ymmC, ymmD);
+ a.vfmaddpd(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmaddpd(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmaddps(xmmA, xmmB, xmmC, xmmD);
+ a.vfmaddps(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmaddps(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmaddps(ymmA, ymmB, ymmC, ymmD);
+ a.vfmaddps(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmaddps(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmaddsd(xmmA, xmmB, xmmC, xmmD);
+ a.vfmaddsd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmaddsd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmaddss(xmmA, xmmB, xmmC, xmmD);
+ a.vfmaddss(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmaddss(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmaddsubpd(xmmA, xmmB, xmmC, xmmD);
+ a.vfmaddsubpd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmaddsubpd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmaddsubpd(ymmA, ymmB, ymmC, ymmD);
+ a.vfmaddsubpd(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmaddsubpd(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmaddsubps(xmmA, xmmB, xmmC, xmmD);
+ a.vfmaddsubps(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmaddsubps(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmaddsubps(ymmA, ymmB, ymmC, ymmD);
+ a.vfmaddsubps(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmaddsubps(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmsubaddpd(xmmA, xmmB, xmmC, xmmD);
+ a.vfmsubaddpd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmsubaddpd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmsubaddpd(ymmA, ymmB, ymmC, ymmD);
+ a.vfmsubaddpd(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmsubaddpd(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmsubaddps(xmmA, xmmB, xmmC, xmmD);
+ a.vfmsubaddps(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmsubaddps(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmsubaddps(ymmA, ymmB, ymmC, ymmD);
+ a.vfmsubaddps(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmsubaddps(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmsubpd(xmmA, xmmB, xmmC, xmmD);
+ a.vfmsubpd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmsubpd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmsubpd(ymmA, ymmB, ymmC, ymmD);
+ a.vfmsubpd(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmsubpd(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmsubps(xmmA, xmmB, xmmC, xmmD);
+ a.vfmsubps(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmsubps(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmsubps(ymmA, ymmB, ymmC, ymmD);
+ a.vfmsubps(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfmsubps(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfmsubsd(xmmA, xmmB, xmmC, xmmD);
+ a.vfmsubsd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmsubsd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfmsubss(xmmA, xmmB, xmmC, xmmD);
+ a.vfmsubss(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfmsubss(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmaddpd(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmaddpd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmaddpd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmaddpd(ymmA, ymmB, ymmC, ymmD);
+ a.vfnmaddpd(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfnmaddpd(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfnmaddps(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmaddps(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmaddps(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmaddps(ymmA, ymmB, ymmC, ymmD);
+ a.vfnmaddps(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfnmaddps(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfnmaddsd(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmaddsd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmaddsd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmaddss(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmaddss(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmaddss(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmsubpd(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmsubpd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmsubpd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmsubpd(ymmA, ymmB, ymmC, ymmD);
+ a.vfnmsubpd(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfnmsubpd(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfnmsubps(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmsubps(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmsubps(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmsubps(ymmA, ymmB, ymmC, ymmD);
+ a.vfnmsubps(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vfnmsubps(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vfnmsubsd(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmsubsd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmsubsd(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vfnmsubss(xmmA, xmmB, xmmC, xmmD);
+ a.vfnmsubss(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vfnmsubss(xmmA, xmmB, xmmC, anyptr_gpD);
+
+ // XOP.
+ a.nop();
+
+ a.vfrczpd(xmmA, xmmB);
+ a.vfrczpd(xmmA, anyptr_gpB);
+ a.vfrczpd(ymmA, ymmB);
+ a.vfrczpd(ymmA, anyptr_gpB);
+ a.vfrczps(xmmA, xmmB);
+ a.vfrczps(xmmA, anyptr_gpB);
+ a.vfrczps(ymmA, ymmB);
+ a.vfrczps(ymmA, anyptr_gpB);
+ a.vfrczsd(xmmA, xmmB);
+ a.vfrczsd(xmmA, anyptr_gpB);
+ a.vfrczss(xmmA, xmmB);
+ a.vfrczss(xmmA, anyptr_gpB);
+ a.vpcmov(xmmA, xmmB, xmmC, xmmD);
+ a.vpcmov(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpcmov(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vpcmov(ymmA, ymmB, ymmC, ymmD);
+ a.vpcmov(ymmA, ymmB, anyptr_gpC, ymmD);
+ a.vpcmov(ymmA, ymmB, ymmC, anyptr_gpD);
+ a.vpcomb(xmmA, xmmB, xmmC, 0);
+ a.vpcomb(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcomd(xmmA, xmmB, xmmC, 0);
+ a.vpcomd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcomq(xmmA, xmmB, xmmC, 0);
+ a.vpcomq(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcomw(xmmA, xmmB, xmmC, 0);
+ a.vpcomw(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcomub(xmmA, xmmB, xmmC, 0);
+ a.vpcomub(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcomud(xmmA, xmmB, xmmC, 0);
+ a.vpcomud(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcomuq(xmmA, xmmB, xmmC, 0);
+ a.vpcomuq(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpcomuw(xmmA, xmmB, xmmC, 0);
+ a.vpcomuw(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpermil2pd(xmmA, xmmB, xmmC, xmmD, 0);
+ a.vpermil2pd(xmmA, xmmB, anyptr_gpC, xmmD, 0);
+ a.vpermil2pd(xmmA, xmmB, xmmC, anyptr_gpD, 0);
+ a.vpermil2pd(ymmA, ymmB, ymmC, ymmD, 0);
+ a.vpermil2pd(ymmA, ymmB, anyptr_gpC, ymmD, 0);
+ a.vpermil2pd(ymmA, ymmB, ymmC, anyptr_gpD, 0);
+ a.vpermil2ps(xmmA, xmmB, xmmC, xmmD, 0);
+ a.vpermil2ps(xmmA, xmmB, anyptr_gpC, xmmD, 0);
+ a.vpermil2ps(xmmA, xmmB, xmmC, anyptr_gpD, 0);
+ a.vpermil2ps(ymmA, ymmB, ymmC, ymmD, 0);
+ a.vpermil2ps(ymmA, ymmB, anyptr_gpC, ymmD, 0);
+ a.vpermil2ps(ymmA, ymmB, ymmC, anyptr_gpD, 0);
+ a.vphaddbd(xmmA, xmmB);
+ a.vphaddbd(xmmA, anyptr_gpB);
+ a.vphaddbq(xmmA, xmmB);
+ a.vphaddbq(xmmA, anyptr_gpB);
+ a.vphaddbw(xmmA, xmmB);
+ a.vphaddbw(xmmA, anyptr_gpB);
+ a.vphadddq(xmmA, xmmB);
+ a.vphadddq(xmmA, anyptr_gpB);
+ a.vphaddwd(xmmA, xmmB);
+ a.vphaddwd(xmmA, anyptr_gpB);
+ a.vphaddwq(xmmA, xmmB);
+ a.vphaddwq(xmmA, anyptr_gpB);
+ a.vphaddubd(xmmA, xmmB);
+ a.vphaddubd(xmmA, anyptr_gpB);
+ a.vphaddubq(xmmA, xmmB);
+ a.vphaddubq(xmmA, anyptr_gpB);
+ a.vphaddubw(xmmA, xmmB);
+ a.vphaddubw(xmmA, anyptr_gpB);
+ a.vphaddudq(xmmA, xmmB);
+ a.vphaddudq(xmmA, anyptr_gpB);
+ a.vphadduwd(xmmA, xmmB);
+ a.vphadduwd(xmmA, anyptr_gpB);
+ a.vphadduwq(xmmA, xmmB);
+ a.vphadduwq(xmmA, anyptr_gpB);
+ a.vphsubbw(xmmA, xmmB);
+ a.vphsubbw(xmmA, anyptr_gpB);
+ a.vphsubdq(xmmA, xmmB);
+ a.vphsubdq(xmmA, anyptr_gpB);
+ a.vphsubwd(xmmA, xmmB);
+ a.vphsubwd(xmmA, anyptr_gpB);
+ a.vpmacsdd(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacsdd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacsdqh(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacsdqh(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacsdql(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacsdql(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacswd(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacswd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacsww(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacsww(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacssdd(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacssdd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacssdqh(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacssdqh(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacssdql(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacssdql(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacsswd(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacsswd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmacssww(xmmA, xmmB, xmmC, xmmD);
+ a.vpmacssww(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmadcsswd(xmmA, xmmB, xmmC, xmmD);
+ a.vpmadcsswd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpmadcswd(xmmA, xmmB, xmmC, xmmD);
+ a.vpmadcswd(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpperm(xmmA, xmmB, xmmC, xmmD);
+ a.vpperm(xmmA, xmmB, anyptr_gpC, xmmD);
+ a.vpperm(xmmA, xmmB, xmmC, anyptr_gpD);
+ a.vprotb(xmmA, xmmB, xmmC);
+ a.vprotb(xmmA, anyptr_gpB, xmmC);
+ a.vprotb(xmmA, xmmB, anyptr_gpC);
+ a.vprotb(xmmA, xmmB, 0);
+ a.vprotb(xmmA, anyptr_gpB, 0);
+ a.vprotd(xmmA, xmmB, xmmC);
+ a.vprotd(xmmA, anyptr_gpB, xmmC);
+ a.vprotd(xmmA, xmmB, anyptr_gpC);
+ a.vprotd(xmmA, xmmB, 0);
+ a.vprotd(xmmA, anyptr_gpB, 0);
+ a.vprotq(xmmA, xmmB, xmmC);
+ a.vprotq(xmmA, anyptr_gpB, xmmC);
+ a.vprotq(xmmA, xmmB, anyptr_gpC);
+ a.vprotq(xmmA, xmmB, 0);
+ a.vprotq(xmmA, anyptr_gpB, 0);
+ a.vprotw(xmmA, xmmB, xmmC);
+ a.vprotw(xmmA, anyptr_gpB, xmmC);
+ a.vprotw(xmmA, xmmB, anyptr_gpC);
+ a.vprotw(xmmA, xmmB, 0);
+ a.vprotw(xmmA, anyptr_gpB, 0);
+ a.vpshab(xmmA, xmmB, xmmC);
+ a.vpshab(xmmA, anyptr_gpB, xmmC);
+ a.vpshab(xmmA, xmmB, anyptr_gpC);
+ a.vpshad(xmmA, xmmB, xmmC);
+ a.vpshad(xmmA, anyptr_gpB, xmmC);
+ a.vpshad(xmmA, xmmB, anyptr_gpC);
+ a.vpshaq(xmmA, xmmB, xmmC);
+ a.vpshaq(xmmA, anyptr_gpB, xmmC);
+ a.vpshaq(xmmA, xmmB, anyptr_gpC);
+ a.vpshaw(xmmA, xmmB, xmmC);
+ a.vpshaw(xmmA, anyptr_gpB, xmmC);
+ a.vpshaw(xmmA, xmmB, anyptr_gpC);
+ a.vpshlb(xmmA, xmmB, xmmC);
+ a.vpshlb(xmmA, anyptr_gpB, xmmC);
+ a.vpshlb(xmmA, xmmB, anyptr_gpC);
+ a.vpshld(xmmA, xmmB, xmmC);
+ a.vpshld(xmmA, anyptr_gpB, xmmC);
+ a.vpshld(xmmA, xmmB, anyptr_gpC);
+ a.vpshlq(xmmA, xmmB, xmmC);
+ a.vpshlq(xmmA, anyptr_gpB, xmmC);
+ a.vpshlq(xmmA, xmmB, anyptr_gpC);
+ a.vpshlw(xmmA, xmmB, xmmC);
+ a.vpshlw(xmmA, anyptr_gpB, xmmC);
+ a.vpshlw(xmmA, xmmB, anyptr_gpC);
+
+ // F16C.
+ a.nop();
+
+ a.vcvtph2ps(xmmA, xmmB);
+ a.vcvtph2ps(xmmA, anyptr_gpB);
+ a.vcvtph2ps(ymmA, xmmB);
+ a.vcvtph2ps(ymmA, anyptr_gpB);
+ a.vcvtps2ph(xmmA, xmmB, 0);
+ a.vcvtps2ph(anyptr_gpA, xmmB, 0);
+ a.vcvtps2ph(xmmA, ymmB, 0);
+ a.vcvtps2ph(anyptr_gpA, ymmB, 0);
+
+ // AVX512.
+ a.nop();
+
+ a.kaddb(kA, kB, kC);
+ a.kaddd(kA, kB, kC);
+ a.kaddq(kA, kB, kC);
+ a.kaddw(kA, kB, kC);
+ a.kandb(kA, kB, kC);
+ a.kandd(kA, kB, kC);
+ a.kandnb(kA, kB, kC);
+ a.kandnd(kA, kB, kC);
+ a.kandnq(kA, kB, kC);
+ a.kandnw(kA, kB, kC);
+ a.kandq(kA, kB, kC);
+ a.kandw(kA, kB, kC);
+ a.kmovb(kA, kB);
+ a.kmovb(kA, anyptr_gpB);
+ a.kmovb(kA, gdB);
+ if (isX64) a.kmovb(kA, gzB);
+ a.kmovb(anyptr_gpA, kB);
+ a.kmovb(gdA, kB);
+ if (isX64) a.kmovb(gzA, kB);
+ a.kmovd(kA, kB);
+ a.kmovd(kA, anyptr_gpB);
+ a.kmovd(kA, gdB);
+ if (isX64) a.kmovd(kA, gzB);
+ a.kmovd(anyptr_gpA, kB);
+ a.kmovd(gdA, kB);
+ if (isX64) a.kmovd(gzA, kB);
+ a.kmovq(kA, kB);
+ a.kmovq(kA, anyptr_gpB);
+ if (isX64) a.kmovq(kA, gzB);
+ a.kmovq(anyptr_gpA, kB);
+ if (isX64) a.kmovq(gzA, kB);
+ a.kmovw(kA, kB);
+ a.kmovw(kA, anyptr_gpB);
+ a.kmovw(kA, gdB);
+ if (isX64) a.kmovw(kA, gzB);
+ a.kmovw(anyptr_gpA, kB);
+ a.kmovw(gdA, kB);
+ if (isX64) a.kmovw(gzA, kB);
+ a.knotb(kA, kB);
+ a.knotd(kA, kB);
+ a.knotq(kA, kB);
+ a.knotw(kA, kB);
+ a.korb(kA, kB, kC);
+ a.kord(kA, kB, kC);
+ a.korq(kA, kB, kC);
+ a.kortestb(kA, kB);
+ a.kortestd(kA, kB);
+ a.kortestq(kA, kB);
+ a.kortestw(kA, kB);
+ a.korw(kA, kB, kC);
+ a.kshiftlb(kA, kB, 0);
+ a.kshiftld(kA, kB, 0);
+ a.kshiftlq(kA, kB, 0);
+ a.kshiftlw(kA, kB, 0);
+ a.kshiftrb(kA, kB, 0);
+ a.kshiftrd(kA, kB, 0);
+ a.kshiftrq(kA, kB, 0);
+ a.kshiftrw(kA, kB, 0);
+ a.ktestb(kA, kB);
+ a.ktestd(kA, kB);
+ a.ktestq(kA, kB);
+ a.ktestw(kA, kB);
+ a.kunpckbw(kA, kB, kC);
+ a.kunpckdq(kA, kB, kC);
+ a.kunpckwd(kA, kB, kC);
+ a.kxnorb(kA, kB, kC);
+ a.kxnord(kA, kB, kC);
+ a.kxnorq(kA, kB, kC);
+ a.kxnorw(kA, kB, kC);
+ a.kxorb(kA, kB, kC);
+ a.kxord(kA, kB, kC);
+ a.kxorq(kA, kB, kC);
+ a.kxorw(kA, kB, kC);
+ a.nop();
+
+ a.vaddpd(xmmA, xmmB, xmmC);
+ a.vaddpd(xmmA, xmmB, anyptr_gpC);
+ a.vaddpd(ymmA, ymmB, ymmC);
+ a.vaddpd(ymmA, ymmB, anyptr_gpC);
+ a.vaddpd(zmmA, zmmB, zmmC);
+ a.vaddpd(zmmA, zmmB, anyptr_gpC);
+ a.vaddps(xmmA, xmmB, xmmC);
+ a.vaddps(xmmA, xmmB, anyptr_gpC);
+ a.vaddps(ymmA, ymmB, ymmC);
+ a.vaddps(ymmA, ymmB, anyptr_gpC);
+ a.vaddps(zmmA, zmmB, zmmC);
+ a.vaddps(zmmA, zmmB, anyptr_gpC);
+ a.vaddsd(xmmA, xmmB, xmmC);
+ a.vaddsd(xmmA, xmmB, anyptr_gpC);
+ a.vaddss(xmmA, xmmB, xmmC);
+ a.vaddss(xmmA, xmmB, anyptr_gpC);
+ a.valignd(xmmA, xmmB, xmmC, 0);
+ a.valignd(xmmA, xmmB, anyptr_gpC, 0);
+ a.valignd(ymmA, ymmB, ymmC, 0);
+ a.valignd(ymmA, ymmB, anyptr_gpC, 0);
+ a.valignd(zmmA, zmmB, zmmC, 0);
+ a.valignd(zmmA, zmmB, anyptr_gpC, 0);
+ a.valignq(xmmA, xmmB, xmmC, 0);
+ a.valignq(xmmA, xmmB, anyptr_gpC, 0);
+ a.valignq(ymmA, ymmB, ymmC, 0);
+ a.valignq(ymmA, ymmB, anyptr_gpC, 0);
+ a.valignq(zmmA, zmmB, zmmC, 0);
+ a.valignq(zmmA, zmmB, anyptr_gpC, 0);
+ a.vandnpd(xmmA, xmmB, xmmC);
+ a.vandnpd(xmmA, xmmB, anyptr_gpC);
+ a.vandnpd(ymmA, ymmB, ymmC);
+ a.vandnpd(ymmA, ymmB, anyptr_gpC);
+ a.vandnpd(zmmA, zmmB, zmmC);
+ a.vandnpd(zmmA, zmmB, anyptr_gpC);
+ a.vandnps(xmmA, xmmB, xmmC);
+ a.vandnps(xmmA, xmmB, anyptr_gpC);
+ a.vandnps(ymmA, ymmB, ymmC);
+ a.vandnps(ymmA, ymmB, anyptr_gpC);
+ a.vandnps(zmmA, zmmB, zmmC);
+ a.vandnps(zmmA, zmmB, anyptr_gpC);
+ a.vandpd(xmmA, xmmB, xmmC);
+ a.vandpd(xmmA, xmmB, anyptr_gpC);
+ a.vandpd(ymmA, ymmB, ymmC);
+ a.vandpd(ymmA, ymmB, anyptr_gpC);
+ a.vandpd(zmmA, zmmB, zmmC);
+ a.vandpd(zmmA, zmmB, anyptr_gpC);
+ a.vandps(xmmA, xmmB, xmmC);
+ a.vandps(xmmA, xmmB, anyptr_gpC);
+ a.vandps(ymmA, ymmB, ymmC);
+ a.vandps(ymmA, ymmB, anyptr_gpC);
+ a.vandps(zmmA, zmmB, zmmC);
+ a.vandps(zmmA, zmmB, anyptr_gpC);
+ a.vblendmb(xmmA, xmmB, xmmC);
+ a.vblendmb(xmmA, xmmB, anyptr_gpC);
+ a.vblendmb(ymmA, ymmB, ymmC);
+ a.vblendmb(ymmA, ymmB, anyptr_gpC);
+ a.vblendmb(zmmA, zmmB, zmmC);
+ a.vblendmb(zmmA, zmmB, anyptr_gpC);
+ a.vblendmd(xmmA, xmmB, xmmC);
+ a.vblendmd(xmmA, xmmB, anyptr_gpC);
+ a.vblendmd(ymmA, ymmB, ymmC);
+ a.vblendmd(ymmA, ymmB, anyptr_gpC);
+ a.vblendmd(zmmA, zmmB, zmmC);
+ a.vblendmd(zmmA, zmmB, anyptr_gpC);
+ a.vblendmpd(xmmA, xmmB, xmmC);
+ a.vblendmpd(xmmA, xmmB, anyptr_gpC);
+ a.vblendmpd(ymmA, ymmB, ymmC);
+ a.vblendmpd(ymmA, ymmB, anyptr_gpC);
+ a.vblendmpd(zmmA, zmmB, zmmC);
+ a.vblendmpd(zmmA, zmmB, anyptr_gpC);
+ a.vblendmps(xmmA, xmmB, xmmC);
+ a.vblendmps(xmmA, xmmB, anyptr_gpC);
+ a.vblendmps(ymmA, ymmB, ymmC);
+ a.vblendmps(ymmA, ymmB, anyptr_gpC);
+ a.vblendmps(zmmA, zmmB, zmmC);
+ a.vblendmps(zmmA, zmmB, anyptr_gpC);
+ a.vblendmq(xmmA, xmmB, xmmC);
+ a.vblendmq(xmmA, xmmB, anyptr_gpC);
+ a.vblendmq(ymmA, ymmB, ymmC);
+ a.vblendmq(ymmA, ymmB, anyptr_gpC);
+ a.vblendmq(zmmA, zmmB, zmmC);
+ a.vblendmq(zmmA, zmmB, anyptr_gpC);
+ a.vblendmw(xmmA, xmmB, xmmC);
+ a.vblendmw(xmmA, xmmB, anyptr_gpC);
+ a.vblendmw(ymmA, ymmB, ymmC);
+ a.vblendmw(ymmA, ymmB, anyptr_gpC);
+ a.vblendmw(zmmA, zmmB, zmmC);
+ a.vblendmw(zmmA, zmmB, anyptr_gpC);
+ a.vbroadcastf32x2(ymmA, xmmB);
+ a.vbroadcastf32x2(ymmA, anyptr_gpB);
+ a.vbroadcastf32x2(zmmA, xmmB);
+ a.vbroadcastf32x2(zmmA, anyptr_gpB);
+ a.vbroadcastf32x4(ymmA, anyptr_gpB);
+ a.vbroadcastf32x4(zmmA, anyptr_gpB);
+ a.vbroadcastf32x8(zmmA, anyptr_gpB);
+ a.vbroadcastf64x2(ymmA, anyptr_gpB);
+ a.vbroadcastf64x2(zmmA, anyptr_gpB);
+ a.vbroadcastf64x4(zmmA, anyptr_gpB);
+ a.vbroadcasti32x2(xmmA, xmmB);
+ a.vbroadcasti32x2(xmmA, anyptr_gpB);
+ a.vbroadcasti32x2(ymmA, xmmB);
+ a.vbroadcasti32x2(ymmA, anyptr_gpB);
+ a.vbroadcasti32x2(zmmA, xmmB);
+ a.vbroadcasti32x2(zmmA, anyptr_gpB);
+ a.vbroadcasti32x4(ymmA, xmmB);
+ a.vbroadcasti32x4(ymmA, anyptr_gpB);
+ a.vbroadcasti32x4(zmmA, xmmB);
+ a.vbroadcasti32x4(zmmA, anyptr_gpB);
+ a.vbroadcasti32x8(zmmA, xmmB);
+ a.vbroadcasti32x8(zmmA, anyptr_gpB);
+ a.vbroadcasti64x2(ymmA, xmmB);
+ a.vbroadcasti64x2(ymmA, anyptr_gpB);
+ a.vbroadcasti64x2(zmmA, xmmB);
+ a.vbroadcasti64x2(zmmA, anyptr_gpB);
+ a.vbroadcasti64x4(zmmA, xmmB);
+ a.vbroadcasti64x4(zmmA, anyptr_gpB);
+ a.vbroadcastsd(ymmA, xmmB);
+ a.vbroadcastsd(ymmA, anyptr_gpB);
+ a.vbroadcastsd(zmmA, xmmB);
+ a.vbroadcastsd(zmmA, anyptr_gpB);
+ a.vbroadcastss(xmmA, xmmB);
+ a.vbroadcastss(xmmA, anyptr_gpB);
+ a.vbroadcastss(ymmA, xmmB);
+ a.vbroadcastss(ymmA, anyptr_gpB);
+ a.vbroadcastss(zmmA, xmmB);
+ a.vbroadcastss(zmmA, anyptr_gpB);
+ a.vcmppd(kA, xmmB, xmmC, 0);
+ a.vcmppd(kA, xmmB, anyptr_gpC, 0);
+ a.vcmppd(kA, ymmB, ymmC, 0);
+ a.vcmppd(kA, ymmB, anyptr_gpC, 0);
+ a.vcmppd(kA, zmmB, zmmC, 0);
+ a.vcmppd(kA, zmmB, anyptr_gpC, 0);
+ a.vcmpps(kA, xmmB, xmmC, 0);
+ a.vcmpps(kA, xmmB, anyptr_gpC, 0);
+ a.vcmpps(kA, ymmB, ymmC, 0);
+ a.vcmpps(kA, ymmB, anyptr_gpC, 0);
+ a.vcmpps(kA, zmmB, zmmC, 0);
+ a.vcmpps(kA, zmmB, anyptr_gpC, 0);
+ a.vcmpsd(kA, xmmB, xmmC, 0);
+ a.vcmpsd(kA, xmmB, anyptr_gpC, 0);
+ a.vcmpss(kA, xmmB, xmmC, 0);
+ a.vcmpss(kA, xmmB, anyptr_gpC, 0);
+ a.vcomisd(xmmA, xmmB);
+ a.vcomisd(xmmA, anyptr_gpB);
+ a.vcomiss(xmmA, xmmB);
+ a.vcomiss(xmmA, anyptr_gpB);
+ a.vcompresspd(xmmA, xmmB);
+ a.vcompresspd(anyptr_gpA, xmmB);
+ a.vcompresspd(ymmA, ymmB);
+ a.vcompresspd(anyptr_gpA, ymmB);
+ a.vcompresspd(zmmA, zmmB);
+ a.vcompresspd(anyptr_gpA, zmmB);
+ a.vcompressps(xmmA, xmmB);
+ a.vcompressps(anyptr_gpA, xmmB);
+ a.vcompressps(ymmA, ymmB);
+ a.vcompressps(anyptr_gpA, ymmB);
+ a.vcompressps(zmmA, zmmB);
+ a.vcompressps(anyptr_gpA, zmmB);
+ a.vcvtdq2pd(xmmA, xmmB);
+ a.vcvtdq2pd(xmmA, anyptr_gpB);
+ a.vcvtdq2pd(ymmA, xmmB);
+ a.vcvtdq2pd(ymmA, anyptr_gpB);
+ a.vcvtdq2pd(zmmA, ymmB);
+ a.vcvtdq2pd(zmmA, anyptr_gpB);
+ a.vcvtdq2ps(xmmA, xmmB);
+ a.vcvtdq2ps(xmmA, anyptr_gpB);
+ a.vcvtdq2ps(ymmA, ymmB);
+ a.vcvtdq2ps(ymmA, anyptr_gpB);
+ a.vcvtdq2ps(zmmA, zmmB);
+ a.vcvtdq2ps(zmmA, anyptr_gpB);
+ a.vcvtpd2dq(xmmA, xmmB);
+ a.vcvtpd2dq(xmmA, anyptr_gpB);
+ a.vcvtpd2dq(xmmA, ymmB);
+ a.vcvtpd2dq(xmmA, anyptr_gpB);
+ a.vcvtpd2dq(ymmA, zmmB);
+ a.vcvtpd2dq(ymmA, anyptr_gpB);
+ a.vcvtpd2qq(xmmA, xmmB);
+ a.vcvtpd2qq(xmmA, anyptr_gpB);
+ a.vcvtpd2qq(ymmA, ymmB);
+ a.vcvtpd2qq(ymmA, anyptr_gpB);
+ a.vcvtpd2qq(zmmA, zmmB);
+ a.vcvtpd2qq(zmmA, anyptr_gpB);
+ a.vcvtpd2udq(xmmA, xmmB);
+ a.vcvtpd2udq(xmmA, anyptr_gpB);
+ a.vcvtpd2udq(xmmA, ymmB);
+ a.vcvtpd2udq(xmmA, anyptr_gpB);
+ a.vcvtpd2udq(ymmA, zmmB);
+ a.vcvtpd2udq(ymmA, anyptr_gpB);
+ a.vcvtpd2uqq(xmmA, xmmB);
+ a.vcvtpd2uqq(xmmA, anyptr_gpB);
+ a.vcvtpd2uqq(ymmA, ymmB);
+ a.vcvtpd2uqq(ymmA, anyptr_gpB);
+ a.vcvtpd2uqq(zmmA, zmmB);
+ a.vcvtpd2uqq(zmmA, anyptr_gpB);
+ a.vcvtph2ps(xmmA, xmmB);
+ a.vcvtph2ps(xmmA, anyptr_gpB);
+ a.vcvtph2ps(ymmA, xmmB);
+ a.vcvtph2ps(ymmA, anyptr_gpB);
+ a.vcvtph2ps(zmmA, ymmB);
+ a.vcvtph2ps(zmmA, anyptr_gpB);
+ a.vcvtps2dq(xmmA, xmmB);
+ a.vcvtps2dq(xmmA, anyptr_gpB);
+ a.vcvtps2dq(ymmA, ymmB);
+ a.vcvtps2dq(ymmA, anyptr_gpB);
+ a.vcvtps2dq(zmmA, zmmB);
+ a.vcvtps2dq(zmmA, anyptr_gpB);
+ a.vcvtps2pd(xmmA, xmmB);
+ a.vcvtps2pd(xmmA, anyptr_gpB);
+ a.vcvtps2pd(ymmA, xmmB);
+ a.vcvtps2pd(ymmA, anyptr_gpB);
+ a.vcvtps2pd(zmmA, ymmB);
+ a.vcvtps2pd(zmmA, anyptr_gpB);
+ a.vcvtps2ph(xmmA, xmmB, 0);
+ a.vcvtps2ph(anyptr_gpA, xmmB, 0);
+ a.vcvtps2ph(xmmA, ymmB, 0);
+ a.vcvtps2ph(anyptr_gpA, ymmB, 0);
+ a.vcvtps2ph(ymmA, zmmB, 0);
+ a.vcvtps2ph(anyptr_gpA, zmmB, 0);
+ a.vcvtps2qq(xmmA, xmmB);
+ a.vcvtps2qq(xmmA, anyptr_gpB);
+ a.vcvtps2qq(ymmA, xmmB);
+ a.vcvtps2qq(ymmA, anyptr_gpB);
+ a.vcvtps2qq(zmmA, ymmB);
+ a.vcvtps2qq(zmmA, anyptr_gpB);
+ a.vcvtps2udq(xmmA, xmmB);
+ a.vcvtps2udq(xmmA, anyptr_gpB);
+ a.vcvtps2udq(ymmA, ymmB);
+ a.vcvtps2udq(ymmA, anyptr_gpB);
+ a.vcvtps2udq(zmmA, zmmB);
+ a.vcvtps2udq(zmmA, anyptr_gpB);
+ a.vcvtps2uqq(xmmA, xmmB);
+ a.vcvtps2uqq(xmmA, anyptr_gpB);
+ a.vcvtps2uqq(ymmA, xmmB);
+ a.vcvtps2uqq(ymmA, anyptr_gpB);
+ a.vcvtps2uqq(zmmA, ymmB);
+ a.vcvtps2uqq(zmmA, anyptr_gpB);
+ a.vcvtqq2pd(xmmA, xmmB);
+ a.vcvtqq2pd(xmmA, anyptr_gpB);
+ a.vcvtqq2pd(ymmA, ymmB);
+ a.vcvtqq2pd(ymmA, anyptr_gpB);
+ a.vcvtqq2pd(zmmA, zmmB);
+ a.vcvtqq2pd(zmmA, anyptr_gpB);
+ a.vcvtqq2ps(xmmA, xmmB);
+ a.vcvtqq2ps(xmmA, anyptr_gpB);
+ a.vcvtqq2ps(xmmA, ymmB);
+ a.vcvtqq2ps(xmmA, anyptr_gpB);
+ a.vcvtqq2ps(ymmA, zmmB);
+ a.vcvtqq2ps(ymmA, anyptr_gpB);
+ a.vcvtsd2si(gdA, xmmB);
+ a.vcvtsd2si(gdA, anyptr_gpB);
+ if (isX64) a.vcvtsd2si(gzA, xmmB);
+ if (isX64) a.vcvtsd2si(gzA, anyptr_gpB);
+ a.vcvtsd2ss(xmmA, xmmB, xmmC);
+ a.vcvtsd2ss(xmmA, xmmB, anyptr_gpC);
+ a.vcvtsd2usi(gdA, xmmB);
+ a.vcvtsd2usi(gdA, anyptr_gpB);
+ if (isX64) a.vcvtsd2usi(gzA, xmmB);
+ if (isX64) a.vcvtsd2usi(gzA, anyptr_gpB);
+ a.vcvtsi2sd(xmmA, xmmB, gdC);
+ a.vcvtsi2sd(xmmA, xmmB, dword_ptr(gzC));
+ if (isX64) a.vcvtsi2sd(xmmA, xmmB, gzC);
+ if (isX64) a.vcvtsi2sd(xmmA, xmmB, qword_ptr(gzC));
+ a.vcvtsi2ss(xmmA, xmmB, gdC);
+ a.vcvtsi2ss(xmmA, xmmB, dword_ptr(gzC));
+ if (isX64) a.vcvtsi2ss(xmmA, xmmB, gzC);
+ if (isX64) a.vcvtsi2ss(xmmA, xmmB, qword_ptr(gzC));
+ a.vcvtss2sd(xmmA, xmmB, xmmC);
+ a.vcvtss2sd(xmmA, xmmB, anyptr_gpC);
+ a.vcvtss2si(gdA, xmmB);
+ a.vcvtss2si(gdA, anyptr_gpB);
+ if (isX64) a.vcvtss2si(gzA, xmmB);
+ if (isX64) a.vcvtss2si(gzA, anyptr_gpB);
+ a.vcvtss2usi(gdA, xmmB);
+ a.vcvtss2usi(gdA, anyptr_gpB);
+ if (isX64) a.vcvtss2usi(gzA, xmmB);
+ if (isX64) a.vcvtss2usi(gzA, anyptr_gpB);
+ a.vcvttpd2dq(xmmA, xmmB);
+ a.vcvttpd2dq(xmmA, anyptr_gpB);
+ a.vcvttpd2dq(xmmA, ymmB);
+ a.vcvttpd2dq(xmmA, anyptr_gpB);
+ a.vcvttpd2dq(ymmA, zmmB);
+ a.vcvttpd2dq(ymmA, anyptr_gpB);
+ a.vcvttpd2qq(xmmA, xmmB);
+ a.vcvttpd2qq(xmmA, anyptr_gpB);
+ a.vcvttpd2qq(ymmA, ymmB);
+ a.vcvttpd2qq(ymmA, anyptr_gpB);
+ a.vcvttpd2qq(zmmA, zmmB);
+ a.vcvttpd2qq(zmmA, anyptr_gpB);
+ a.vcvttpd2udq(xmmA, xmmB);
+ a.vcvttpd2udq(xmmA, anyptr_gpB);
+ a.vcvttpd2udq(xmmA, ymmB);
+ a.vcvttpd2udq(xmmA, anyptr_gpB);
+ a.vcvttpd2udq(ymmA, zmmB);
+ a.vcvttpd2udq(ymmA, anyptr_gpB);
+ a.vcvttpd2uqq(xmmA, xmmB);
+ a.vcvttpd2uqq(xmmA, anyptr_gpB);
+ a.vcvttpd2uqq(ymmA, ymmB);
+ a.vcvttpd2uqq(ymmA, anyptr_gpB);
+ a.vcvttpd2uqq(zmmA, zmmB);
+ a.vcvttpd2uqq(zmmA, anyptr_gpB);
+ a.vcvttps2dq(xmmA, xmmB);
+ a.vcvttps2dq(xmmA, anyptr_gpB);
+ a.vcvttps2dq(ymmA, ymmB);
+ a.vcvttps2dq(ymmA, anyptr_gpB);
+ a.vcvttps2dq(zmmA, zmmB);
+ a.vcvttps2dq(zmmA, anyptr_gpB);
+ a.vcvttps2qq(xmmA, xmmB);
+ a.vcvttps2qq(xmmA, anyptr_gpB);
+ a.vcvttps2qq(ymmA, xmmB);
+ a.vcvttps2qq(ymmA, anyptr_gpB);
+ a.vcvttps2qq(zmmA, ymmB);
+ a.vcvttps2qq(zmmA, anyptr_gpB);
+ a.vcvttps2udq(xmmA, xmmB);
+ a.vcvttps2udq(xmmA, anyptr_gpB);
+ a.vcvttps2udq(ymmA, ymmB);
+ a.vcvttps2udq(ymmA, anyptr_gpB);
+ a.vcvttps2udq(zmmA, zmmB);
+ a.vcvttps2udq(zmmA, anyptr_gpB);
+ a.vcvttps2uqq(xmmA, xmmB);
+ a.vcvttps2uqq(xmmA, anyptr_gpB);
+ a.vcvttps2uqq(ymmA, xmmB);
+ a.vcvttps2uqq(ymmA, anyptr_gpB);
+ a.vcvttps2uqq(zmmA, ymmB);
+ a.vcvttps2uqq(zmmA, anyptr_gpB);
+ a.vcvttsd2si(gdA, xmmB);
+ a.vcvttsd2si(gdA, anyptr_gpB);
+ if (isX64) a.vcvttsd2si(gzA, xmmB);
+ if (isX64) a.vcvttsd2si(gzA, anyptr_gpB);
+ a.vcvttsd2usi(gdA, xmmB);
+ a.vcvttsd2usi(gdA, anyptr_gpB);
+ if (isX64) a.vcvttsd2usi(gzA, xmmB);
+ if (isX64) a.vcvttsd2usi(gzA, anyptr_gpB);
+ a.vcvttss2si(gdA, xmmB);
+ a.vcvttss2si(gdA, anyptr_gpB);
+ if (isX64) a.vcvttss2si(gzA, xmmB);
+ if (isX64) a.vcvttss2si(gzA, anyptr_gpB);
+ a.vcvttss2usi(gdA, xmmB);
+ a.vcvttss2usi(gdA, anyptr_gpB);
+ if (isX64) a.vcvttss2usi(gzA, xmmB);
+ if (isX64) a.vcvttss2usi(gzA, anyptr_gpB);
+ a.vcvtudq2pd(xmmA, xmmB);
+ a.vcvtudq2pd(xmmA, anyptr_gpB);
+ a.vcvtudq2pd(ymmA, xmmB);
+ a.vcvtudq2pd(ymmA, anyptr_gpB);
+ a.vcvtudq2pd(zmmA, ymmB);
+ a.vcvtudq2pd(zmmA, anyptr_gpB);
+ a.vcvtudq2ps(xmmA, xmmB);
+ a.vcvtudq2ps(xmmA, anyptr_gpB);
+ a.vcvtudq2ps(ymmA, ymmB);
+ a.vcvtudq2ps(ymmA, anyptr_gpB);
+ a.vcvtudq2ps(zmmA, zmmB);
+ a.vcvtudq2ps(zmmA, anyptr_gpB);
+ a.vcvtuqq2pd(xmmA, xmmB);
+ a.vcvtuqq2pd(xmmA, anyptr_gpB);
+ a.vcvtuqq2pd(ymmA, ymmB);
+ a.vcvtuqq2pd(ymmA, anyptr_gpB);
+ a.vcvtuqq2pd(zmmA, zmmB);
+ a.vcvtuqq2pd(zmmA, anyptr_gpB);
+ a.vcvtuqq2ps(xmmA, xmmB);
+ a.vcvtuqq2ps(xmmA, anyptr_gpB);
+ a.vcvtuqq2ps(xmmA, ymmB);
+ a.vcvtuqq2ps(xmmA, anyptr_gpB);
+ a.vcvtuqq2ps(ymmA, zmmB);
+ a.vcvtuqq2ps(ymmA, anyptr_gpB);
+ a.vcvtusi2sd(xmmA, xmmB, gdC);
+ a.vcvtusi2sd(xmmA, xmmB, dword_ptr(gzC));
+ if (isX64) a.vcvtusi2sd(xmmA, xmmB, gzC);
+ if (isX64) a.vcvtusi2sd(xmmA, xmmB, qword_ptr(gzC));
+ a.vcvtusi2ss(xmmA, xmmB, gdC);
+ a.vcvtusi2ss(xmmA, xmmB, dword_ptr(gzC));
+ if (isX64) a.vcvtusi2ss(xmmA, xmmB, gzC);
+ if (isX64) a.vcvtusi2ss(xmmA, xmmB, qword_ptr(gzC));
+ a.vdbpsadbw(xmmA, xmmB, xmmC, 0);
+ a.vdbpsadbw(xmmA, xmmB, anyptr_gpC, 0);
+ a.vdbpsadbw(ymmA, ymmB, ymmC, 0);
+ a.vdbpsadbw(ymmA, ymmB, anyptr_gpC, 0);
+ a.vdbpsadbw(zmmA, zmmB, zmmC, 0);
+ a.vdbpsadbw(zmmA, zmmB, anyptr_gpC, 0);
+ a.vdivpd(xmmA, xmmB, xmmC);
+ a.vdivpd(xmmA, xmmB, anyptr_gpC);
+ a.vdivpd(ymmA, ymmB, ymmC);
+ a.vdivpd(ymmA, ymmB, anyptr_gpC);
+ a.vdivpd(zmmA, zmmB, zmmC);
+ a.vdivpd(zmmA, zmmB, anyptr_gpC);
+ a.vdivps(xmmA, xmmB, xmmC);
+ a.vdivps(xmmA, xmmB, anyptr_gpC);
+ a.vdivps(ymmA, ymmB, ymmC);
+ a.vdivps(ymmA, ymmB, anyptr_gpC);
+ a.vdivps(zmmA, zmmB, zmmC);
+ a.vdivps(zmmA, zmmB, anyptr_gpC);
+ a.vdivsd(xmmA, xmmB, xmmC);
+ a.vdivsd(xmmA, xmmB, anyptr_gpC);
+ a.vdivss(xmmA, xmmB, xmmC);
+ a.vdivss(xmmA, xmmB, anyptr_gpC);
+ a.vexp2pd(zmmA, zmmB);
+ a.vexp2pd(zmmA, anyptr_gpB);
+ a.vexp2ps(zmmA, zmmB);
+ a.vexp2ps(zmmA, anyptr_gpB);
+ a.vexpandpd(xmmA, xmmB);
+ a.vexpandpd(xmmA, anyptr_gpB);
+ a.vexpandpd(ymmA, ymmB);
+ a.vexpandpd(ymmA, anyptr_gpB);
+ a.vexpandpd(zmmA, zmmB);
+ a.vexpandpd(zmmA, anyptr_gpB);
+ a.vexpandps(xmmA, xmmB);
+ a.vexpandps(xmmA, anyptr_gpB);
+ a.vexpandps(ymmA, ymmB);
+ a.vexpandps(ymmA, anyptr_gpB);
+ a.vexpandps(zmmA, zmmB);
+ a.vexpandps(zmmA, anyptr_gpB);
+ a.vextractf32x4(xmmA, ymmB, 0);
+ a.vextractf32x4(anyptr_gpA, ymmB, 0);
+ a.vextractf32x4(xmmA, zmmB, 0);
+ a.vextractf32x4(anyptr_gpA, zmmB, 0);
+ a.vextractf32x8(ymmA, zmmB, 0);
+ a.vextractf32x8(anyptr_gpA, zmmB, 0);
+ a.vextractf64x2(xmmA, ymmB, 0);
+ a.vextractf64x2(anyptr_gpA, ymmB, 0);
+ a.vextractf64x2(xmmA, zmmB, 0);
+ a.vextractf64x2(anyptr_gpA, zmmB, 0);
+ a.vextractf64x4(ymmA, zmmB, 0);
+ a.vextractf64x4(anyptr_gpA, zmmB, 0);
+ a.vextracti32x4(xmmA, ymmB, 0);
+ a.vextracti32x4(anyptr_gpA, ymmB, 0);
+ a.vextracti32x4(xmmA, zmmB, 0);
+ a.vextracti32x4(anyptr_gpA, zmmB, 0);
+ a.vextracti32x8(ymmA, zmmB, 0);
+ a.vextracti32x8(anyptr_gpA, zmmB, 0);
+ a.vextracti64x2(xmmA, ymmB, 0);
+ a.vextracti64x2(anyptr_gpA, ymmB, 0);
+ a.vextracti64x2(xmmA, zmmB, 0);
+ a.vextracti64x2(anyptr_gpA, zmmB, 0);
+ a.vextracti64x4(ymmA, zmmB, 0);
+ a.vextracti64x4(anyptr_gpA, zmmB, 0);
+ a.vextractps(gdA, xmmB, 0);
+ a.vextractps(gzA, xmmB, 0);
+ a.vextractps(anyptr_gpA, xmmB, 0);
+ a.vfixupimmpd(xmmA, xmmB, xmmC, 0);
+ a.vfixupimmpd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vfixupimmpd(ymmA, ymmB, ymmC, 0);
+ a.vfixupimmpd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vfixupimmpd(zmmA, zmmB, zmmC, 0);
+ a.vfixupimmpd(zmmA, zmmB, anyptr_gpC, 0);
+ a.vfixupimmps(xmmA, xmmB, xmmC, 0);
+ a.vfixupimmps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vfixupimmps(ymmA, ymmB, ymmC, 0);
+ a.vfixupimmps(ymmA, ymmB, anyptr_gpC, 0);
+ a.vfixupimmps(zmmA, zmmB, zmmC, 0);
+ a.vfixupimmps(zmmA, zmmB, anyptr_gpC, 0);
+ a.vfixupimmsd(xmmA, xmmB, xmmC, 0);
+ a.vfixupimmsd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vfixupimmss(xmmA, xmmB, xmmC, 0);
+ a.vfixupimmss(xmmA, xmmB, anyptr_gpC, 0);
+ a.vfmadd132pd(xmmA, xmmB, xmmC);
+ a.vfmadd132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd132pd(ymmA, ymmB, ymmC);
+ a.vfmadd132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd132pd(zmmA, zmmB, zmmC);
+ a.vfmadd132pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmadd132ps(xmmA, xmmB, xmmC);
+ a.vfmadd132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd132ps(ymmA, ymmB, ymmC);
+ a.vfmadd132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd132ps(zmmA, zmmB, zmmC);
+ a.vfmadd132ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmadd132sd(xmmA, xmmB, xmmC);
+ a.vfmadd132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd132ss(xmmA, xmmB, xmmC);
+ a.vfmadd132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213pd(xmmA, xmmB, xmmC);
+ a.vfmadd213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213pd(ymmA, ymmB, ymmC);
+ a.vfmadd213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd213pd(zmmA, zmmB, zmmC);
+ a.vfmadd213pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmadd213ps(xmmA, xmmB, xmmC);
+ a.vfmadd213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213ps(ymmA, ymmB, ymmC);
+ a.vfmadd213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd213ps(zmmA, zmmB, zmmC);
+ a.vfmadd213ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmadd213sd(xmmA, xmmB, xmmC);
+ a.vfmadd213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd213ss(xmmA, xmmB, xmmC);
+ a.vfmadd213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231pd(xmmA, xmmB, xmmC);
+ a.vfmadd231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231pd(ymmA, ymmB, ymmC);
+ a.vfmadd231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd231pd(zmmA, zmmB, zmmC);
+ a.vfmadd231pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmadd231ps(xmmA, xmmB, xmmC);
+ a.vfmadd231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231ps(ymmA, ymmB, ymmC);
+ a.vfmadd231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmadd231ps(zmmA, zmmB, zmmC);
+ a.vfmadd231ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmadd231sd(xmmA, xmmB, xmmC);
+ a.vfmadd231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmadd231ss(xmmA, xmmB, xmmC);
+ a.vfmadd231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub132pd(xmmA, xmmB, xmmC);
+ a.vfmaddsub132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub132pd(ymmA, ymmB, ymmC);
+ a.vfmaddsub132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub132pd(zmmA, zmmB, zmmC);
+ a.vfmaddsub132pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmaddsub132ps(xmmA, xmmB, xmmC);
+ a.vfmaddsub132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub132ps(ymmA, ymmB, ymmC);
+ a.vfmaddsub132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub132ps(zmmA, zmmB, zmmC);
+ a.vfmaddsub132ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmaddsub213pd(xmmA, xmmB, xmmC);
+ a.vfmaddsub213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub213pd(ymmA, ymmB, ymmC);
+ a.vfmaddsub213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub213pd(zmmA, zmmB, zmmC);
+ a.vfmaddsub213pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmaddsub213ps(xmmA, xmmB, xmmC);
+ a.vfmaddsub213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub213ps(ymmA, ymmB, ymmC);
+ a.vfmaddsub213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub213ps(zmmA, zmmB, zmmC);
+ a.vfmaddsub213ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmaddsub231pd(xmmA, xmmB, xmmC);
+ a.vfmaddsub231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub231pd(ymmA, ymmB, ymmC);
+ a.vfmaddsub231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub231pd(zmmA, zmmB, zmmC);
+ a.vfmaddsub231pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmaddsub231ps(xmmA, xmmB, xmmC);
+ a.vfmaddsub231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmaddsub231ps(ymmA, ymmB, ymmC);
+ a.vfmaddsub231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmaddsub231ps(zmmA, zmmB, zmmC);
+ a.vfmaddsub231ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmsub132pd(xmmA, xmmB, xmmC);
+ a.vfmsub132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub132pd(ymmA, ymmB, ymmC);
+ a.vfmsub132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub132pd(zmmA, zmmB, zmmC);
+ a.vfmsub132pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmsub132ps(xmmA, xmmB, xmmC);
+ a.vfmsub132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub132ps(ymmA, ymmB, ymmC);
+ a.vfmsub132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub132ps(zmmA, zmmB, zmmC);
+ a.vfmsub132ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmsub132sd(xmmA, xmmB, xmmC);
+ a.vfmsub132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub132ss(xmmA, xmmB, xmmC);
+ a.vfmsub132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213pd(xmmA, xmmB, xmmC);
+ a.vfmsub213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213pd(ymmA, ymmB, ymmC);
+ a.vfmsub213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub213pd(zmmA, zmmB, zmmC);
+ a.vfmsub213pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmsub213ps(xmmA, xmmB, xmmC);
+ a.vfmsub213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213ps(ymmA, ymmB, ymmC);
+ a.vfmsub213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub213ps(zmmA, zmmB, zmmC);
+ a.vfmsub213ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmsub213sd(xmmA, xmmB, xmmC);
+ a.vfmsub213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub213ss(xmmA, xmmB, xmmC);
+ a.vfmsub213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231pd(xmmA, xmmB, xmmC);
+ a.vfmsub231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231pd(ymmA, ymmB, ymmC);
+ a.vfmsub231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub231pd(zmmA, zmmB, zmmC);
+ a.vfmsub231pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmsub231ps(xmmA, xmmB, xmmC);
+ a.vfmsub231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231ps(ymmA, ymmB, ymmC);
+ a.vfmsub231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsub231ps(zmmA, zmmB, zmmC);
+ a.vfmsub231ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmsub231sd(xmmA, xmmB, xmmC);
+ a.vfmsub231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsub231ss(xmmA, xmmB, xmmC);
+ a.vfmsub231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd132pd(xmmA, xmmB, xmmC);
+ a.vfmsubadd132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd132pd(ymmA, ymmB, ymmC);
+ a.vfmsubadd132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd132pd(zmmA, zmmB, zmmC);
+ a.vfmsubadd132pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmsubadd132ps(xmmA, xmmB, xmmC);
+ a.vfmsubadd132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd132ps(ymmA, ymmB, ymmC);
+ a.vfmsubadd132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd132ps(zmmA, zmmB, zmmC);
+ a.vfmsubadd132ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmsubadd213pd(xmmA, xmmB, xmmC);
+ a.vfmsubadd213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd213pd(ymmA, ymmB, ymmC);
+ a.vfmsubadd213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd213pd(zmmA, zmmB, zmmC);
+ a.vfmsubadd213pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmsubadd213ps(xmmA, xmmB, xmmC);
+ a.vfmsubadd213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd213ps(ymmA, ymmB, ymmC);
+ a.vfmsubadd213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd213ps(zmmA, zmmB, zmmC);
+ a.vfmsubadd213ps(zmmA, zmmB, anyptr_gpC);
+ a.vfmsubadd231pd(xmmA, xmmB, xmmC);
+ a.vfmsubadd231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd231pd(ymmA, ymmB, ymmC);
+ a.vfmsubadd231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd231pd(zmmA, zmmB, zmmC);
+ a.vfmsubadd231pd(zmmA, zmmB, anyptr_gpC);
+ a.vfmsubadd231ps(xmmA, xmmB, xmmC);
+ a.vfmsubadd231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfmsubadd231ps(ymmA, ymmB, ymmC);
+ a.vfmsubadd231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfmsubadd231ps(zmmA, zmmB, zmmC);
+ a.vfmsubadd231ps(zmmA, zmmB, anyptr_gpC);
+ a.vfnmadd132pd(xmmA, xmmB, xmmC);
+ a.vfnmadd132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd132pd(ymmA, ymmB, ymmC);
+ a.vfnmadd132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd132pd(zmmA, zmmB, zmmC);
+ a.vfnmadd132pd(zmmA, zmmB, anyptr_gpC);
+ a.vfnmadd132ps(xmmA, xmmB, xmmC);
+ a.vfnmadd132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd132ps(ymmA, ymmB, ymmC);
+ a.vfnmadd132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd132ps(zmmA, zmmB, zmmC);
+ a.vfnmadd132ps(zmmA, zmmB, anyptr_gpC);
+ a.vfnmadd132sd(xmmA, xmmB, xmmC);
+ a.vfnmadd132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd132ss(xmmA, xmmB, xmmC);
+ a.vfnmadd132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213pd(xmmA, xmmB, xmmC);
+ a.vfnmadd213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213pd(ymmA, ymmB, ymmC);
+ a.vfnmadd213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd213pd(zmmA, zmmB, zmmC);
+ a.vfnmadd213pd(zmmA, zmmB, anyptr_gpC);
+ a.vfnmadd213ps(xmmA, xmmB, xmmC);
+ a.vfnmadd213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213ps(ymmA, ymmB, ymmC);
+ a.vfnmadd213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd213ps(zmmA, zmmB, zmmC);
+ a.vfnmadd213ps(zmmA, zmmB, anyptr_gpC);
+ a.vfnmadd213sd(xmmA, xmmB, xmmC);
+ a.vfnmadd213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd213ss(xmmA, xmmB, xmmC);
+ a.vfnmadd213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231pd(xmmA, xmmB, xmmC);
+ a.vfnmadd231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231pd(ymmA, ymmB, ymmC);
+ a.vfnmadd231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd231pd(zmmA, zmmB, zmmC);
+ a.vfnmadd231pd(zmmA, zmmB, anyptr_gpC);
+ a.vfnmadd231ps(xmmA, xmmB, xmmC);
+ a.vfnmadd231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231ps(ymmA, ymmB, ymmC);
+ a.vfnmadd231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmadd231ps(zmmA, zmmB, zmmC);
+ a.vfnmadd231ps(zmmA, zmmB, anyptr_gpC);
+ a.vfnmadd231sd(xmmA, xmmB, xmmC);
+ a.vfnmadd231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmadd231ss(xmmA, xmmB, xmmC);
+ a.vfnmadd231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132pd(xmmA, xmmB, xmmC);
+ a.vfnmsub132pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132pd(ymmA, ymmB, ymmC);
+ a.vfnmsub132pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub132pd(zmmA, zmmB, zmmC);
+ a.vfnmsub132pd(zmmA, zmmB, anyptr_gpC);
+ a.vfnmsub132ps(xmmA, xmmB, xmmC);
+ a.vfnmsub132ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132ps(ymmA, ymmB, ymmC);
+ a.vfnmsub132ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub132ps(zmmA, zmmB, zmmC);
+ a.vfnmsub132ps(zmmA, zmmB, anyptr_gpC);
+ a.vfnmsub132sd(xmmA, xmmB, xmmC);
+ a.vfnmsub132sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub132ss(xmmA, xmmB, xmmC);
+ a.vfnmsub132ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213pd(xmmA, xmmB, xmmC);
+ a.vfnmsub213pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213pd(ymmA, ymmB, ymmC);
+ a.vfnmsub213pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub213pd(zmmA, zmmB, zmmC);
+ a.vfnmsub213pd(zmmA, zmmB, anyptr_gpC);
+ a.vfnmsub213ps(xmmA, xmmB, xmmC);
+ a.vfnmsub213ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213ps(ymmA, ymmB, ymmC);
+ a.vfnmsub213ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub213ps(zmmA, zmmB, zmmC);
+ a.vfnmsub213ps(zmmA, zmmB, anyptr_gpC);
+ a.vfnmsub213sd(xmmA, xmmB, xmmC);
+ a.vfnmsub213sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub213ss(xmmA, xmmB, xmmC);
+ a.vfnmsub213ss(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231pd(xmmA, xmmB, xmmC);
+ a.vfnmsub231pd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231pd(ymmA, ymmB, ymmC);
+ a.vfnmsub231pd(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub231pd(zmmA, zmmB, zmmC);
+ a.vfnmsub231pd(zmmA, zmmB, anyptr_gpC);
+ a.vfnmsub231ps(xmmA, xmmB, xmmC);
+ a.vfnmsub231ps(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231ps(ymmA, ymmB, ymmC);
+ a.vfnmsub231ps(ymmA, ymmB, anyptr_gpC);
+ a.vfnmsub231ps(zmmA, zmmB, zmmC);
+ a.vfnmsub231ps(zmmA, zmmB, anyptr_gpC);
+ a.vfnmsub231sd(xmmA, xmmB, xmmC);
+ a.vfnmsub231sd(xmmA, xmmB, anyptr_gpC);
+ a.vfnmsub231ss(xmmA, xmmB, xmmC);
+ a.vfnmsub231ss(xmmA, xmmB, anyptr_gpC);
+ a.vfpclasspd(kA, xmmB, 0);
+ a.vfpclasspd(kA, anyptr_gpB, 0);
+ a.vfpclasspd(kA, ymmB, 0);
+ a.vfpclasspd(kA, anyptr_gpB, 0);
+ a.vfpclasspd(kA, zmmB, 0);
+ a.vfpclasspd(kA, anyptr_gpB, 0);
+ a.vfpclassps(kA, xmmB, 0);
+ a.vfpclassps(kA, anyptr_gpB, 0);
+ a.vfpclassps(kA, ymmB, 0);
+ a.vfpclassps(kA, anyptr_gpB, 0);
+ a.vfpclassps(kA, zmmB, 0);
+ a.vfpclassps(kA, anyptr_gpB, 0);
+ a.vfpclasssd(kA, xmmB, 0);
+ a.vfpclasssd(kA, anyptr_gpB, 0);
+ a.vfpclassss(kA, xmmB, 0);
+ a.vfpclassss(kA, anyptr_gpB, 0);
+ a.vgatherdpd(xmmA, vx_ptr);
+ a.vgatherdpd(ymmA, vy_ptr);
+ a.vgatherdpd(zmmA, vz_ptr);
+ a.vgatherdps(xmmA, vx_ptr);
+ a.vgatherdps(ymmA, vy_ptr);
+ a.vgatherdps(zmmA, vz_ptr);
+ a.vgatherpf0dpd(vy_ptr);
+ a.vgatherpf0dps(vz_ptr);
+ a.vgatherpf0qpd(vz_ptr);
+ a.vgatherpf0qps(vz_ptr);
+ a.vgatherpf1dpd(vy_ptr);
+ a.vgatherpf1dps(vz_ptr);
+ a.vgatherpf1qpd(vz_ptr);
+ a.vgatherpf1qps(vz_ptr);
+ a.vgatherqpd(xmmA, vx_ptr);
+ a.vgatherqpd(ymmA, vy_ptr);
+ a.vgatherqpd(zmmA, vz_ptr);
+ a.vgatherqps(xmmA, vx_ptr);
+ a.vgatherqps(ymmA, vy_ptr);
+ a.vgatherqps(zmmA, vz_ptr);
+ a.vgetexppd(xmmA, xmmB);
+ a.vgetexppd(xmmA, anyptr_gpB);
+ a.vgetexppd(ymmA, ymmB);
+ a.vgetexppd(ymmA, anyptr_gpB);
+ a.vgetexppd(zmmA, zmmB);
+ a.vgetexppd(zmmA, anyptr_gpB);
+ a.vgetexpps(xmmA, xmmB);
+ a.vgetexpps(xmmA, anyptr_gpB);
+ a.vgetexpps(ymmA, ymmB);
+ a.vgetexpps(ymmA, anyptr_gpB);
+ a.vgetexpps(zmmA, zmmB);
+ a.vgetexpps(zmmA, anyptr_gpB);
+ a.vgetexpsd(xmmA, xmmB);
+ a.vgetexpsd(xmmA, anyptr_gpB);
+ a.vgetexpss(xmmA, xmmB);
+ a.vgetexpss(xmmA, anyptr_gpB);
+ a.vgetmantpd(xmmA, xmmB, 0);
+ a.vgetmantpd(xmmA, anyptr_gpB, 0);
+ a.vgetmantpd(ymmA, ymmB, 0);
+ a.vgetmantpd(ymmA, anyptr_gpB, 0);
+ a.vgetmantpd(zmmA, zmmB, 0);
+ a.vgetmantpd(zmmA, anyptr_gpB, 0);
+ a.vgetmantps(xmmA, xmmB, 0);
+ a.vgetmantps(xmmA, anyptr_gpB, 0);
+ a.vgetmantps(ymmA, ymmB, 0);
+ a.vgetmantps(ymmA, anyptr_gpB, 0);
+ a.vgetmantps(zmmA, zmmB, 0);
+ a.vgetmantps(zmmA, anyptr_gpB, 0);
+ a.vgetmantsd(xmmA, xmmB, 0);
+ a.vgetmantsd(xmmA, anyptr_gpB, 0);
+ a.vgetmantss(xmmA, xmmB, 0);
+ a.vgetmantss(xmmA, anyptr_gpB, 0);
+ a.vinsertf32x4(ymmA, ymmB, xmmC, 0);
+ a.vinsertf32x4(ymmA, ymmB, anyptr_gpC, 0);
+ a.vinsertf32x4(zmmA, zmmB, xmmC, 0);
+ a.vinsertf32x4(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinsertf32x8(zmmA, zmmB, ymmC, 0);
+ a.vinsertf32x8(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinsertf64x2(ymmA, ymmB, xmmC, 0);
+ a.vinsertf64x2(ymmA, ymmB, anyptr_gpC, 0);
+ a.vinsertf64x2(zmmA, zmmB, xmmC, 0);
+ a.vinsertf64x2(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinsertf64x4(zmmA, zmmB, ymmC, 0);
+ a.vinsertf64x4(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinserti32x4(ymmA, ymmB, xmmC, 0);
+ a.vinserti32x4(ymmA, ymmB, anyptr_gpC, 0);
+ a.vinserti32x4(zmmA, zmmB, xmmC, 0);
+ a.vinserti32x4(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinserti32x8(zmmA, zmmB, ymmC, 0);
+ a.vinserti32x8(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinserti64x2(ymmA, ymmB, xmmC, 0);
+ a.vinserti64x2(ymmA, ymmB, anyptr_gpC, 0);
+ a.vinserti64x2(zmmA, zmmB, xmmC, 0);
+ a.vinserti64x2(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinserti64x4(zmmA, zmmB, ymmC, 0);
+ a.vinserti64x4(zmmA, zmmB, anyptr_gpC, 0);
+ a.vinsertps(xmmA, xmmB, xmmC, 0);
+ a.vinsertps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vmaxpd(xmmA, xmmB, xmmC);
+ a.vmaxpd(xmmA, xmmB, anyptr_gpC);
+ a.vmaxpd(ymmA, ymmB, ymmC);
+ a.vmaxpd(ymmA, ymmB, anyptr_gpC);
+ a.vmaxpd(zmmA, zmmB, zmmC);
+ a.vmaxpd(zmmA, zmmB, anyptr_gpC);
+ a.vmaxps(xmmA, xmmB, xmmC);
+ a.vmaxps(xmmA, xmmB, anyptr_gpC);
+ a.vmaxps(ymmA, ymmB, ymmC);
+ a.vmaxps(ymmA, ymmB, anyptr_gpC);
+ a.vmaxps(zmmA, zmmB, zmmC);
+ a.vmaxps(zmmA, zmmB, anyptr_gpC);
+ a.vmaxsd(xmmA, xmmB, xmmC);
+ a.vmaxsd(xmmA, xmmB, anyptr_gpC);
+ a.vmaxss(xmmA, xmmB, xmmC);
+ a.vmaxss(xmmA, xmmB, anyptr_gpC);
+ a.vminpd(xmmA, xmmB, xmmC);
+ a.vminpd(xmmA, xmmB, anyptr_gpC);
+ a.vminpd(ymmA, ymmB, ymmC);
+ a.vminpd(ymmA, ymmB, anyptr_gpC);
+ a.vminpd(zmmA, zmmB, zmmC);
+ a.vminpd(zmmA, zmmB, anyptr_gpC);
+ a.vminps(xmmA, xmmB, xmmC);
+ a.vminps(xmmA, xmmB, anyptr_gpC);
+ a.vminps(ymmA, ymmB, ymmC);
+ a.vminps(ymmA, ymmB, anyptr_gpC);
+ a.vminps(zmmA, zmmB, zmmC);
+ a.vminps(zmmA, zmmB, anyptr_gpC);
+ a.vminsd(xmmA, xmmB, xmmC);
+ a.vminsd(xmmA, xmmB, anyptr_gpC);
+ a.vminss(xmmA, xmmB, xmmC);
+ a.vminss(xmmA, xmmB, anyptr_gpC);
+ a.vmovapd(xmmA, xmmB);
+ a.vmovapd(xmmA, anyptr_gpB);
+ a.vmovapd(xmmA, xmmB);
+ a.vmovapd(anyptr_gpA, xmmB);
+ a.vmovapd(ymmA, ymmB);
+ a.vmovapd(ymmA, anyptr_gpB);
+ a.vmovapd(ymmA, ymmB);
+ a.vmovapd(anyptr_gpA, ymmB);
+ a.vmovapd(zmmA, zmmB);
+ a.vmovapd(zmmA, anyptr_gpB);
+ a.vmovapd(zmmA, zmmB);
+ a.vmovapd(anyptr_gpA, zmmB);
+ a.vmovaps(xmmA, xmmB);
+ a.vmovaps(xmmA, anyptr_gpB);
+ a.vmovaps(xmmA, xmmB);
+ a.vmovaps(anyptr_gpA, xmmB);
+ a.vmovaps(ymmA, ymmB);
+ a.vmovaps(ymmA, anyptr_gpB);
+ a.vmovaps(ymmA, ymmB);
+ a.vmovaps(anyptr_gpA, ymmB);
+ a.vmovaps(zmmA, zmmB);
+ a.vmovaps(zmmA, anyptr_gpB);
+ a.vmovaps(zmmA, zmmB);
+ a.vmovaps(anyptr_gpA, zmmB);
+ a.vmovd(gdA, xmmB);
+ a.vmovd(gzA, xmmB);
+ a.vmovd(anyptr_gpA, xmmB);
+ a.vmovd(xmmA, gdB);
+ a.vmovd(xmmA, gzB);
+ a.vmovd(xmmA, anyptr_gpB);
+ a.vmovddup(xmmA, xmmB);
+ a.vmovddup(xmmA, anyptr_gpB);
+ a.vmovddup(ymmA, ymmB);
+ a.vmovddup(ymmA, anyptr_gpB);
+ a.vmovddup(zmmA, zmmB);
+ a.vmovddup(zmmA, anyptr_gpB);
+ a.vmovdqa32(xmmA, xmmB);
+ a.vmovdqa32(xmmA, anyptr_gpB);
+ a.vmovdqa32(xmmA, xmmB);
+ a.vmovdqa32(anyptr_gpA, xmmB);
+ a.vmovdqa32(ymmA, ymmB);
+ a.vmovdqa32(ymmA, anyptr_gpB);
+ a.vmovdqa32(ymmA, ymmB);
+ a.vmovdqa32(anyptr_gpA, ymmB);
+ a.vmovdqa32(zmmA, zmmB);
+ a.vmovdqa32(zmmA, anyptr_gpB);
+ a.vmovdqa32(zmmA, zmmB);
+ a.vmovdqa32(anyptr_gpA, zmmB);
+ a.vmovdqa64(xmmA, xmmB);
+ a.vmovdqa64(xmmA, anyptr_gpB);
+ a.vmovdqa64(xmmA, xmmB);
+ a.vmovdqa64(anyptr_gpA, xmmB);
+ a.vmovdqa64(ymmA, ymmB);
+ a.vmovdqa64(ymmA, anyptr_gpB);
+ a.vmovdqa64(ymmA, ymmB);
+ a.vmovdqa64(anyptr_gpA, ymmB);
+ a.vmovdqa64(zmmA, zmmB);
+ a.vmovdqa64(zmmA, anyptr_gpB);
+ a.vmovdqa64(zmmA, zmmB);
+ a.vmovdqa64(anyptr_gpA, zmmB);
+ a.vmovdqu16(xmmA, xmmB);
+ a.vmovdqu16(xmmA, anyptr_gpB);
+ a.vmovdqu16(xmmA, xmmB);
+ a.vmovdqu16(anyptr_gpA, xmmB);
+ a.vmovdqu16(ymmA, ymmB);
+ a.vmovdqu16(ymmA, anyptr_gpB);
+ a.vmovdqu16(ymmA, ymmB);
+ a.vmovdqu16(anyptr_gpA, ymmB);
+ a.vmovdqu16(zmmA, zmmB);
+ a.vmovdqu16(zmmA, anyptr_gpB);
+ a.vmovdqu16(zmmA, zmmB);
+ a.vmovdqu16(anyptr_gpA, zmmB);
+ a.vmovdqu32(xmmA, xmmB);
+ a.vmovdqu32(xmmA, anyptr_gpB);
+ a.vmovdqu32(xmmA, xmmB);
+ a.vmovdqu32(anyptr_gpA, xmmB);
+ a.vmovdqu32(ymmA, ymmB);
+ a.vmovdqu32(ymmA, anyptr_gpB);
+ a.vmovdqu32(ymmA, ymmB);
+ a.vmovdqu32(anyptr_gpA, ymmB);
+ a.vmovdqu32(zmmA, zmmB);
+ a.vmovdqu32(zmmA, anyptr_gpB);
+ a.vmovdqu32(zmmA, zmmB);
+ a.vmovdqu32(anyptr_gpA, zmmB);
+ a.vmovdqu64(xmmA, xmmB);
+ a.vmovdqu64(xmmA, anyptr_gpB);
+ a.vmovdqu64(xmmA, xmmB);
+ a.vmovdqu64(anyptr_gpA, xmmB);
+ a.vmovdqu64(ymmA, ymmB);
+ a.vmovdqu64(ymmA, anyptr_gpB);
+ a.vmovdqu64(ymmA, ymmB);
+ a.vmovdqu64(anyptr_gpA, ymmB);
+ a.vmovdqu64(zmmA, zmmB);
+ a.vmovdqu64(zmmA, anyptr_gpB);
+ a.vmovdqu64(zmmA, zmmB);
+ a.vmovdqu64(anyptr_gpA, zmmB);
+ a.vmovdqu8(xmmA, xmmB);
+ a.vmovdqu8(xmmA, anyptr_gpB);
+ a.vmovdqu8(xmmA, xmmB);
+ a.vmovdqu8(anyptr_gpA, xmmB);
+ a.vmovdqu8(ymmA, ymmB);
+ a.vmovdqu8(ymmA, anyptr_gpB);
+ a.vmovdqu8(ymmA, ymmB);
+ a.vmovdqu8(anyptr_gpA, ymmB);
+ a.vmovdqu8(zmmA, zmmB);
+ a.vmovdqu8(zmmA, anyptr_gpB);
+ a.vmovdqu8(zmmA, zmmB);
+ a.vmovdqu8(anyptr_gpA, zmmB);
+ a.vmovhlps(xmmA, xmmB, xmmC);
+ a.vmovhpd(anyptr_gpA, xmmB);
+ a.vmovhpd(xmmA, xmmB, anyptr_gpC);
+ a.vmovhps(anyptr_gpA, xmmB);
+ a.vmovhps(xmmA, xmmB, anyptr_gpC);
+ a.vmovlhps(xmmA, xmmB, xmmC);
+ a.vmovlpd(anyptr_gpA, xmmB);
+ a.vmovlpd(xmmA, xmmB, anyptr_gpC);
+ a.vmovlps(anyptr_gpA, xmmB);
+ a.vmovlps(xmmA, xmmB, anyptr_gpC);
+ a.vmovntdq(anyptr_gpA, xmmB);
+ a.vmovntdq(anyptr_gpA, ymmB);
+ a.vmovntdq(anyptr_gpA, zmmB);
+ a.vmovntdqa(xmmA, anyptr_gpB);
+ a.vmovntdqa(ymmA, anyptr_gpB);
+ a.vmovntdqa(zmmA, anyptr_gpB);
+ a.vmovntpd(anyptr_gpA, xmmB);
+ a.vmovntpd(anyptr_gpA, ymmB);
+ a.vmovntpd(anyptr_gpA, zmmB);
+ a.vmovntps(anyptr_gpA, xmmB);
+ a.vmovntps(anyptr_gpA, ymmB);
+ a.vmovntps(anyptr_gpA, zmmB);
+ if (isX64) a.vmovq(gzA, xmmB);
+ if (isX64) a.vmovq(xmmA, gzB);
+ a.vmovq(anyptr_gpA, xmmB);
+ a.vmovq(xmmA, anyptr_gpB);
+ a.vmovq(xmmA, xmmB);
+ a.vmovq(xmmA, anyptr_gpB);
+ a.vmovq(xmmA, xmmB);
+ a.vmovq(anyptr_gpA, xmmB);
+ a.vmovsd(anyptr_gpA, xmmB);
+ a.vmovsd(xmmA, anyptr_gpB);
+ a.vmovsd(xmmA, xmmB, xmmC);
+ a.vmovsd(xmmA, xmmB, xmmC);
+ a.vmovshdup(xmmA, xmmB);
+ a.vmovshdup(xmmA, anyptr_gpB);
+ a.vmovshdup(ymmA, ymmB);
+ a.vmovshdup(ymmA, anyptr_gpB);
+ a.vmovshdup(zmmA, zmmB);
+ a.vmovshdup(zmmA, anyptr_gpB);
+ a.vmovsldup(xmmA, xmmB);
+ a.vmovsldup(xmmA, anyptr_gpB);
+ a.vmovsldup(ymmA, ymmB);
+ a.vmovsldup(ymmA, anyptr_gpB);
+ a.vmovsldup(zmmA, zmmB);
+ a.vmovsldup(zmmA, anyptr_gpB);
+ a.vmovss(anyptr_gpA, xmmB);
+ a.vmovss(xmmA, anyptr_gpB);
+ a.vmovss(xmmA, xmmB, xmmC);
+ a.vmovss(xmmA, xmmB, xmmC);
+ a.vmovupd(xmmA, xmmB);
+ a.vmovupd(xmmA, anyptr_gpB);
+ a.vmovupd(xmmA, xmmB);
+ a.vmovupd(anyptr_gpA, xmmB);
+ a.vmovupd(ymmA, ymmB);
+ a.vmovupd(ymmA, anyptr_gpB);
+ a.vmovupd(ymmA, ymmB);
+ a.vmovupd(anyptr_gpA, ymmB);
+ a.vmovupd(zmmA, zmmB);
+ a.vmovupd(zmmA, anyptr_gpB);
+ a.vmovupd(zmmA, zmmB);
+ a.vmovupd(anyptr_gpA, zmmB);
+ a.vmovups(xmmA, xmmB);
+ a.vmovups(xmmA, anyptr_gpB);
+ a.vmovups(xmmA, xmmB);
+ a.vmovups(anyptr_gpA, xmmB);
+ a.vmovups(ymmA, ymmB);
+ a.vmovups(ymmA, anyptr_gpB);
+ a.vmovups(ymmA, ymmB);
+ a.vmovups(anyptr_gpA, ymmB);
+ a.vmovups(zmmA, zmmB);
+ a.vmovups(zmmA, anyptr_gpB);
+ a.vmovups(zmmA, zmmB);
+ a.vmovups(anyptr_gpA, zmmB);
+ a.vmulpd(xmmA, xmmB, xmmC);
+ a.vmulpd(xmmA, xmmB, anyptr_gpC);
+ a.vmulpd(ymmA, ymmB, ymmC);
+ a.vmulpd(ymmA, ymmB, anyptr_gpC);
+ a.vmulpd(zmmA, zmmB, zmmC);
+ a.vmulpd(zmmA, zmmB, anyptr_gpC);
+ a.vmulps(xmmA, xmmB, xmmC);
+ a.vmulps(xmmA, xmmB, anyptr_gpC);
+ a.vmulps(ymmA, ymmB, ymmC);
+ a.vmulps(ymmA, ymmB, anyptr_gpC);
+ a.vmulps(zmmA, zmmB, zmmC);
+ a.vmulps(zmmA, zmmB, anyptr_gpC);
+ a.vmulsd(xmmA, xmmB, xmmC);
+ a.vmulsd(xmmA, xmmB, anyptr_gpC);
+ a.vmulss(xmmA, xmmB, xmmC);
+ a.vmulss(xmmA, xmmB, anyptr_gpC);
+ a.vorpd(xmmA, xmmB, xmmC);
+ a.vorpd(xmmA, xmmB, anyptr_gpC);
+ a.vorpd(ymmA, ymmB, ymmC);
+ a.vorpd(ymmA, ymmB, anyptr_gpC);
+ a.vorpd(zmmA, zmmB, zmmC);
+ a.vorpd(zmmA, zmmB, anyptr_gpC);
+ a.vorps(xmmA, xmmB, xmmC);
+ a.vorps(xmmA, xmmB, anyptr_gpC);
+ a.vorps(ymmA, ymmB, ymmC);
+ a.vorps(ymmA, ymmB, anyptr_gpC);
+ a.vorps(zmmA, zmmB, zmmC);
+ a.vorps(zmmA, zmmB, anyptr_gpC);
+ a.vpabsb(xmmA, xmmB);
+ a.vpabsb(xmmA, anyptr_gpB);
+ a.vpabsb(ymmA, ymmB);
+ a.vpabsb(ymmA, anyptr_gpB);
+ a.vpabsb(zmmA, zmmB);
+ a.vpabsb(zmmA, anyptr_gpB);
+ a.vpabsd(xmmA, xmmB);
+ a.vpabsd(xmmA, anyptr_gpB);
+ a.vpabsd(ymmA, ymmB);
+ a.vpabsd(ymmA, anyptr_gpB);
+ a.vpabsd(zmmA, zmmB);
+ a.vpabsd(zmmA, anyptr_gpB);
+ a.vpabsq(xmmA, xmmB);
+ a.vpabsq(xmmA, anyptr_gpB);
+ a.vpabsq(ymmA, ymmB);
+ a.vpabsq(ymmA, anyptr_gpB);
+ a.vpabsq(zmmA, zmmB);
+ a.vpabsq(zmmA, anyptr_gpB);
+ a.vpabsw(xmmA, xmmB);
+ a.vpabsw(xmmA, anyptr_gpB);
+ a.vpabsw(ymmA, ymmB);
+ a.vpabsw(ymmA, anyptr_gpB);
+ a.vpabsw(zmmA, zmmB);
+ a.vpabsw(zmmA, anyptr_gpB);
+ a.vpackssdw(xmmA, xmmB, xmmC);
+ a.vpackssdw(xmmA, xmmB, anyptr_gpC);
+ a.vpackssdw(ymmA, ymmB, ymmC);
+ a.vpackssdw(ymmA, ymmB, anyptr_gpC);
+ a.vpackssdw(zmmA, zmmB, zmmC);
+ a.vpackssdw(zmmA, zmmB, anyptr_gpC);
+ a.vpacksswb(xmmA, xmmB, xmmC);
+ a.vpacksswb(xmmA, xmmB, anyptr_gpC);
+ a.vpacksswb(ymmA, ymmB, ymmC);
+ a.vpacksswb(ymmA, ymmB, anyptr_gpC);
+ a.vpacksswb(zmmA, zmmB, zmmC);
+ a.vpacksswb(zmmA, zmmB, anyptr_gpC);
+ a.vpackusdw(xmmA, xmmB, xmmC);
+ a.vpackusdw(xmmA, xmmB, anyptr_gpC);
+ a.vpackusdw(ymmA, ymmB, ymmC);
+ a.vpackusdw(ymmA, ymmB, anyptr_gpC);
+ a.vpackusdw(zmmA, zmmB, zmmC);
+ a.vpackusdw(zmmA, zmmB, anyptr_gpC);
+ a.vpackuswb(xmmA, xmmB, xmmC);
+ a.vpackuswb(xmmA, xmmB, anyptr_gpC);
+ a.vpackuswb(ymmA, ymmB, ymmC);
+ a.vpackuswb(ymmA, ymmB, anyptr_gpC);
+ a.vpackuswb(zmmA, zmmB, zmmC);
+ a.vpackuswb(zmmA, zmmB, anyptr_gpC);
+ a.vpaddb(xmmA, xmmB, xmmC);
+ a.vpaddb(xmmA, xmmB, anyptr_gpC);
+ a.vpaddb(ymmA, ymmB, ymmC);
+ a.vpaddb(ymmA, ymmB, anyptr_gpC);
+ a.vpaddb(zmmA, zmmB, zmmC);
+ a.vpaddb(zmmA, zmmB, anyptr_gpC);
+ a.vpaddd(xmmA, xmmB, xmmC);
+ a.vpaddd(xmmA, xmmB, anyptr_gpC);
+ a.vpaddd(ymmA, ymmB, ymmC);
+ a.vpaddd(ymmA, ymmB, anyptr_gpC);
+ a.vpaddd(zmmA, zmmB, zmmC);
+ a.vpaddd(zmmA, zmmB, anyptr_gpC);
+ a.vpaddq(xmmA, xmmB, xmmC);
+ a.vpaddq(xmmA, xmmB, anyptr_gpC);
+ a.vpaddq(ymmA, ymmB, ymmC);
+ a.vpaddq(ymmA, ymmB, anyptr_gpC);
+ a.vpaddq(zmmA, zmmB, zmmC);
+ a.vpaddq(zmmA, zmmB, anyptr_gpC);
+ a.vpaddsb(xmmA, xmmB, xmmC);
+ a.vpaddsb(xmmA, xmmB, anyptr_gpC);
+ a.vpaddsb(ymmA, ymmB, ymmC);
+ a.vpaddsb(ymmA, ymmB, anyptr_gpC);
+ a.vpaddsb(zmmA, zmmB, zmmC);
+ a.vpaddsb(zmmA, zmmB, anyptr_gpC);
+ a.vpaddsw(xmmA, xmmB, xmmC);
+ a.vpaddsw(xmmA, xmmB, anyptr_gpC);
+ a.vpaddsw(ymmA, ymmB, ymmC);
+ a.vpaddsw(ymmA, ymmB, anyptr_gpC);
+ a.vpaddsw(zmmA, zmmB, zmmC);
+ a.vpaddsw(zmmA, zmmB, anyptr_gpC);
+ a.vpaddusb(xmmA, xmmB, xmmC);
+ a.vpaddusb(xmmA, xmmB, anyptr_gpC);
+ a.vpaddusb(ymmA, ymmB, ymmC);
+ a.vpaddusb(ymmA, ymmB, anyptr_gpC);
+ a.vpaddusb(zmmA, zmmB, zmmC);
+ a.vpaddusb(zmmA, zmmB, anyptr_gpC);
+ a.vpaddusw(xmmA, xmmB, xmmC);
+ a.vpaddusw(xmmA, xmmB, anyptr_gpC);
+ a.vpaddusw(ymmA, ymmB, ymmC);
+ a.vpaddusw(ymmA, ymmB, anyptr_gpC);
+ a.vpaddusw(zmmA, zmmB, zmmC);
+ a.vpaddusw(zmmA, zmmB, anyptr_gpC);
+ a.vpaddw(xmmA, xmmB, xmmC);
+ a.vpaddw(xmmA, xmmB, anyptr_gpC);
+ a.vpaddw(ymmA, ymmB, ymmC);
+ a.vpaddw(ymmA, ymmB, anyptr_gpC);
+ a.vpaddw(zmmA, zmmB, zmmC);
+ a.vpaddw(zmmA, zmmB, anyptr_gpC);
+ a.vpalignr(xmmA, xmmB, xmmC, 0);
+ a.vpalignr(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpalignr(ymmA, ymmB, ymmC, 0);
+ a.vpalignr(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpalignr(zmmA, zmmB, zmmC, 0);
+ a.vpalignr(zmmA, zmmB, anyptr_gpC, 0);
+ a.vpandd(xmmA, xmmB, xmmC);
+ a.vpandd(xmmA, xmmB, anyptr_gpC);
+ a.vpandd(ymmA, ymmB, ymmC);
+ a.vpandd(ymmA, ymmB, anyptr_gpC);
+ a.vpandd(zmmA, zmmB, zmmC);
+ a.vpandd(zmmA, zmmB, anyptr_gpC);
+ a.vpandnd(xmmA, xmmB, xmmC);
+ a.vpandnd(xmmA, xmmB, anyptr_gpC);
+ a.vpandnd(ymmA, ymmB, ymmC);
+ a.vpandnd(ymmA, ymmB, anyptr_gpC);
+ a.vpandnd(zmmA, zmmB, zmmC);
+ a.vpandnd(zmmA, zmmB, anyptr_gpC);
+ a.vpandnq(xmmA, xmmB, xmmC);
+ a.vpandnq(xmmA, xmmB, anyptr_gpC);
+ a.vpandnq(ymmA, ymmB, ymmC);
+ a.vpandnq(ymmA, ymmB, anyptr_gpC);
+ a.vpandnq(zmmA, zmmB, zmmC);
+ a.vpandnq(zmmA, zmmB, anyptr_gpC);
+ a.vpandq(xmmA, xmmB, xmmC);
+ a.vpandq(xmmA, xmmB, anyptr_gpC);
+ a.vpandq(ymmA, ymmB, ymmC);
+ a.vpandq(ymmA, ymmB, anyptr_gpC);
+ a.vpandq(zmmA, zmmB, zmmC);
+ a.vpandq(zmmA, zmmB, anyptr_gpC);
+ a.vpavgb(xmmA, xmmB, xmmC);
+ a.vpavgb(xmmA, xmmB, anyptr_gpC);
+ a.vpavgb(ymmA, ymmB, ymmC);
+ a.vpavgb(ymmA, ymmB, anyptr_gpC);
+ a.vpavgb(zmmA, zmmB, zmmC);
+ a.vpavgb(zmmA, zmmB, anyptr_gpC);
+ a.vpavgw(xmmA, xmmB, xmmC);
+ a.vpavgw(xmmA, xmmB, anyptr_gpC);
+ a.vpavgw(ymmA, ymmB, ymmC);
+ a.vpavgw(ymmA, ymmB, anyptr_gpC);
+ a.vpavgw(zmmA, zmmB, zmmC);
+ a.vpavgw(zmmA, zmmB, anyptr_gpC);
+ a.vpbroadcastb(xmmA, gdB);
+ a.vpbroadcastb(xmmA, gzB);
+ a.vpbroadcastb(xmmA, xmmB);
+ a.vpbroadcastb(xmmA, anyptr_gpB);
+ a.vpbroadcastb(ymmA, gdB);
+ a.vpbroadcastb(ymmA, gzB);
+ a.vpbroadcastb(ymmA, xmmB);
+ a.vpbroadcastb(ymmA, anyptr_gpB);
+ a.vpbroadcastb(zmmA, gdB);
+ a.vpbroadcastb(zmmA, gzB);
+ a.vpbroadcastb(zmmA, xmmB);
+ a.vpbroadcastb(zmmA, anyptr_gpB);
+ a.vpbroadcastd(xmmA, gdB);
+ a.vpbroadcastd(xmmA, gzB);
+ a.vpbroadcastd(xmmA, xmmB);
+ a.vpbroadcastd(xmmA, anyptr_gpB);
+ a.vpbroadcastd(ymmA, gdB);
+ a.vpbroadcastd(ymmA, gzB);
+ a.vpbroadcastd(ymmA, xmmB);
+ a.vpbroadcastd(ymmA, anyptr_gpB);
+ a.vpbroadcastd(zmmA, gdB);
+ a.vpbroadcastd(zmmA, gzB);
+ a.vpbroadcastd(zmmA, xmmB);
+ a.vpbroadcastd(zmmA, anyptr_gpB);
+ a.vpbroadcastmb2d(xmmA, kB);
+ a.vpbroadcastmb2d(ymmA, kB);
+ a.vpbroadcastmb2d(zmmA, kB);
+ a.vpbroadcastmb2q(xmmA, kB);
+ a.vpbroadcastmb2q(ymmA, kB);
+ a.vpbroadcastmb2q(zmmA, kB);
+ if (isX64) a.vpbroadcastq(xmmA, gzB);
+ a.vpbroadcastq(xmmA, xmmB);
+ a.vpbroadcastq(xmmA, anyptr_gpB);
+ if (isX64) a.vpbroadcastq(ymmA, gzB);
+ a.vpbroadcastq(ymmA, xmmB);
+ a.vpbroadcastq(ymmA, anyptr_gpB);
+ if (isX64) a.vpbroadcastq(zmmA, gzB);
+ a.vpbroadcastq(zmmA, xmmB);
+ a.vpbroadcastq(zmmA, anyptr_gpB);
+ a.vpbroadcastw(xmmA, gdB);
+ a.vpbroadcastw(xmmA, gzB);
+ a.vpbroadcastw(xmmA, xmmB);
+ a.vpbroadcastw(xmmA, anyptr_gpB);
+ a.vpbroadcastw(ymmA, gdB);
+ a.vpbroadcastw(ymmA, gzB);
+ a.vpbroadcastw(ymmA, xmmB);
+ a.vpbroadcastw(ymmA, anyptr_gpB);
+ a.vpbroadcastw(zmmA, gdB);
+ a.vpbroadcastw(zmmA, gzB);
+ a.vpbroadcastw(zmmA, xmmB);
+ a.vpbroadcastw(zmmA, anyptr_gpB);
+ a.vpcmpb(kA, xmmB, xmmC, 0);
+ a.vpcmpb(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpb(kA, ymmB, ymmC, 0);
+ a.vpcmpb(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpb(kA, zmmB, zmmC, 0);
+ a.vpcmpb(kA, zmmB, anyptr_gpC, 0);
+ a.vpcmpd(kA, xmmB, xmmC, 0);
+ a.vpcmpd(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpd(kA, ymmB, ymmC, 0);
+ a.vpcmpd(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpd(kA, zmmB, zmmC, 0);
+ a.vpcmpd(kA, zmmB, anyptr_gpC, 0);
+ a.vpcmpeqb(kA, xmmB, xmmC);
+ a.vpcmpeqb(kA, xmmB, anyptr_gpC);
+ a.vpcmpeqb(kA, ymmB, ymmC);
+ a.vpcmpeqb(kA, ymmB, anyptr_gpC);
+ a.vpcmpeqb(kA, zmmB, zmmC);
+ a.vpcmpeqb(kA, zmmB, anyptr_gpC);
+ a.vpcmpeqd(kA, xmmB, xmmC);
+ a.vpcmpeqd(kA, xmmB, anyptr_gpC);
+ a.vpcmpeqd(kA, ymmB, ymmC);
+ a.vpcmpeqd(kA, ymmB, anyptr_gpC);
+ a.vpcmpeqd(kA, zmmB, zmmC);
+ a.vpcmpeqd(kA, zmmB, anyptr_gpC);
+ a.vpcmpeqq(kA, xmmB, xmmC);
+ a.vpcmpeqq(kA, xmmB, anyptr_gpC);
+ a.vpcmpeqq(kA, ymmB, ymmC);
+ a.vpcmpeqq(kA, ymmB, anyptr_gpC);
+ a.vpcmpeqq(kA, zmmB, zmmC);
+ a.vpcmpeqq(kA, zmmB, anyptr_gpC);
+ a.vpcmpeqw(kA, xmmB, xmmC);
+ a.vpcmpeqw(kA, xmmB, anyptr_gpC);
+ a.vpcmpeqw(kA, ymmB, ymmC);
+ a.vpcmpeqw(kA, ymmB, anyptr_gpC);
+ a.vpcmpeqw(kA, zmmB, zmmC);
+ a.vpcmpeqw(kA, zmmB, anyptr_gpC);
+ a.vpcmpgtb(kA, xmmB, xmmC);
+ a.vpcmpgtb(kA, xmmB, anyptr_gpC);
+ a.vpcmpgtb(kA, ymmB, ymmC);
+ a.vpcmpgtb(kA, ymmB, anyptr_gpC);
+ a.vpcmpgtb(kA, zmmB, zmmC);
+ a.vpcmpgtb(kA, zmmB, anyptr_gpC);
+ a.vpcmpgtd(kA, xmmB, xmmC);
+ a.vpcmpgtd(kA, xmmB, anyptr_gpC);
+ a.vpcmpgtd(kA, ymmB, ymmC);
+ a.vpcmpgtd(kA, ymmB, anyptr_gpC);
+ a.vpcmpgtd(kA, zmmB, zmmC);
+ a.vpcmpgtd(kA, zmmB, anyptr_gpC);
+ a.vpcmpgtq(kA, xmmB, xmmC);
+ a.vpcmpgtq(kA, xmmB, anyptr_gpC);
+ a.vpcmpgtq(kA, ymmB, ymmC);
+ a.vpcmpgtq(kA, ymmB, anyptr_gpC);
+ a.vpcmpgtq(kA, zmmB, zmmC);
+ a.vpcmpgtq(kA, zmmB, anyptr_gpC);
+ a.vpcmpgtw(kA, xmmB, xmmC);
+ a.vpcmpgtw(kA, xmmB, anyptr_gpC);
+ a.vpcmpgtw(kA, ymmB, ymmC);
+ a.vpcmpgtw(kA, ymmB, anyptr_gpC);
+ a.vpcmpgtw(kA, zmmB, zmmC);
+ a.vpcmpgtw(kA, zmmB, anyptr_gpC);
+ a.vpcmpq(kA, xmmB, xmmC, 0);
+ a.vpcmpq(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpq(kA, ymmB, ymmC, 0);
+ a.vpcmpq(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpq(kA, zmmB, zmmC, 0);
+ a.vpcmpq(kA, zmmB, anyptr_gpC, 0);
+ a.vpcmpub(kA, xmmB, xmmC, 0);
+ a.vpcmpub(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpub(kA, ymmB, ymmC, 0);
+ a.vpcmpub(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpub(kA, zmmB, zmmC, 0);
+ a.vpcmpub(kA, zmmB, anyptr_gpC, 0);
+ a.vpcmpud(kA, xmmB, xmmC, 0);
+ a.vpcmpud(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpud(kA, ymmB, ymmC, 0);
+ a.vpcmpud(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpud(kA, zmmB, zmmC, 0);
+ a.vpcmpud(kA, zmmB, anyptr_gpC, 0);
+ a.vpcmpuq(kA, xmmB, xmmC, 0);
+ a.vpcmpuq(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpuq(kA, ymmB, ymmC, 0);
+ a.vpcmpuq(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpuq(kA, zmmB, zmmC, 0);
+ a.vpcmpuq(kA, zmmB, anyptr_gpC, 0);
+ a.vpcmpuw(kA, xmmB, xmmC, 0);
+ a.vpcmpuw(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpuw(kA, ymmB, ymmC, 0);
+ a.vpcmpuw(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpuw(kA, zmmB, zmmC, 0);
+ a.vpcmpuw(kA, zmmB, anyptr_gpC, 0);
+ a.vpcmpw(kA, xmmB, xmmC, 0);
+ a.vpcmpw(kA, xmmB, anyptr_gpC, 0);
+ a.vpcmpw(kA, ymmB, ymmC, 0);
+ a.vpcmpw(kA, ymmB, anyptr_gpC, 0);
+ a.vpcmpw(kA, zmmB, zmmC, 0);
+ a.vpcmpw(kA, zmmB, anyptr_gpC, 0);
+ a.vpcompressd(xmmA, xmmB);
+ a.vpcompressd(anyptr_gpA, xmmB);
+ a.vpcompressd(ymmA, ymmB);
+ a.vpcompressd(anyptr_gpA, ymmB);
+ a.vpcompressd(zmmA, zmmB);
+ a.vpcompressd(anyptr_gpA, zmmB);
+ a.vpcompressq(xmmA, xmmB);
+ a.vpcompressq(anyptr_gpA, xmmB);
+ a.vpcompressq(ymmA, ymmB);
+ a.vpcompressq(anyptr_gpA, ymmB);
+ a.vpcompressq(zmmA, zmmB);
+ a.vpcompressq(anyptr_gpA, zmmB);
+ a.vpconflictd(xmmA, xmmB);
+ a.vpconflictd(xmmA, anyptr_gpB);
+ a.vpconflictd(ymmA, ymmB);
+ a.vpconflictd(ymmA, anyptr_gpB);
+ a.vpconflictd(zmmA, zmmB);
+ a.vpconflictd(zmmA, anyptr_gpB);
+ a.vpconflictq(xmmA, xmmB);
+ a.vpconflictq(xmmA, anyptr_gpB);
+ a.vpconflictq(ymmA, ymmB);
+ a.vpconflictq(ymmA, anyptr_gpB);
+ a.vpconflictq(zmmA, zmmB);
+ a.vpconflictq(zmmA, anyptr_gpB);
+ a.vpermb(xmmA, xmmB, xmmC);
+ a.vpermb(xmmA, xmmB, anyptr_gpC);
+ a.vpermb(ymmA, ymmB, ymmC);
+ a.vpermb(ymmA, ymmB, anyptr_gpC);
+ a.vpermb(zmmA, zmmB, zmmC);
+ a.vpermb(zmmA, zmmB, anyptr_gpC);
+ a.vpermd(ymmA, ymmB, ymmC);
+ a.vpermd(ymmA, ymmB, anyptr_gpC);
+ a.vpermd(zmmA, zmmB, zmmC);
+ a.vpermd(zmmA, zmmB, anyptr_gpC);
+ a.vpermi2b(xmmA, xmmB, xmmC);
+ a.vpermi2b(xmmA, xmmB, anyptr_gpC);
+ a.vpermi2b(ymmA, ymmB, ymmC);
+ a.vpermi2b(ymmA, ymmB, anyptr_gpC);
+ a.vpermi2b(zmmA, zmmB, zmmC);
+ a.vpermi2b(zmmA, zmmB, anyptr_gpC);
+ a.vpermi2d(xmmA, xmmB, xmmC);
+ a.vpermi2d(xmmA, xmmB, anyptr_gpC);
+ a.vpermi2d(ymmA, ymmB, ymmC);
+ a.vpermi2d(ymmA, ymmB, anyptr_gpC);
+ a.vpermi2d(zmmA, zmmB, zmmC);
+ a.vpermi2d(zmmA, zmmB, anyptr_gpC);
+ a.vpermi2pd(xmmA, xmmB, xmmC);
+ a.vpermi2pd(xmmA, xmmB, anyptr_gpC);
+ a.vpermi2pd(ymmA, ymmB, ymmC);
+ a.vpermi2pd(ymmA, ymmB, anyptr_gpC);
+ a.vpermi2pd(zmmA, zmmB, zmmC);
+ a.vpermi2pd(zmmA, zmmB, anyptr_gpC);
+ a.vpermi2ps(xmmA, xmmB, xmmC);
+ a.vpermi2ps(xmmA, xmmB, anyptr_gpC);
+ a.vpermi2ps(ymmA, ymmB, ymmC);
+ a.vpermi2ps(ymmA, ymmB, anyptr_gpC);
+ a.vpermi2ps(zmmA, zmmB, zmmC);
+ a.vpermi2ps(zmmA, zmmB, anyptr_gpC);
+ a.vpermi2q(xmmA, xmmB, xmmC);
+ a.vpermi2q(xmmA, xmmB, anyptr_gpC);
+ a.vpermi2q(ymmA, ymmB, ymmC);
+ a.vpermi2q(ymmA, ymmB, anyptr_gpC);
+ a.vpermi2q(zmmA, zmmB, zmmC);
+ a.vpermi2q(zmmA, zmmB, anyptr_gpC);
+ a.vpermi2w(xmmA, xmmB, xmmC);
+ a.vpermi2w(xmmA, xmmB, anyptr_gpC);
+ a.vpermi2w(ymmA, ymmB, ymmC);
+ a.vpermi2w(ymmA, ymmB, anyptr_gpC);
+ a.vpermi2w(zmmA, zmmB, zmmC);
+ a.vpermi2w(zmmA, zmmB, anyptr_gpC);
+ a.vpermilpd(xmmA, xmmB, xmmC);
+ a.vpermilpd(xmmA, xmmB, anyptr_gpC);
+ a.vpermilpd(ymmA, ymmB, ymmC);
+ a.vpermilpd(ymmA, ymmB, anyptr_gpC);
+ a.vpermilpd(zmmA, zmmB, zmmC);
+ a.vpermilpd(zmmA, zmmB, anyptr_gpC);
+ a.vpermilpd(xmmA, xmmB, 0);
+ a.vpermilpd(xmmA, anyptr_gpB, 0);
+ a.vpermilpd(ymmA, ymmB, 0);
+ a.vpermilpd(ymmA, anyptr_gpB, 0);
+ a.vpermilpd(zmmA, zmmB, 0);
+ a.vpermilpd(zmmA, anyptr_gpB, 0);
+ a.vpermilps(xmmA, xmmB, xmmC);
+ a.vpermilps(xmmA, xmmB, anyptr_gpC);
+ a.vpermilps(ymmA, ymmB, ymmC);
+ a.vpermilps(ymmA, ymmB, anyptr_gpC);
+ a.vpermilps(zmmA, zmmB, zmmC);
+ a.vpermilps(zmmA, zmmB, anyptr_gpC);
+ a.vpermilps(xmmA, xmmB, 0);
+ a.vpermilps(xmmA, anyptr_gpB, 0);
+ a.vpermilps(ymmA, ymmB, 0);
+ a.vpermilps(ymmA, anyptr_gpB, 0);
+ a.vpermilps(zmmA, zmmB, 0);
+ a.vpermilps(zmmA, anyptr_gpB, 0);
+ a.vpermq(ymmA, ymmB, ymmC);
+ a.vpermq(ymmA, ymmB, anyptr_gpC);
+ a.vpermq(zmmA, zmmB, zmmC);
+ a.vpermq(zmmA, zmmB, anyptr_gpC);
+ a.vpermq(ymmA, ymmB, 0);
+ a.vpermq(ymmA, anyptr_gpB, 0);
+ a.vpermq(zmmA, zmmB, 0);
+ a.vpermq(zmmA, anyptr_gpB, 0);
+ a.vpermt2b(xmmA, xmmB, xmmC);
+ a.vpermt2b(xmmA, xmmB, anyptr_gpC);
+ a.vpermt2b(ymmA, ymmB, ymmC);
+ a.vpermt2b(ymmA, ymmB, anyptr_gpC);
+ a.vpermt2b(zmmA, zmmB, zmmC);
+ a.vpermt2b(zmmA, zmmB, anyptr_gpC);
+ a.vpermt2d(xmmA, xmmB, xmmC);
+ a.vpermt2d(xmmA, xmmB, anyptr_gpC);
+ a.vpermt2d(ymmA, ymmB, ymmC);
+ a.vpermt2d(ymmA, ymmB, anyptr_gpC);
+ a.vpermt2d(zmmA, zmmB, zmmC);
+ a.vpermt2d(zmmA, zmmB, anyptr_gpC);
+ a.vpermt2pd(xmmA, xmmB, xmmC);
+ a.vpermt2pd(xmmA, xmmB, anyptr_gpC);
+ a.vpermt2pd(ymmA, ymmB, ymmC);
+ a.vpermt2pd(ymmA, ymmB, anyptr_gpC);
+ a.vpermt2pd(zmmA, zmmB, zmmC);
+ a.vpermt2pd(zmmA, zmmB, anyptr_gpC);
+ a.vpermt2ps(xmmA, xmmB, xmmC);
+ a.vpermt2ps(xmmA, xmmB, anyptr_gpC);
+ a.vpermt2ps(ymmA, ymmB, ymmC);
+ a.vpermt2ps(ymmA, ymmB, anyptr_gpC);
+ a.vpermt2ps(zmmA, zmmB, zmmC);
+ a.vpermt2ps(zmmA, zmmB, anyptr_gpC);
+ a.vpermt2q(xmmA, xmmB, xmmC);
+ a.vpermt2q(xmmA, xmmB, anyptr_gpC);
+ a.vpermt2q(ymmA, ymmB, ymmC);
+ a.vpermt2q(ymmA, ymmB, anyptr_gpC);
+ a.vpermt2q(zmmA, zmmB, zmmC);
+ a.vpermt2q(zmmA, zmmB, anyptr_gpC);
+ a.vpermt2w(xmmA, xmmB, xmmC);
+ a.vpermt2w(xmmA, xmmB, anyptr_gpC);
+ a.vpermt2w(ymmA, ymmB, ymmC);
+ a.vpermt2w(ymmA, ymmB, anyptr_gpC);
+ a.vpermt2w(zmmA, zmmB, zmmC);
+ a.vpermt2w(zmmA, zmmB, anyptr_gpC);
+ a.vpermw(xmmA, xmmB, xmmC);
+ a.vpermw(xmmA, xmmB, anyptr_gpC);
+ a.vpermw(ymmA, ymmB, ymmC);
+ a.vpermw(ymmA, ymmB, anyptr_gpC);
+ a.vpermw(zmmA, zmmB, zmmC);
+ a.vpermw(zmmA, zmmB, anyptr_gpC);
+ a.vpexpandd(xmmA, xmmB);
+ a.vpexpandd(xmmA, anyptr_gpB);
+ a.vpexpandd(ymmA, ymmB);
+ a.vpexpandd(ymmA, anyptr_gpB);
+ a.vpexpandd(zmmA, zmmB);
+ a.vpexpandd(zmmA, anyptr_gpB);
+ a.vpexpandq(xmmA, xmmB);
+ a.vpexpandq(xmmA, anyptr_gpB);
+ a.vpexpandq(ymmA, ymmB);
+ a.vpexpandq(ymmA, anyptr_gpB);
+ a.vpexpandq(zmmA, zmmB);
+ a.vpexpandq(zmmA, anyptr_gpB);
+ a.vpextrb(gdA, xmmB, 0);
+ a.vpextrb(anyptr_gpA, xmmB, 0);
+ a.vpextrb(gzA, xmmB, 0);
+ a.vpextrd(gdA, xmmB, 0);
+ a.vpextrd(anyptr_gpA, xmmB, 0);
+ if (isX64) a.vpextrd(gzA, xmmB, 0);
+ if (isX64) a.vpextrq(gzA, xmmB, 0);
+ a.vpextrq(anyptr_gpA, xmmB, 0);
+ a.vpextrw(gdA, xmmB, 0);
+ a.vpextrw(gzA, xmmB, 0);
+ a.vpextrw(gdA, xmmB, 0);
+ a.vpextrw(anyptr_gpA, xmmB, 0);
+ a.vpextrw(gzA, xmmB, 0);
+ a.vpgatherdd(xmmA, vx_ptr);
+ a.vpgatherdd(ymmA, vy_ptr);
+ a.vpgatherdd(zmmA, vz_ptr);
+ a.vpgatherdq(xmmA, vx_ptr);
+ a.vpgatherdq(ymmA, vy_ptr);
+ a.vpgatherdq(zmmA, vz_ptr);
+ a.vpgatherqd(xmmA, vx_ptr);
+ a.vpgatherqd(ymmA, vy_ptr);
+ a.vpgatherqd(zmmA, vz_ptr);
+ a.vpgatherqq(xmmA, vx_ptr);
+ a.vpgatherqq(ymmA, vy_ptr);
+ a.vpgatherqq(zmmA, vz_ptr);
+ a.vpinsrb(xmmA, xmmB, gdC, 0);
+ a.vpinsrb(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpinsrb(xmmA, xmmB, gzC, 0);
+ a.vpinsrd(xmmA, xmmB, gdC, 0);
+ a.vpinsrd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpinsrd(xmmA, xmmB, gzC, 0);
+ if (isX64) a.vpinsrq(xmmA, xmmB, gzC, 0);
+ a.vpinsrq(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpinsrw(xmmA, xmmB, gdC, 0);
+ a.vpinsrw(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpinsrw(xmmA, xmmB, gzC, 0);
+ a.vplzcntd(xmmA, xmmB);
+ a.vplzcntd(xmmA, anyptr_gpB);
+ a.vplzcntd(ymmA, ymmB);
+ a.vplzcntd(ymmA, anyptr_gpB);
+ a.vplzcntd(zmmA, zmmB);
+ a.vplzcntd(zmmA, anyptr_gpB);
+ a.vplzcntq(xmmA, xmmB);
+ a.vplzcntq(xmmA, anyptr_gpB);
+ a.vplzcntq(ymmA, ymmB);
+ a.vplzcntq(ymmA, anyptr_gpB);
+ a.vplzcntq(zmmA, zmmB);
+ a.vplzcntq(zmmA, anyptr_gpB);
+ a.vpmadd52huq(xmmA, xmmB, xmmC);
+ a.vpmadd52huq(xmmA, xmmB, anyptr_gpC);
+ a.vpmadd52huq(ymmA, ymmB, ymmC);
+ a.vpmadd52huq(ymmA, ymmB, anyptr_gpC);
+ a.vpmadd52huq(zmmA, zmmB, zmmC);
+ a.vpmadd52huq(zmmA, zmmB, anyptr_gpC);
+ a.vpmadd52luq(xmmA, xmmB, xmmC);
+ a.vpmadd52luq(xmmA, xmmB, anyptr_gpC);
+ a.vpmadd52luq(ymmA, ymmB, ymmC);
+ a.vpmadd52luq(ymmA, ymmB, anyptr_gpC);
+ a.vpmadd52luq(zmmA, zmmB, zmmC);
+ a.vpmadd52luq(zmmA, zmmB, anyptr_gpC);
+ a.vpmaddubsw(xmmA, xmmB, xmmC);
+ a.vpmaddubsw(xmmA, xmmB, anyptr_gpC);
+ a.vpmaddubsw(ymmA, ymmB, ymmC);
+ a.vpmaddubsw(ymmA, ymmB, anyptr_gpC);
+ a.vpmaddubsw(zmmA, zmmB, zmmC);
+ a.vpmaddubsw(zmmA, zmmB, anyptr_gpC);
+ a.vpmaddwd(xmmA, xmmB, xmmC);
+ a.vpmaddwd(xmmA, xmmB, anyptr_gpC);
+ a.vpmaddwd(ymmA, ymmB, ymmC);
+ a.vpmaddwd(ymmA, ymmB, anyptr_gpC);
+ a.vpmaddwd(zmmA, zmmB, zmmC);
+ a.vpmaddwd(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxsb(xmmA, xmmB, xmmC);
+ a.vpmaxsb(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxsb(ymmA, ymmB, ymmC);
+ a.vpmaxsb(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsb(zmmA, zmmB, zmmC);
+ a.vpmaxsb(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxsd(xmmA, xmmB, xmmC);
+ a.vpmaxsd(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxsd(ymmA, ymmB, ymmC);
+ a.vpmaxsd(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsd(zmmA, zmmB, zmmC);
+ a.vpmaxsd(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxsq(xmmA, xmmB, xmmC);
+ a.vpmaxsq(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxsq(ymmA, ymmB, ymmC);
+ a.vpmaxsq(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsq(zmmA, zmmB, zmmC);
+ a.vpmaxsq(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxsw(xmmA, xmmB, xmmC);
+ a.vpmaxsw(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxsw(ymmA, ymmB, ymmC);
+ a.vpmaxsw(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxsw(zmmA, zmmB, zmmC);
+ a.vpmaxsw(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxub(xmmA, xmmB, xmmC);
+ a.vpmaxub(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxub(ymmA, ymmB, ymmC);
+ a.vpmaxub(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxub(zmmA, zmmB, zmmC);
+ a.vpmaxub(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxud(xmmA, xmmB, xmmC);
+ a.vpmaxud(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxud(ymmA, ymmB, ymmC);
+ a.vpmaxud(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxud(zmmA, zmmB, zmmC);
+ a.vpmaxud(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxuq(xmmA, xmmB, xmmC);
+ a.vpmaxuq(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxuq(ymmA, ymmB, ymmC);
+ a.vpmaxuq(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxuq(zmmA, zmmB, zmmC);
+ a.vpmaxuq(zmmA, zmmB, anyptr_gpC);
+ a.vpmaxuw(xmmA, xmmB, xmmC);
+ a.vpmaxuw(xmmA, xmmB, anyptr_gpC);
+ a.vpmaxuw(ymmA, ymmB, ymmC);
+ a.vpmaxuw(ymmA, ymmB, anyptr_gpC);
+ a.vpmaxuw(zmmA, zmmB, zmmC);
+ a.vpmaxuw(zmmA, zmmB, anyptr_gpC);
+ a.vpminsb(xmmA, xmmB, xmmC);
+ a.vpminsb(xmmA, xmmB, anyptr_gpC);
+ a.vpminsb(ymmA, ymmB, ymmC);
+ a.vpminsb(ymmA, ymmB, anyptr_gpC);
+ a.vpminsb(zmmA, zmmB, zmmC);
+ a.vpminsb(zmmA, zmmB, anyptr_gpC);
+ a.vpminsd(xmmA, xmmB, xmmC);
+ a.vpminsd(xmmA, xmmB, anyptr_gpC);
+ a.vpminsd(ymmA, ymmB, ymmC);
+ a.vpminsd(ymmA, ymmB, anyptr_gpC);
+ a.vpminsd(zmmA, zmmB, zmmC);
+ a.vpminsd(zmmA, zmmB, anyptr_gpC);
+ a.vpminsq(xmmA, xmmB, xmmC);
+ a.vpminsq(xmmA, xmmB, anyptr_gpC);
+ a.vpminsq(ymmA, ymmB, ymmC);
+ a.vpminsq(ymmA, ymmB, anyptr_gpC);
+ a.vpminsq(zmmA, zmmB, zmmC);
+ a.vpminsq(zmmA, zmmB, anyptr_gpC);
+ a.vpminsw(xmmA, xmmB, xmmC);
+ a.vpminsw(xmmA, xmmB, anyptr_gpC);
+ a.vpminsw(ymmA, ymmB, ymmC);
+ a.vpminsw(ymmA, ymmB, anyptr_gpC);
+ a.vpminsw(zmmA, zmmB, zmmC);
+ a.vpminsw(zmmA, zmmB, anyptr_gpC);
+ a.vpminub(xmmA, xmmB, xmmC);
+ a.vpminub(xmmA, xmmB, anyptr_gpC);
+ a.vpminub(ymmA, ymmB, ymmC);
+ a.vpminub(ymmA, ymmB, anyptr_gpC);
+ a.vpminub(zmmA, zmmB, zmmC);
+ a.vpminub(zmmA, zmmB, anyptr_gpC);
+ a.vpminud(xmmA, xmmB, xmmC);
+ a.vpminud(xmmA, xmmB, anyptr_gpC);
+ a.vpminud(ymmA, ymmB, ymmC);
+ a.vpminud(ymmA, ymmB, anyptr_gpC);
+ a.vpminud(zmmA, zmmB, zmmC);
+ a.vpminud(zmmA, zmmB, anyptr_gpC);
+ a.vpminuq(xmmA, xmmB, xmmC);
+ a.vpminuq(xmmA, xmmB, anyptr_gpC);
+ a.vpminuq(ymmA, ymmB, ymmC);
+ a.vpminuq(ymmA, ymmB, anyptr_gpC);
+ a.vpminuq(zmmA, zmmB, zmmC);
+ a.vpminuq(zmmA, zmmB, anyptr_gpC);
+ a.vpminuw(xmmA, xmmB, xmmC);
+ a.vpminuw(xmmA, xmmB, anyptr_gpC);
+ a.vpminuw(ymmA, ymmB, ymmC);
+ a.vpminuw(ymmA, ymmB, anyptr_gpC);
+ a.vpminuw(zmmA, zmmB, zmmC);
+ a.vpminuw(zmmA, zmmB, anyptr_gpC);
+ a.vpmovb2m(kA, xmmB);
+ a.vpmovb2m(kA, ymmB);
+ a.vpmovb2m(kA, zmmB);
+ a.vpmovd2m(kA, xmmB);
+ a.vpmovd2m(kA, ymmB);
+ a.vpmovd2m(kA, zmmB);
+ a.vpmovdb(xmmA, xmmB);
+ a.vpmovdb(anyptr_gpA, xmmB);
+ a.vpmovdb(xmmA, ymmB);
+ a.vpmovdb(anyptr_gpA, ymmB);
+ a.vpmovdb(xmmA, zmmB);
+ a.vpmovdb(anyptr_gpA, zmmB);
+ a.vpmovdw(xmmA, xmmB);
+ a.vpmovdw(anyptr_gpA, xmmB);
+ a.vpmovdw(xmmA, ymmB);
+ a.vpmovdw(anyptr_gpA, ymmB);
+ a.vpmovdw(ymmA, zmmB);
+ a.vpmovdw(anyptr_gpA, zmmB);
+ a.vpmovm2b(xmmA, kB);
+ a.vpmovm2b(ymmA, kB);
+ a.vpmovm2b(zmmA, kB);
+ a.vpmovm2d(xmmA, kB);
+ a.vpmovm2d(ymmA, kB);
+ a.vpmovm2d(zmmA, kB);
+ a.vpmovm2q(xmmA, kB);
+ a.vpmovm2q(ymmA, kB);
+ a.vpmovm2q(zmmA, kB);
+ a.vpmovm2w(xmmA, kB);
+ a.vpmovm2w(ymmA, kB);
+ a.vpmovm2w(zmmA, kB);
+ a.vpmovq2m(kA, xmmB);
+ a.vpmovq2m(kA, ymmB);
+ a.vpmovq2m(kA, zmmB);
+ a.vpmovqb(xmmA, xmmB);
+ a.vpmovqb(anyptr_gpA, xmmB);
+ a.vpmovqb(xmmA, ymmB);
+ a.vpmovqb(anyptr_gpA, ymmB);
+ a.vpmovqb(xmmA, zmmB);
+ a.vpmovqb(anyptr_gpA, zmmB);
+ a.vpmovqd(xmmA, xmmB);
+ a.vpmovqd(anyptr_gpA, xmmB);
+ a.vpmovqd(xmmA, ymmB);
+ a.vpmovqd(anyptr_gpA, ymmB);
+ a.vpmovqd(ymmA, zmmB);
+ a.vpmovqd(anyptr_gpA, zmmB);
+ a.vpmovqw(xmmA, xmmB);
+ a.vpmovqw(anyptr_gpA, xmmB);
+ a.vpmovqw(xmmA, ymmB);
+ a.vpmovqw(anyptr_gpA, ymmB);
+ a.vpmovqw(xmmA, zmmB);
+ a.vpmovqw(anyptr_gpA, zmmB);
+ a.vpmovsdb(xmmA, xmmB);
+ a.vpmovsdb(anyptr_gpA, xmmB);
+ a.vpmovsdb(xmmA, ymmB);
+ a.vpmovsdb(anyptr_gpA, ymmB);
+ a.vpmovsdb(xmmA, zmmB);
+ a.vpmovsdb(anyptr_gpA, zmmB);
+ a.vpmovsdw(xmmA, xmmB);
+ a.vpmovsdw(anyptr_gpA, xmmB);
+ a.vpmovsdw(xmmA, ymmB);
+ a.vpmovsdw(anyptr_gpA, ymmB);
+ a.vpmovsdw(ymmA, zmmB);
+ a.vpmovsdw(anyptr_gpA, zmmB);
+ a.vpmovsqb(xmmA, xmmB);
+ a.vpmovsqb(anyptr_gpA, xmmB);
+ a.vpmovsqb(xmmA, ymmB);
+ a.vpmovsqb(anyptr_gpA, ymmB);
+ a.vpmovsqb(xmmA, zmmB);
+ a.vpmovsqb(anyptr_gpA, zmmB);
+ a.vpmovsqd(xmmA, xmmB);
+ a.vpmovsqd(anyptr_gpA, xmmB);
+ a.vpmovsqd(xmmA, ymmB);
+ a.vpmovsqd(anyptr_gpA, ymmB);
+ a.vpmovsqd(ymmA, zmmB);
+ a.vpmovsqd(anyptr_gpA, zmmB);
+ a.vpmovsqw(xmmA, xmmB);
+ a.vpmovsqw(anyptr_gpA, xmmB);
+ a.vpmovsqw(xmmA, ymmB);
+ a.vpmovsqw(anyptr_gpA, ymmB);
+ a.vpmovsqw(xmmA, zmmB);
+ a.vpmovsqw(anyptr_gpA, zmmB);
+ a.vpmovswb(xmmA, xmmB);
+ a.vpmovswb(anyptr_gpA, xmmB);
+ a.vpmovswb(xmmA, ymmB);
+ a.vpmovswb(anyptr_gpA, ymmB);
+ a.vpmovswb(ymmA, zmmB);
+ a.vpmovswb(anyptr_gpA, zmmB);
+ a.vpmovsxbd(xmmA, xmmB);
+ a.vpmovsxbd(xmmA, anyptr_gpB);
+ a.vpmovsxbd(ymmA, xmmB);
+ a.vpmovsxbd(ymmA, anyptr_gpB);
+ a.vpmovsxbd(zmmA, xmmB);
+ a.vpmovsxbd(zmmA, anyptr_gpB);
+ a.vpmovsxbq(xmmA, xmmB);
+ a.vpmovsxbq(xmmA, anyptr_gpB);
+ a.vpmovsxbq(ymmA, xmmB);
+ a.vpmovsxbq(ymmA, anyptr_gpB);
+ a.vpmovsxbq(zmmA, xmmB);
+ a.vpmovsxbq(zmmA, anyptr_gpB);
+ a.vpmovsxbw(xmmA, xmmB);
+ a.vpmovsxbw(xmmA, anyptr_gpB);
+ a.vpmovsxbw(ymmA, xmmB);
+ a.vpmovsxbw(ymmA, anyptr_gpB);
+ a.vpmovsxbw(zmmA, ymmB);
+ a.vpmovsxbw(zmmA, anyptr_gpB);
+ a.vpmovsxdq(xmmA, xmmB);
+ a.vpmovsxdq(xmmA, anyptr_gpB);
+ a.vpmovsxdq(ymmA, xmmB);
+ a.vpmovsxdq(ymmA, anyptr_gpB);
+ a.vpmovsxdq(zmmA, xmmB);
+ a.vpmovsxdq(zmmA, anyptr_gpB);
+ a.vpmovsxwd(xmmA, xmmB);
+ a.vpmovsxwd(xmmA, anyptr_gpB);
+ a.vpmovsxwd(ymmA, xmmB);
+ a.vpmovsxwd(ymmA, anyptr_gpB);
+ a.vpmovsxwd(zmmA, ymmB);
+ a.vpmovsxwd(zmmA, anyptr_gpB);
+ a.vpmovsxwq(xmmA, xmmB);
+ a.vpmovsxwq(xmmA, anyptr_gpB);
+ a.vpmovsxwq(ymmA, xmmB);
+ a.vpmovsxwq(ymmA, anyptr_gpB);
+ a.vpmovsxwq(zmmA, xmmB);
+ a.vpmovsxwq(zmmA, anyptr_gpB);
+ a.vpmovusdb(xmmA, xmmB);
+ a.vpmovusdb(anyptr_gpA, xmmB);
+ a.vpmovusdb(xmmA, ymmB);
+ a.vpmovusdb(anyptr_gpA, ymmB);
+ a.vpmovusdb(xmmA, zmmB);
+ a.vpmovusdb(anyptr_gpA, zmmB);
+ a.vpmovusdw(xmmA, xmmB);
+ a.vpmovusdw(anyptr_gpA, xmmB);
+ a.vpmovusdw(xmmA, ymmB);
+ a.vpmovusdw(anyptr_gpA, ymmB);
+ a.vpmovusdw(ymmA, zmmB);
+ a.vpmovusdw(anyptr_gpA, zmmB);
+ a.vpmovusqb(xmmA, xmmB);
+ a.vpmovusqb(anyptr_gpA, xmmB);
+ a.vpmovusqb(xmmA, ymmB);
+ a.vpmovusqb(anyptr_gpA, ymmB);
+ a.vpmovusqb(xmmA, zmmB);
+ a.vpmovusqb(anyptr_gpA, zmmB);
+ a.vpmovusqd(xmmA, xmmB);
+ a.vpmovusqd(anyptr_gpA, xmmB);
+ a.vpmovusqd(xmmA, ymmB);
+ a.vpmovusqd(anyptr_gpA, ymmB);
+ a.vpmovusqd(ymmA, zmmB);
+ a.vpmovusqd(anyptr_gpA, zmmB);
+ a.vpmovusqw(xmmA, xmmB);
+ a.vpmovusqw(anyptr_gpA, xmmB);
+ a.vpmovusqw(xmmA, ymmB);
+ a.vpmovusqw(anyptr_gpA, ymmB);
+ a.vpmovusqw(xmmA, zmmB);
+ a.vpmovusqw(anyptr_gpA, zmmB);
+ a.vpmovuswb(xmmA, xmmB);
+ a.vpmovuswb(anyptr_gpA, xmmB);
+ a.vpmovuswb(xmmA, ymmB);
+ a.vpmovuswb(anyptr_gpA, ymmB);
+ a.vpmovuswb(ymmA, zmmB);
+ a.vpmovuswb(anyptr_gpA, zmmB);
+ a.vpmovw2m(kA, xmmB);
+ a.vpmovw2m(kA, ymmB);
+ a.vpmovw2m(kA, zmmB);
+ a.vpmovwb(xmmA, xmmB);
+ a.vpmovwb(anyptr_gpA, xmmB);
+ a.vpmovwb(xmmA, ymmB);
+ a.vpmovwb(anyptr_gpA, ymmB);
+ a.vpmovwb(ymmA, zmmB);
+ a.vpmovwb(anyptr_gpA, zmmB);
+ a.vpmovzxbd(xmmA, xmmB);
+ a.vpmovzxbd(xmmA, anyptr_gpB);
+ a.vpmovzxbd(ymmA, xmmB);
+ a.vpmovzxbd(ymmA, anyptr_gpB);
+ a.vpmovzxbd(zmmA, xmmB);
+ a.vpmovzxbd(zmmA, anyptr_gpB);
+ a.vpmovzxbq(xmmA, xmmB);
+ a.vpmovzxbq(xmmA, anyptr_gpB);
+ a.vpmovzxbq(ymmA, xmmB);
+ a.vpmovzxbq(ymmA, anyptr_gpB);
+ a.vpmovzxbq(zmmA, xmmB);
+ a.vpmovzxbq(zmmA, anyptr_gpB);
+ a.vpmovzxbw(xmmA, xmmB);
+ a.vpmovzxbw(xmmA, anyptr_gpB);
+ a.vpmovzxbw(ymmA, xmmB);
+ a.vpmovzxbw(ymmA, anyptr_gpB);
+ a.vpmovzxbw(zmmA, ymmB);
+ a.vpmovzxbw(zmmA, anyptr_gpB);
+ a.vpmovzxdq(xmmA, xmmB);
+ a.vpmovzxdq(xmmA, anyptr_gpB);
+ a.vpmovzxdq(ymmA, xmmB);
+ a.vpmovzxdq(ymmA, anyptr_gpB);
+ a.vpmovzxdq(zmmA, xmmB);
+ a.vpmovzxdq(zmmA, anyptr_gpB);
+ a.vpmovzxwd(xmmA, xmmB);
+ a.vpmovzxwd(xmmA, anyptr_gpB);
+ a.vpmovzxwd(ymmA, xmmB);
+ a.vpmovzxwd(ymmA, anyptr_gpB);
+ a.vpmovzxwd(zmmA, ymmB);
+ a.vpmovzxwd(zmmA, anyptr_gpB);
+ a.vpmovzxwq(xmmA, xmmB);
+ a.vpmovzxwq(xmmA, anyptr_gpB);
+ a.vpmovzxwq(ymmA, xmmB);
+ a.vpmovzxwq(ymmA, anyptr_gpB);
+ a.vpmovzxwq(zmmA, xmmB);
+ a.vpmovzxwq(zmmA, anyptr_gpB);
+ a.vpmuldq(xmmA, xmmB, xmmC);
+ a.vpmuldq(xmmA, xmmB, anyptr_gpC);
+ a.vpmuldq(ymmA, ymmB, ymmC);
+ a.vpmuldq(ymmA, ymmB, anyptr_gpC);
+ a.vpmuldq(zmmA, zmmB, zmmC);
+ a.vpmuldq(zmmA, zmmB, anyptr_gpC);
+ a.vpmulhrsw(xmmA, xmmB, xmmC);
+ a.vpmulhrsw(xmmA, xmmB, anyptr_gpC);
+ a.vpmulhrsw(ymmA, ymmB, ymmC);
+ a.vpmulhrsw(ymmA, ymmB, anyptr_gpC);
+ a.vpmulhrsw(zmmA, zmmB, zmmC);
+ a.vpmulhrsw(zmmA, zmmB, anyptr_gpC);
+ a.vpmulhuw(xmmA, xmmB, xmmC);
+ a.vpmulhuw(xmmA, xmmB, anyptr_gpC);
+ a.vpmulhuw(ymmA, ymmB, ymmC);
+ a.vpmulhuw(ymmA, ymmB, anyptr_gpC);
+ a.vpmulhuw(zmmA, zmmB, zmmC);
+ a.vpmulhuw(zmmA, zmmB, anyptr_gpC);
+ a.vpmulhw(xmmA, xmmB, xmmC);
+ a.vpmulhw(xmmA, xmmB, anyptr_gpC);
+ a.vpmulhw(ymmA, ymmB, ymmC);
+ a.vpmulhw(ymmA, ymmB, anyptr_gpC);
+ a.vpmulhw(zmmA, zmmB, zmmC);
+ a.vpmulhw(zmmA, zmmB, anyptr_gpC);
+ a.vpmulld(xmmA, xmmB, xmmC);
+ a.vpmulld(xmmA, xmmB, anyptr_gpC);
+ a.vpmulld(ymmA, ymmB, ymmC);
+ a.vpmulld(ymmA, ymmB, anyptr_gpC);
+ a.vpmulld(zmmA, zmmB, zmmC);
+ a.vpmulld(zmmA, zmmB, anyptr_gpC);
+ a.vpmullq(xmmA, xmmB, xmmC);
+ a.vpmullq(xmmA, xmmB, anyptr_gpC);
+ a.vpmullq(ymmA, ymmB, ymmC);
+ a.vpmullq(ymmA, ymmB, anyptr_gpC);
+ a.vpmullq(zmmA, zmmB, zmmC);
+ a.vpmullq(zmmA, zmmB, anyptr_gpC);
+ a.vpmullw(xmmA, xmmB, xmmC);
+ a.vpmullw(xmmA, xmmB, anyptr_gpC);
+ a.vpmullw(ymmA, ymmB, ymmC);
+ a.vpmullw(ymmA, ymmB, anyptr_gpC);
+ a.vpmullw(zmmA, zmmB, zmmC);
+ a.vpmullw(zmmA, zmmB, anyptr_gpC);
+ a.vpmultishiftqb(xmmA, xmmB, xmmC);
+ a.vpmultishiftqb(xmmA, xmmB, anyptr_gpC);
+ a.vpmultishiftqb(ymmA, ymmB, ymmC);
+ a.vpmultishiftqb(ymmA, ymmB, anyptr_gpC);
+ a.vpmultishiftqb(zmmA, zmmB, zmmC);
+ a.vpmultishiftqb(zmmA, zmmB, anyptr_gpC);
+ a.vpmuludq(xmmA, xmmB, xmmC);
+ a.vpmuludq(xmmA, xmmB, anyptr_gpC);
+ a.vpmuludq(ymmA, ymmB, ymmC);
+ a.vpmuludq(ymmA, ymmB, anyptr_gpC);
+ a.vpmuludq(zmmA, zmmB, zmmC);
+ a.vpmuludq(zmmA, zmmB, anyptr_gpC);
+ a.vpord(xmmA, xmmB, xmmC);
+ a.vpord(xmmA, xmmB, anyptr_gpC);
+ a.vpord(ymmA, ymmB, ymmC);
+ a.vpord(ymmA, ymmB, anyptr_gpC);
+ a.vpord(zmmA, zmmB, zmmC);
+ a.vpord(zmmA, zmmB, anyptr_gpC);
+ a.vporq(xmmA, xmmB, xmmC);
+ a.vporq(xmmA, xmmB, anyptr_gpC);
+ a.vporq(ymmA, ymmB, ymmC);
+ a.vporq(ymmA, ymmB, anyptr_gpC);
+ a.vporq(zmmA, zmmB, zmmC);
+ a.vporq(zmmA, zmmB, anyptr_gpC);
+ a.vprold(xmmA, xmmB, 0);
+ a.vprold(xmmA, anyptr_gpB, 0);
+ a.vprold(ymmA, ymmB, 0);
+ a.vprold(ymmA, anyptr_gpB, 0);
+ a.vprold(zmmA, zmmB, 0);
+ a.vprold(zmmA, anyptr_gpB, 0);
+ a.vprolq(xmmA, xmmB, 0);
+ a.vprolq(xmmA, anyptr_gpB, 0);
+ a.vprolq(ymmA, ymmB, 0);
+ a.vprolq(ymmA, anyptr_gpB, 0);
+ a.vprolq(zmmA, zmmB, 0);
+ a.vprolq(zmmA, anyptr_gpB, 0);
+ a.vprolvd(xmmA, xmmB, xmmC);
+ a.vprolvd(xmmA, xmmB, anyptr_gpC);
+ a.vprolvd(ymmA, ymmB, ymmC);
+ a.vprolvd(ymmA, ymmB, anyptr_gpC);
+ a.vprolvd(zmmA, zmmB, zmmC);
+ a.vprolvd(zmmA, zmmB, anyptr_gpC);
+ a.vprolvq(xmmA, xmmB, xmmC);
+ a.vprolvq(xmmA, xmmB, anyptr_gpC);
+ a.vprolvq(ymmA, ymmB, ymmC);
+ a.vprolvq(ymmA, ymmB, anyptr_gpC);
+ a.vprolvq(zmmA, zmmB, zmmC);
+ a.vprolvq(zmmA, zmmB, anyptr_gpC);
+ a.vprord(xmmA, xmmB, 0);
+ a.vprord(xmmA, anyptr_gpB, 0);
+ a.vprord(ymmA, ymmB, 0);
+ a.vprord(ymmA, anyptr_gpB, 0);
+ a.vprord(zmmA, zmmB, 0);
+ a.vprord(zmmA, anyptr_gpB, 0);
+ a.vprorq(xmmA, xmmB, 0);
+ a.vprorq(xmmA, anyptr_gpB, 0);
+ a.vprorq(ymmA, ymmB, 0);
+ a.vprorq(ymmA, anyptr_gpB, 0);
+ a.vprorq(zmmA, zmmB, 0);
+ a.vprorq(zmmA, anyptr_gpB, 0);
+ a.vprorvd(xmmA, xmmB, xmmC);
+ a.vprorvd(xmmA, xmmB, anyptr_gpC);
+ a.vprorvd(ymmA, ymmB, ymmC);
+ a.vprorvd(ymmA, ymmB, anyptr_gpC);
+ a.vprorvd(zmmA, zmmB, zmmC);
+ a.vprorvd(zmmA, zmmB, anyptr_gpC);
+ a.vprorvq(xmmA, xmmB, xmmC);
+ a.vprorvq(xmmA, xmmB, anyptr_gpC);
+ a.vprorvq(ymmA, ymmB, ymmC);
+ a.vprorvq(ymmA, ymmB, anyptr_gpC);
+ a.vprorvq(zmmA, zmmB, zmmC);
+ a.vprorvq(zmmA, zmmB, anyptr_gpC);
+ a.vpsadbw(xmmA, xmmB, xmmC);
+ a.vpsadbw(xmmA, xmmB, anyptr_gpC);
+ a.vpsadbw(ymmA, ymmB, ymmC);
+ a.vpsadbw(ymmA, ymmB, anyptr_gpC);
+ a.vpsadbw(zmmA, zmmB, zmmC);
+ a.vpsadbw(zmmA, zmmB, anyptr_gpC);
+ a.vpscatterdd(vx_ptr, xmmB);
+ a.vpscatterdd(vy_ptr, ymmB);
+ a.vpscatterdd(vz_ptr, zmmB);
+ a.vpscatterdq(vx_ptr, xmmB);
+ a.vpscatterdq(vy_ptr, ymmB);
+ a.vpscatterdq(vz_ptr, zmmB);
+ a.vpscatterqd(vx_ptr, xmmB);
+ a.vpscatterqd(vy_ptr, xmmB);
+ a.vpscatterqd(vz_ptr, ymmB);
+ a.vpscatterqq(vx_ptr, xmmB);
+ a.vpscatterqq(vy_ptr, ymmB);
+ a.vpscatterqq(vz_ptr, zmmB);
+ a.vpshufb(xmmA, xmmB, xmmC);
+ a.vpshufb(xmmA, xmmB, anyptr_gpC);
+ a.vpshufb(ymmA, ymmB, ymmC);
+ a.vpshufb(ymmA, ymmB, anyptr_gpC);
+ a.vpshufb(zmmA, zmmB, zmmC);
+ a.vpshufb(zmmA, zmmB, anyptr_gpC);
+ a.vpshufd(xmmA, xmmB, 0);
+ a.vpshufd(xmmA, anyptr_gpB, 0);
+ a.vpshufd(ymmA, ymmB, 0);
+ a.vpshufd(ymmA, anyptr_gpB, 0);
+ a.vpshufd(zmmA, zmmB, 0);
+ a.vpshufd(zmmA, anyptr_gpB, 0);
+ a.vpshufhw(xmmA, xmmB, 0);
+ a.vpshufhw(xmmA, anyptr_gpB, 0);
+ a.vpshufhw(ymmA, ymmB, 0);
+ a.vpshufhw(ymmA, anyptr_gpB, 0);
+ a.vpshufhw(zmmA, zmmB, 0);
+ a.vpshufhw(zmmA, anyptr_gpB, 0);
+ a.vpshuflw(xmmA, xmmB, 0);
+ a.vpshuflw(xmmA, anyptr_gpB, 0);
+ a.vpshuflw(ymmA, ymmB, 0);
+ a.vpshuflw(ymmA, anyptr_gpB, 0);
+ a.vpshuflw(zmmA, zmmB, 0);
+ a.vpshuflw(zmmA, anyptr_gpB, 0);
+ a.vpslld(xmmA, xmmB, xmmC);
+ a.vpslld(xmmA, xmmB, anyptr_gpC);
+ a.vpslld(xmmA, xmmB, 0);
+ a.vpslld(xmmA, anyptr_gpB, 0);
+ a.vpslld(ymmA, ymmB, xmmC);
+ a.vpslld(ymmA, ymmB, anyptr_gpC);
+ a.vpslld(ymmA, ymmB, 0);
+ a.vpslld(ymmA, anyptr_gpB, 0);
+ a.vpslld(zmmA, zmmB, xmmC);
+ a.vpslld(zmmA, zmmB, anyptr_gpC);
+ a.vpslld(zmmA, zmmB, 0);
+ a.vpslld(zmmA, anyptr_gpB, 0);
+ a.vpslldq(xmmA, xmmB, 0);
+ a.vpslldq(xmmA, anyptr_gpB, 0);
+ a.vpslldq(ymmA, ymmB, 0);
+ a.vpslldq(ymmA, anyptr_gpB, 0);
+ a.vpslldq(zmmA, zmmB, 0);
+ a.vpslldq(zmmA, anyptr_gpB, 0);
+ a.vpsllq(xmmA, xmmB, xmmC);
+ a.vpsllq(xmmA, xmmB, anyptr_gpC);
+ a.vpsllq(xmmA, xmmB, 0);
+ a.vpsllq(xmmA, anyptr_gpB, 0);
+ a.vpsllq(ymmA, ymmB, xmmC);
+ a.vpsllq(ymmA, ymmB, anyptr_gpC);
+ a.vpsllq(ymmA, ymmB, 0);
+ a.vpsllq(ymmA, anyptr_gpB, 0);
+ a.vpsllq(zmmA, zmmB, xmmC);
+ a.vpsllq(zmmA, zmmB, anyptr_gpC);
+ a.vpsllq(zmmA, zmmB, 0);
+ a.vpsllq(zmmA, anyptr_gpB, 0);
+ a.vpsllvd(xmmA, xmmB, xmmC);
+ a.vpsllvd(xmmA, xmmB, anyptr_gpC);
+ a.vpsllvd(ymmA, ymmB, ymmC);
+ a.vpsllvd(ymmA, ymmB, anyptr_gpC);
+ a.vpsllvd(zmmA, zmmB, zmmC);
+ a.vpsllvd(zmmA, zmmB, anyptr_gpC);
+ a.vpsllvq(xmmA, xmmB, xmmC);
+ a.vpsllvq(xmmA, xmmB, anyptr_gpC);
+ a.vpsllvq(ymmA, ymmB, ymmC);
+ a.vpsllvq(ymmA, ymmB, anyptr_gpC);
+ a.vpsllvq(zmmA, zmmB, zmmC);
+ a.vpsllvq(zmmA, zmmB, anyptr_gpC);
+ a.vpsllvw(xmmA, xmmB, xmmC);
+ a.vpsllvw(xmmA, xmmB, anyptr_gpC);
+ a.vpsllvw(ymmA, ymmB, ymmC);
+ a.vpsllvw(ymmA, ymmB, anyptr_gpC);
+ a.vpsllvw(zmmA, zmmB, zmmC);
+ a.vpsllvw(zmmA, zmmB, anyptr_gpC);
+ a.vpsllw(xmmA, xmmB, xmmC);
+ a.vpsllw(xmmA, xmmB, anyptr_gpC);
+ a.vpsllw(xmmA, xmmB, 0);
+ a.vpsllw(xmmA, anyptr_gpB, 0);
+ a.vpsllw(ymmA, ymmB, xmmC);
+ a.vpsllw(ymmA, ymmB, anyptr_gpC);
+ a.vpsllw(ymmA, ymmB, 0);
+ a.vpsllw(ymmA, anyptr_gpB, 0);
+ a.vpsllw(zmmA, zmmB, xmmC);
+ a.vpsllw(zmmA, zmmB, anyptr_gpC);
+ a.vpsllw(zmmA, zmmB, 0);
+ a.vpsllw(zmmA, anyptr_gpB, 0);
+ a.vpsrad(xmmA, xmmB, xmmC);
+ a.vpsrad(xmmA, xmmB, anyptr_gpC);
+ a.vpsrad(xmmA, xmmB, 0);
+ a.vpsrad(xmmA, anyptr_gpB, 0);
+ a.vpsrad(ymmA, ymmB, xmmC);
+ a.vpsrad(ymmA, ymmB, anyptr_gpC);
+ a.vpsrad(ymmA, ymmB, 0);
+ a.vpsrad(ymmA, anyptr_gpB, 0);
+ a.vpsrad(zmmA, zmmB, xmmC);
+ a.vpsrad(zmmA, zmmB, anyptr_gpC);
+ a.vpsrad(zmmA, zmmB, 0);
+ a.vpsrad(zmmA, anyptr_gpB, 0);
+ a.vpsraq(xmmA, xmmB, xmmC);
+ a.vpsraq(xmmA, xmmB, anyptr_gpC);
+ a.vpsraq(xmmA, xmmB, 0);
+ a.vpsraq(xmmA, anyptr_gpB, 0);
+ a.vpsraq(ymmA, ymmB, xmmC);
+ a.vpsraq(ymmA, ymmB, anyptr_gpC);
+ a.vpsraq(ymmA, ymmB, 0);
+ a.vpsraq(ymmA, anyptr_gpB, 0);
+ a.vpsraq(zmmA, zmmB, xmmC);
+ a.vpsraq(zmmA, zmmB, anyptr_gpC);
+ a.vpsraq(zmmA, zmmB, 0);
+ a.vpsraq(zmmA, anyptr_gpB, 0);
+ a.vpsravd(xmmA, xmmB, xmmC);
+ a.vpsravd(xmmA, xmmB, anyptr_gpC);
+ a.vpsravd(ymmA, ymmB, ymmC);
+ a.vpsravd(ymmA, ymmB, anyptr_gpC);
+ a.vpsravd(zmmA, zmmB, zmmC);
+ a.vpsravd(zmmA, zmmB, anyptr_gpC);
+ a.vpsravq(xmmA, xmmB, xmmC);
+ a.vpsravq(xmmA, xmmB, anyptr_gpC);
+ a.vpsravq(ymmA, ymmB, ymmC);
+ a.vpsravq(ymmA, ymmB, anyptr_gpC);
+ a.vpsravq(zmmA, zmmB, zmmC);
+ a.vpsravq(zmmA, zmmB, anyptr_gpC);
+ a.vpsravw(xmmA, xmmB, xmmC);
+ a.vpsravw(xmmA, xmmB, anyptr_gpC);
+ a.vpsravw(ymmA, ymmB, ymmC);
+ a.vpsravw(ymmA, ymmB, anyptr_gpC);
+ a.vpsravw(zmmA, zmmB, zmmC);
+ a.vpsravw(zmmA, zmmB, anyptr_gpC);
+ a.vpsraw(xmmA, xmmB, xmmC);
+ a.vpsraw(xmmA, xmmB, anyptr_gpC);
+ a.vpsraw(xmmA, xmmB, 0);
+ a.vpsraw(xmmA, anyptr_gpB, 0);
+ a.vpsraw(ymmA, ymmB, xmmC);
+ a.vpsraw(ymmA, ymmB, anyptr_gpC);
+ a.vpsraw(ymmA, ymmB, 0);
+ a.vpsraw(ymmA, anyptr_gpB, 0);
+ a.vpsraw(zmmA, zmmB, xmmC);
+ a.vpsraw(zmmA, zmmB, anyptr_gpC);
+ a.vpsraw(zmmA, zmmB, 0);
+ a.vpsraw(zmmA, anyptr_gpB, 0);
+ a.vpsrld(xmmA, xmmB, xmmC);
+ a.vpsrld(xmmA, xmmB, anyptr_gpC);
+ a.vpsrld(xmmA, xmmB, 0);
+ a.vpsrld(xmmA, anyptr_gpB, 0);
+ a.vpsrld(ymmA, ymmB, xmmC);
+ a.vpsrld(ymmA, ymmB, anyptr_gpC);
+ a.vpsrld(ymmA, ymmB, 0);
+ a.vpsrld(ymmA, anyptr_gpB, 0);
+ a.vpsrld(zmmA, zmmB, xmmC);
+ a.vpsrld(zmmA, zmmB, anyptr_gpC);
+ a.vpsrld(zmmA, zmmB, 0);
+ a.vpsrld(zmmA, anyptr_gpB, 0);
+ a.vpsrldq(xmmA, xmmB, 0);
+ a.vpsrldq(xmmA, anyptr_gpB, 0);
+ a.vpsrldq(ymmA, ymmB, 0);
+ a.vpsrldq(ymmA, anyptr_gpB, 0);
+ a.vpsrldq(zmmA, zmmB, 0);
+ a.vpsrldq(zmmA, anyptr_gpB, 0);
+ a.vpsrlq(xmmA, xmmB, xmmC);
+ a.vpsrlq(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlq(xmmA, xmmB, 0);
+ a.vpsrlq(xmmA, anyptr_gpB, 0);
+ a.vpsrlq(ymmA, ymmB, xmmC);
+ a.vpsrlq(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlq(ymmA, ymmB, 0);
+ a.vpsrlq(ymmA, anyptr_gpB, 0);
+ a.vpsrlq(zmmA, zmmB, xmmC);
+ a.vpsrlq(zmmA, zmmB, anyptr_gpC);
+ a.vpsrlq(zmmA, zmmB, 0);
+ a.vpsrlq(zmmA, anyptr_gpB, 0);
+ a.vpsrlvd(xmmA, xmmB, xmmC);
+ a.vpsrlvd(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlvd(ymmA, ymmB, ymmC);
+ a.vpsrlvd(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlvd(zmmA, zmmB, zmmC);
+ a.vpsrlvd(zmmA, zmmB, anyptr_gpC);
+ a.vpsrlvq(xmmA, xmmB, xmmC);
+ a.vpsrlvq(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlvq(ymmA, ymmB, ymmC);
+ a.vpsrlvq(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlvq(zmmA, zmmB, zmmC);
+ a.vpsrlvq(zmmA, zmmB, anyptr_gpC);
+ a.vpsrlvw(xmmA, xmmB, xmmC);
+ a.vpsrlvw(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlvw(ymmA, ymmB, ymmC);
+ a.vpsrlvw(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlvw(zmmA, zmmB, zmmC);
+ a.vpsrlvw(zmmA, zmmB, anyptr_gpC);
+ a.vpsrlw(xmmA, xmmB, xmmC);
+ a.vpsrlw(xmmA, xmmB, anyptr_gpC);
+ a.vpsrlw(xmmA, xmmB, 0);
+ a.vpsrlw(xmmA, anyptr_gpB, 0);
+ a.vpsrlw(ymmA, ymmB, xmmC);
+ a.vpsrlw(ymmA, ymmB, anyptr_gpC);
+ a.vpsrlw(ymmA, ymmB, 0);
+ a.vpsrlw(ymmA, anyptr_gpB, 0);
+ a.vpsrlw(zmmA, zmmB, xmmC);
+ a.vpsrlw(zmmA, zmmB, anyptr_gpC);
+ a.vpsrlw(zmmA, zmmB, 0);
+ a.vpsrlw(zmmA, anyptr_gpB, 0);
+ a.vpsubb(xmmA, xmmB, xmmC);
+ a.vpsubb(xmmA, xmmB, anyptr_gpC);
+ a.vpsubb(ymmA, ymmB, ymmC);
+ a.vpsubb(ymmA, ymmB, anyptr_gpC);
+ a.vpsubb(zmmA, zmmB, zmmC);
+ a.vpsubb(zmmA, zmmB, anyptr_gpC);
+ a.vpsubd(xmmA, xmmB, xmmC);
+ a.vpsubd(xmmA, xmmB, anyptr_gpC);
+ a.vpsubd(ymmA, ymmB, ymmC);
+ a.vpsubd(ymmA, ymmB, anyptr_gpC);
+ a.vpsubd(zmmA, zmmB, zmmC);
+ a.vpsubd(zmmA, zmmB, anyptr_gpC);
+ a.vpsubq(xmmA, xmmB, xmmC);
+ a.vpsubq(xmmA, xmmB, anyptr_gpC);
+ a.vpsubq(ymmA, ymmB, ymmC);
+ a.vpsubq(ymmA, ymmB, anyptr_gpC);
+ a.vpsubq(zmmA, zmmB, zmmC);
+ a.vpsubq(zmmA, zmmB, anyptr_gpC);
+ a.vpsubsb(xmmA, xmmB, xmmC);
+ a.vpsubsb(xmmA, xmmB, anyptr_gpC);
+ a.vpsubsb(ymmA, ymmB, ymmC);
+ a.vpsubsb(ymmA, ymmB, anyptr_gpC);
+ a.vpsubsb(zmmA, zmmB, zmmC);
+ a.vpsubsb(zmmA, zmmB, anyptr_gpC);
+ a.vpsubsw(xmmA, xmmB, xmmC);
+ a.vpsubsw(xmmA, xmmB, anyptr_gpC);
+ a.vpsubsw(ymmA, ymmB, ymmC);
+ a.vpsubsw(ymmA, ymmB, anyptr_gpC);
+ a.vpsubsw(zmmA, zmmB, zmmC);
+ a.vpsubsw(zmmA, zmmB, anyptr_gpC);
+ a.vpsubusb(xmmA, xmmB, xmmC);
+ a.vpsubusb(xmmA, xmmB, anyptr_gpC);
+ a.vpsubusb(ymmA, ymmB, ymmC);
+ a.vpsubusb(ymmA, ymmB, anyptr_gpC);
+ a.vpsubusb(zmmA, zmmB, zmmC);
+ a.vpsubusb(zmmA, zmmB, anyptr_gpC);
+ a.vpsubusw(xmmA, xmmB, xmmC);
+ a.vpsubusw(xmmA, xmmB, anyptr_gpC);
+ a.vpsubusw(ymmA, ymmB, ymmC);
+ a.vpsubusw(ymmA, ymmB, anyptr_gpC);
+ a.vpsubusw(zmmA, zmmB, zmmC);
+ a.vpsubusw(zmmA, zmmB, anyptr_gpC);
+ a.vpsubw(xmmA, xmmB, xmmC);
+ a.vpsubw(xmmA, xmmB, anyptr_gpC);
+ a.vpsubw(ymmA, ymmB, ymmC);
+ a.vpsubw(ymmA, ymmB, anyptr_gpC);
+ a.vpsubw(zmmA, zmmB, zmmC);
+ a.vpsubw(zmmA, zmmB, anyptr_gpC);
+ a.vpternlogd(xmmA, xmmB, xmmC, 0);
+ a.vpternlogd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpternlogd(ymmA, ymmB, ymmC, 0);
+ a.vpternlogd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpternlogd(zmmA, zmmB, zmmC, 0);
+ a.vpternlogd(zmmA, zmmB, anyptr_gpC, 0);
+ a.vpternlogq(xmmA, xmmB, xmmC, 0);
+ a.vpternlogq(xmmA, xmmB, anyptr_gpC, 0);
+ a.vpternlogq(ymmA, ymmB, ymmC, 0);
+ a.vpternlogq(ymmA, ymmB, anyptr_gpC, 0);
+ a.vpternlogq(zmmA, zmmB, zmmC, 0);
+ a.vpternlogq(zmmA, zmmB, anyptr_gpC, 0);
+ a.vptestmb(kA, xmmB, xmmC);
+ a.vptestmb(kA, xmmB, anyptr_gpC);
+ a.vptestmb(kA, ymmB, ymmC);
+ a.vptestmb(kA, ymmB, anyptr_gpC);
+ a.vptestmb(kA, zmmB, zmmC);
+ a.vptestmb(kA, zmmB, anyptr_gpC);
+ a.vptestmd(kA, xmmB, xmmC);
+ a.vptestmd(kA, xmmB, anyptr_gpC);
+ a.vptestmd(kA, ymmB, ymmC);
+ a.vptestmd(kA, ymmB, anyptr_gpC);
+ a.vptestmd(kA, zmmB, zmmC);
+ a.vptestmd(kA, zmmB, anyptr_gpC);
+ a.vptestmq(kA, xmmB, xmmC);
+ a.vptestmq(kA, xmmB, anyptr_gpC);
+ a.vptestmq(kA, ymmB, ymmC);
+ a.vptestmq(kA, ymmB, anyptr_gpC);
+ a.vptestmq(kA, zmmB, zmmC);
+ a.vptestmq(kA, zmmB, anyptr_gpC);
+ a.vptestmw(kA, xmmB, xmmC);
+ a.vptestmw(kA, xmmB, anyptr_gpC);
+ a.vptestmw(kA, ymmB, ymmC);
+ a.vptestmw(kA, ymmB, anyptr_gpC);
+ a.vptestmw(kA, zmmB, zmmC);
+ a.vptestmw(kA, zmmB, anyptr_gpC);
+ a.vptestnmb(kA, xmmB, xmmC);
+ a.vptestnmb(kA, xmmB, anyptr_gpC);
+ a.vptestnmb(kA, ymmB, ymmC);
+ a.vptestnmb(kA, ymmB, anyptr_gpC);
+ a.vptestnmb(kA, zmmB, zmmC);
+ a.vptestnmb(kA, zmmB, anyptr_gpC);
+ a.vptestnmd(kA, xmmB, xmmC);
+ a.vptestnmd(kA, xmmB, anyptr_gpC);
+ a.vptestnmd(kA, ymmB, ymmC);
+ a.vptestnmd(kA, ymmB, anyptr_gpC);
+ a.vptestnmd(kA, zmmB, zmmC);
+ a.vptestnmd(kA, zmmB, anyptr_gpC);
+ a.vptestnmq(kA, xmmB, xmmC);
+ a.vptestnmq(kA, xmmB, anyptr_gpC);
+ a.vptestnmq(kA, ymmB, ymmC);
+ a.vptestnmq(kA, ymmB, anyptr_gpC);
+ a.vptestnmq(kA, zmmB, zmmC);
+ a.vptestnmq(kA, zmmB, anyptr_gpC);
+ a.vptestnmw(kA, xmmB, xmmC);
+ a.vptestnmw(kA, xmmB, anyptr_gpC);
+ a.vptestnmw(kA, ymmB, ymmC);
+ a.vptestnmw(kA, ymmB, anyptr_gpC);
+ a.vptestnmw(kA, zmmB, zmmC);
+ a.vptestnmw(kA, zmmB, anyptr_gpC);
+ a.vpunpckhbw(xmmA, xmmB, xmmC);
+ a.vpunpckhbw(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckhbw(ymmA, ymmB, ymmC);
+ a.vpunpckhbw(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhbw(zmmA, zmmB, zmmC);
+ a.vpunpckhbw(zmmA, zmmB, anyptr_gpC);
+ a.vpunpckhdq(xmmA, xmmB, xmmC);
+ a.vpunpckhdq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckhdq(ymmA, ymmB, ymmC);
+ a.vpunpckhdq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhdq(zmmA, zmmB, zmmC);
+ a.vpunpckhdq(zmmA, zmmB, anyptr_gpC);
+ a.vpunpckhqdq(xmmA, xmmB, xmmC);
+ a.vpunpckhqdq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckhqdq(ymmA, ymmB, ymmC);
+ a.vpunpckhqdq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhqdq(zmmA, zmmB, zmmC);
+ a.vpunpckhqdq(zmmA, zmmB, anyptr_gpC);
+ a.vpunpckhwd(xmmA, xmmB, xmmC);
+ a.vpunpckhwd(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckhwd(ymmA, ymmB, ymmC);
+ a.vpunpckhwd(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckhwd(zmmA, zmmB, zmmC);
+ a.vpunpckhwd(zmmA, zmmB, anyptr_gpC);
+ a.vpunpcklbw(xmmA, xmmB, xmmC);
+ a.vpunpcklbw(xmmA, xmmB, anyptr_gpC);
+ a.vpunpcklbw(ymmA, ymmB, ymmC);
+ a.vpunpcklbw(ymmA, ymmB, anyptr_gpC);
+ a.vpunpcklbw(zmmA, zmmB, zmmC);
+ a.vpunpcklbw(zmmA, zmmB, anyptr_gpC);
+ a.vpunpckldq(xmmA, xmmB, xmmC);
+ a.vpunpckldq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpckldq(ymmA, ymmB, ymmC);
+ a.vpunpckldq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpckldq(zmmA, zmmB, zmmC);
+ a.vpunpckldq(zmmA, zmmB, anyptr_gpC);
+ a.vpunpcklqdq(xmmA, xmmB, xmmC);
+ a.vpunpcklqdq(xmmA, xmmB, anyptr_gpC);
+ a.vpunpcklqdq(ymmA, ymmB, ymmC);
+ a.vpunpcklqdq(ymmA, ymmB, anyptr_gpC);
+ a.vpunpcklqdq(zmmA, zmmB, zmmC);
+ a.vpunpcklqdq(zmmA, zmmB, anyptr_gpC);
+ a.vpunpcklwd(xmmA, xmmB, xmmC);
+ a.vpunpcklwd(xmmA, xmmB, anyptr_gpC);
+ a.vpunpcklwd(ymmA, ymmB, ymmC);
+ a.vpunpcklwd(ymmA, ymmB, anyptr_gpC);
+ a.vpunpcklwd(zmmA, zmmB, zmmC);
+ a.vpunpcklwd(zmmA, zmmB, anyptr_gpC);
+ a.vpxord(xmmA, xmmB, xmmC);
+ a.vpxord(xmmA, xmmB, anyptr_gpC);
+ a.vpxord(ymmA, ymmB, ymmC);
+ a.vpxord(ymmA, ymmB, anyptr_gpC);
+ a.vpxord(zmmA, zmmB, zmmC);
+ a.vpxord(zmmA, zmmB, anyptr_gpC);
+ a.vpxorq(xmmA, xmmB, xmmC);
+ a.vpxorq(xmmA, xmmB, anyptr_gpC);
+ a.vpxorq(ymmA, ymmB, ymmC);
+ a.vpxorq(ymmA, ymmB, anyptr_gpC);
+ a.vpxorq(zmmA, zmmB, zmmC);
+ a.vpxorq(zmmA, zmmB, anyptr_gpC);
+ a.vrangepd(xmmA, xmmB, xmmC, 0);
+ a.vrangepd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vrangepd(ymmA, ymmB, ymmC, 0);
+ a.vrangepd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vrangepd(zmmA, zmmB, zmmC, 0);
+ a.vrangepd(zmmA, zmmB, anyptr_gpC, 0);
+ a.vrangeps(xmmA, xmmB, xmmC, 0);
+ a.vrangeps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vrangeps(ymmA, ymmB, ymmC, 0);
+ a.vrangeps(ymmA, ymmB, anyptr_gpC, 0);
+ a.vrangeps(zmmA, zmmB, zmmC, 0);
+ a.vrangeps(zmmA, zmmB, anyptr_gpC, 0);
+ a.vrangesd(xmmA, xmmB, xmmC, 0);
+ a.vrangesd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vrangess(xmmA, xmmB, xmmC, 0);
+ a.vrangess(xmmA, xmmB, anyptr_gpC, 0);
+ a.vrcp14pd(xmmA, xmmB);
+ a.vrcp14pd(xmmA, anyptr_gpB);
+ a.vrcp14pd(ymmA, ymmB);
+ a.vrcp14pd(ymmA, anyptr_gpB);
+ a.vrcp14pd(zmmA, zmmB);
+ a.vrcp14pd(zmmA, anyptr_gpB);
+ a.vrcp14ps(xmmA, xmmB);
+ a.vrcp14ps(xmmA, anyptr_gpB);
+ a.vrcp14ps(ymmA, ymmB);
+ a.vrcp14ps(ymmA, anyptr_gpB);
+ a.vrcp14ps(zmmA, zmmB);
+ a.vrcp14ps(zmmA, anyptr_gpB);
+ a.vrcp14sd(xmmA, xmmB, xmmC);
+ a.vrcp14sd(xmmA, xmmB, anyptr_gpC);
+ a.vrcp14ss(xmmA, xmmB, xmmC);
+ a.vrcp14ss(xmmA, xmmB, anyptr_gpC);
+ a.vrcp28pd(zmmA, zmmB);
+ a.vrcp28pd(zmmA, anyptr_gpB);
+ a.vrcp28ps(zmmA, zmmB);
+ a.vrcp28ps(zmmA, anyptr_gpB);
+ a.vrcp28sd(xmmA, xmmB, xmmC);
+ a.vrcp28sd(xmmA, xmmB, anyptr_gpC);
+ a.vrcp28ss(xmmA, xmmB, xmmC);
+ a.vrcp28ss(xmmA, xmmB, anyptr_gpC);
+ a.vreducepd(xmmA, xmmB, 0);
+ a.vreducepd(xmmA, anyptr_gpB, 0);
+ a.vreducepd(ymmA, ymmB, 0);
+ a.vreducepd(ymmA, anyptr_gpB, 0);
+ a.vreducepd(zmmA, zmmB, 0);
+ a.vreducepd(zmmA, anyptr_gpB, 0);
+ a.vreduceps(xmmA, xmmB, 0);
+ a.vreduceps(xmmA, anyptr_gpB, 0);
+ a.vreduceps(ymmA, ymmB, 0);
+ a.vreduceps(ymmA, anyptr_gpB, 0);
+ a.vreduceps(zmmA, zmmB, 0);
+ a.vreduceps(zmmA, anyptr_gpB, 0);
+ a.vreducesd(xmmA, xmmB, xmmC, 0);
+ a.vreducesd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vreducess(xmmA, xmmB, xmmC, 0);
+ a.vreducess(xmmA, xmmB, anyptr_gpC, 0);
+ a.vrndscalepd(xmmA, xmmB, 0);
+ a.vrndscalepd(xmmA, anyptr_gpB, 0);
+ a.vrndscalepd(ymmA, ymmB, 0);
+ a.vrndscalepd(ymmA, anyptr_gpB, 0);
+ a.vrndscalepd(zmmA, zmmB, 0);
+ a.vrndscalepd(zmmA, anyptr_gpB, 0);
+ a.vrndscaleps(xmmA, xmmB, 0);
+ a.vrndscaleps(xmmA, anyptr_gpB, 0);
+ a.vrndscaleps(ymmA, ymmB, 0);
+ a.vrndscaleps(ymmA, anyptr_gpB, 0);
+ a.vrndscaleps(zmmA, zmmB, 0);
+ a.vrndscaleps(zmmA, anyptr_gpB, 0);
+ a.vrndscalesd(xmmA, xmmB, xmmC, 0);
+ a.vrndscalesd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vrndscaless(xmmA, xmmB, xmmC, 0);
+ a.vrndscaless(xmmA, xmmB, anyptr_gpC, 0);
+ a.vrsqrt14pd(xmmA, xmmB);
+ a.vrsqrt14pd(xmmA, anyptr_gpB);
+ a.vrsqrt14pd(ymmA, ymmB);
+ a.vrsqrt14pd(ymmA, anyptr_gpB);
+ a.vrsqrt14pd(zmmA, zmmB);
+ a.vrsqrt14pd(zmmA, anyptr_gpB);
+ a.vrsqrt14ps(xmmA, xmmB);
+ a.vrsqrt14ps(xmmA, anyptr_gpB);
+ a.vrsqrt14ps(ymmA, ymmB);
+ a.vrsqrt14ps(ymmA, anyptr_gpB);
+ a.vrsqrt14ps(zmmA, zmmB);
+ a.vrsqrt14ps(zmmA, anyptr_gpB);
+ a.vrsqrt14sd(xmmA, xmmB, xmmC);
+ a.vrsqrt14sd(xmmA, xmmB, anyptr_gpC);
+ a.vrsqrt14ss(xmmA, xmmB, xmmC);
+ a.vrsqrt14ss(xmmA, xmmB, anyptr_gpC);
+ a.vrsqrt28pd(zmmA, zmmB);
+ a.vrsqrt28pd(zmmA, anyptr_gpB);
+ a.vrsqrt28ps(zmmA, zmmB);
+ a.vrsqrt28ps(zmmA, anyptr_gpB);
+ a.vrsqrt28sd(xmmA, xmmB, xmmC);
+ a.vrsqrt28sd(xmmA, xmmB, anyptr_gpC);
+ a.vrsqrt28ss(xmmA, xmmB, xmmC);
+ a.vrsqrt28ss(xmmA, xmmB, anyptr_gpC);
+ a.vscalefpd(xmmA, xmmB, xmmC);
+ a.vscalefpd(xmmA, xmmB, anyptr_gpC);
+ a.vscalefpd(ymmA, ymmB, ymmC);
+ a.vscalefpd(ymmA, ymmB, anyptr_gpC);
+ a.vscalefpd(zmmA, zmmB, zmmC);
+ a.vscalefpd(zmmA, zmmB, anyptr_gpC);
+ a.vscalefps(xmmA, xmmB, xmmC);
+ a.vscalefps(xmmA, xmmB, anyptr_gpC);
+ a.vscalefps(ymmA, ymmB, ymmC);
+ a.vscalefps(ymmA, ymmB, anyptr_gpC);
+ a.vscalefps(zmmA, zmmB, zmmC);
+ a.vscalefps(zmmA, zmmB, anyptr_gpC);
+ a.vscalefsd(xmmA, xmmB, xmmC);
+ a.vscalefsd(xmmA, xmmB, anyptr_gpC);
+ a.vscalefss(xmmA, xmmB, xmmC);
+ a.vscalefss(xmmA, xmmB, anyptr_gpC);
+ a.vscatterdpd(vx_ptr, xmmB);
+ a.vscatterdpd(vx_ptr, ymmB);
+ a.vscatterdpd(vy_ptr, zmmB);
+ a.vscatterdps(vx_ptr, xmmB);
+ a.vscatterdps(vy_ptr, ymmB);
+ a.vscatterdps(vz_ptr, zmmB);
+ a.vscatterpf0dpd(vy_ptr);
+ a.vscatterpf0dps(vz_ptr);
+ a.vscatterpf0qpd(vz_ptr);
+ a.vscatterpf0qps(vz_ptr);
+ a.vscatterpf1dpd(vy_ptr);
+ a.vscatterpf1dps(vz_ptr);
+ a.vscatterpf1qpd(vz_ptr);
+ a.vscatterpf1qps(vz_ptr);
+ a.vscatterqpd(vx_ptr, xmmB);
+ a.vscatterqpd(vy_ptr, ymmB);
+ a.vscatterqpd(vz_ptr, zmmB);
+ a.vscatterqps(vx_ptr, xmmB);
+ a.vscatterqps(vy_ptr, xmmB);
+ a.vscatterqps(vz_ptr, ymmB);
+ a.vshuff32x4(ymmA, ymmB, ymmC, 0);
+ a.vshuff32x4(ymmA, ymmB, anyptr_gpC, 0);
+ a.vshuff32x4(zmmA, zmmB, zmmC, 0);
+ a.vshuff32x4(zmmA, zmmB, anyptr_gpC, 0);
+ a.vshuff64x2(ymmA, ymmB, ymmC, 0);
+ a.vshuff64x2(ymmA, ymmB, anyptr_gpC, 0);
+ a.vshuff64x2(zmmA, zmmB, zmmC, 0);
+ a.vshuff64x2(zmmA, zmmB, anyptr_gpC, 0);
+ a.vshufi32x4(ymmA, ymmB, ymmC, 0);
+ a.vshufi32x4(ymmA, ymmB, anyptr_gpC, 0);
+ a.vshufi32x4(zmmA, zmmB, zmmC, 0);
+ a.vshufi32x4(zmmA, zmmB, anyptr_gpC, 0);
+ a.vshufi64x2(ymmA, ymmB, ymmC, 0);
+ a.vshufi64x2(ymmA, ymmB, anyptr_gpC, 0);
+ a.vshufi64x2(zmmA, zmmB, zmmC, 0);
+ a.vshufi64x2(zmmA, zmmB, anyptr_gpC, 0);
+ a.vshufpd(xmmA, xmmB, xmmC, 0);
+ a.vshufpd(xmmA, xmmB, anyptr_gpC, 0);
+ a.vshufpd(ymmA, ymmB, ymmC, 0);
+ a.vshufpd(ymmA, ymmB, anyptr_gpC, 0);
+ a.vshufpd(zmmA, zmmB, zmmC, 0);
+ a.vshufpd(zmmA, zmmB, anyptr_gpC, 0);
+ a.vshufps(xmmA, xmmB, xmmC, 0);
+ a.vshufps(xmmA, xmmB, anyptr_gpC, 0);
+ a.vshufps(ymmA, ymmB, ymmC, 0);
+ a.vshufps(ymmA, ymmB, anyptr_gpC, 0);
+ a.vshufps(zmmA, zmmB, zmmC, 0);
+ a.vshufps(zmmA, zmmB, anyptr_gpC, 0);
+ a.vsqrtpd(xmmA, xmmB);
+ a.vsqrtpd(xmmA, anyptr_gpB);
+ a.vsqrtpd(ymmA, ymmB);
+ a.vsqrtpd(ymmA, anyptr_gpB);
+ a.vsqrtpd(zmmA, zmmB);
+ a.vsqrtpd(zmmA, anyptr_gpB);
+ a.vsqrtps(xmmA, xmmB);
+ a.vsqrtps(xmmA, anyptr_gpB);
+ a.vsqrtps(ymmA, ymmB);
+ a.vsqrtps(ymmA, anyptr_gpB);
+ a.vsqrtps(zmmA, zmmB);
+ a.vsqrtps(zmmA, anyptr_gpB);
+ a.vsqrtsd(xmmA, xmmB, xmmC);
+ a.vsqrtsd(xmmA, xmmB, anyptr_gpC);
+ a.vsqrtss(xmmA, xmmB, xmmC);
+ a.vsqrtss(xmmA, xmmB, anyptr_gpC);
+ a.vsubpd(xmmA, xmmB, xmmC);
+ a.vsubpd(xmmA, xmmB, anyptr_gpC);
+ a.vsubpd(ymmA, ymmB, ymmC);
+ a.vsubpd(ymmA, ymmB, anyptr_gpC);
+ a.vsubpd(zmmA, zmmB, zmmC);
+ a.vsubpd(zmmA, zmmB, anyptr_gpC);
+ a.vsubps(xmmA, xmmB, xmmC);
+ a.vsubps(xmmA, xmmB, anyptr_gpC);
+ a.vsubps(ymmA, ymmB, ymmC);
+ a.vsubps(ymmA, ymmB, anyptr_gpC);
+ a.vsubps(zmmA, zmmB, zmmC);
+ a.vsubps(zmmA, zmmB, anyptr_gpC);
+ a.vsubsd(xmmA, xmmB, xmmC);
+ a.vsubsd(xmmA, xmmB, anyptr_gpC);
+ a.vsubss(xmmA, xmmB, xmmC);
+ a.vsubss(xmmA, xmmB, anyptr_gpC);
+ a.vucomisd(xmmA, xmmB);
+ a.vucomisd(xmmA, anyptr_gpB);
+ a.vucomiss(xmmA, xmmB);
+ a.vucomiss(xmmA, anyptr_gpB);
+ a.vunpckhpd(xmmA, xmmB, xmmC);
+ a.vunpckhpd(xmmA, xmmB, anyptr_gpC);
+ a.vunpckhpd(ymmA, ymmB, ymmC);
+ a.vunpckhpd(ymmA, ymmB, anyptr_gpC);
+ a.vunpckhpd(zmmA, zmmB, zmmC);
+ a.vunpckhpd(zmmA, zmmB, anyptr_gpC);
+ a.vunpckhps(xmmA, xmmB, xmmC);
+ a.vunpckhps(xmmA, xmmB, anyptr_gpC);
+ a.vunpckhps(ymmA, ymmB, ymmC);
+ a.vunpckhps(ymmA, ymmB, anyptr_gpC);
+ a.vunpckhps(zmmA, zmmB, zmmC);
+ a.vunpckhps(zmmA, zmmB, anyptr_gpC);
+ a.vunpcklpd(xmmA, xmmB, xmmC);
+ a.vunpcklpd(xmmA, xmmB, anyptr_gpC);
+ a.vunpcklpd(ymmA, ymmB, ymmC);
+ a.vunpcklpd(ymmA, ymmB, anyptr_gpC);
+ a.vunpcklpd(zmmA, zmmB, zmmC);
+ a.vunpcklpd(zmmA, zmmB, anyptr_gpC);
+ a.vunpcklps(xmmA, xmmB, xmmC);
+ a.vunpcklps(xmmA, xmmB, anyptr_gpC);
+ a.vunpcklps(ymmA, ymmB, ymmC);
+ a.vunpcklps(ymmA, ymmB, anyptr_gpC);
+ a.vunpcklps(zmmA, zmmB, zmmC);
+ a.vunpcklps(zmmA, zmmB, anyptr_gpC);
+ a.vxorpd(xmmA, xmmB, xmmC);
+ a.vxorpd(xmmA, xmmB, anyptr_gpC);
+ a.vxorpd(ymmA, ymmB, ymmC);
+ a.vxorpd(ymmA, ymmB, anyptr_gpC);
+ a.vxorpd(zmmA, zmmB, zmmC);
+ a.vxorpd(zmmA, zmmB, anyptr_gpC);
+ a.vxorps(xmmA, xmmB, xmmC);
+ a.vxorps(xmmA, xmmB, anyptr_gpC);
+ a.vxorps(ymmA, ymmB, ymmC);
+ a.vxorps(ymmA, ymmB, anyptr_gpC);
+ a.vxorps(zmmA, zmmB, zmmC);
+ a.vxorps(zmmA, zmmB, anyptr_gpC);
+
+ // Mark the end.
+ a.nop();
+ a.nop();
+ a.nop();
+ a.nop();
+}
+
+} // asmtest namespace
+
+// [Guard]
+#endif // _ASMJIT_TEST_OPCODE_H
diff --git a/test/asmjit_test_unit.cpp b/test/asmjit_test_unit.cpp
new file mode 100644
index 0000000..3860f5d
--- /dev/null
+++ b/test/asmjit_test_unit.cpp
@@ -0,0 +1,261 @@
+// [AsmJit]
+// Complete x86/x64 JIT and Remote Assembler for C++.
+//
+// [License]
+// Zlib - See LICENSE.md file in the package.
+
+// [Dependencies]
+#include "./asmjit.h"
+
+using namespace asmjit;
+
+// ============================================================================
+// [DumpCpu]
+// ============================================================================
+
+struct DumpCpuFeature { // One row of a feature table: a feature id paired with its printable name.
+ uint32_t feature; // Id queried through `CpuInfo::hasFeature()` by dumpCpuFeatures().
+ const char* name; // Text printed by dumpCpuFeatures() when the feature is present.
+};
+
+// Walks `data[0..count)` and prints, one INFO line per entry, the name of
+// every feature that `cpu` reports as present. Absent features print nothing.
+static void dumpCpuFeatures(const CpuInfo& cpu, const DumpCpuFeature* data, size_t count) {
+  const DumpCpuFeature* const end = data + count;
+
+  for (const DumpCpuFeature* entry = data; entry != end; ++entry) {
+    if (!cpu.hasFeature(entry->feature))
+      continue;
+
+    INFO(" %s", entry->name);
+  }
+}
+
+static void dumpCpu(void) { // Logs identification data and the detected feature names of the host CPU through INFO().
+ const CpuInfo& cpu = CpuInfo::getHost(); // Everything printed below describes this host CpuInfo.
+
+ INFO("Host CPU:");
+ INFO(" Vendor string : %s", cpu.getVendorString());
+ INFO(" Brand string : %s", cpu.getBrandString());
+ INFO(" Family : %u", cpu.getFamily());
+ INFO(" Model : %u", cpu.getModel());
+ INFO(" Stepping : %u", cpu.getStepping());
+ INFO(" HW-Threads Count : %u", cpu.getHwThreadsCount());
+ INFO(""); // Empty line terminates the section.
+
+ // --------------------------------------------------------------------------
+ // [ARM / ARM64] - compiled only when ASMJIT_ARCH_ARM32 or ASMJIT_ARCH_ARM64 is non-zero.
+ // --------------------------------------------------------------------------
+
+#if ASMJIT_ARCH_ARM32 || ASMJIT_ARCH_ARM64
+ static const DumpCpuFeature armFeaturesList[] = { // { feature id, label printed when the feature is present }
+ { CpuInfo::kArmFeatureV6 , "ARMv6" },
+ { CpuInfo::kArmFeatureV7 , "ARMv7" },
+ { CpuInfo::kArmFeatureV8 , "ARMv8" },
+ { CpuInfo::kArmFeatureTHUMB , "THUMB" },
+ { CpuInfo::kArmFeatureTHUMB2 , "THUMBv2" },
+ { CpuInfo::kArmFeatureVFP2 , "VFPv2" },
+ { CpuInfo::kArmFeatureVFP3 , "VFPv3" },
+ { CpuInfo::kArmFeatureVFP4 , "VFPv4" },
+ { CpuInfo::kArmFeatureVFP_D32 , "VFP D32" },
+ { CpuInfo::kArmFeatureNEON , "NEON" },
+ { CpuInfo::kArmFeatureDSP , "DSP" },
+ { CpuInfo::kArmFeatureIDIV , "IDIV" },
+ { CpuInfo::kArmFeatureAES , "AES" },
+ { CpuInfo::kArmFeatureCRC32 , "CRC32" },
+ { CpuInfo::kArmFeatureSHA1 , "SHA1" },
+ { CpuInfo::kArmFeatureSHA256 , "SHA256" },
+ { CpuInfo::kArmFeatureAtomics64 , "64-bit atomics" }
+ };
+
+ INFO("ARM Features:");
+ dumpCpuFeatures(cpu, armFeaturesList, ASMJIT_ARRAY_SIZE(armFeaturesList));
+ INFO("");
+#endif
+
+ // --------------------------------------------------------------------------
+ // [X86 / X64] - compiled only when ASMJIT_ARCH_X86 or ASMJIT_ARCH_X64 is non-zero.
+ // --------------------------------------------------------------------------
+
+#if ASMJIT_ARCH_X86 || ASMJIT_ARCH_X64
+ static const DumpCpuFeature x86FeaturesList[] = { // { feature id, label printed when the feature is present }
+ { CpuInfo::kX86FeatureNX , "NX (Non-Execute Bit)" },
+ { CpuInfo::kX86FeatureMT , "MT (Multi-Threading)" },
+ { CpuInfo::kX86FeatureRDTSC , "RDTSC" },
+ { CpuInfo::kX86FeatureRDTSCP , "RDTSCP" },
+ { CpuInfo::kX86FeatureCMOV , "CMOV" },
+ { CpuInfo::kX86FeatureCMPXCHG8B , "CMPXCHG8B" },
+ { CpuInfo::kX86FeatureCMPXCHG16B , "CMPXCHG16B" },
+ { CpuInfo::kX86FeatureCLFLUSH , "CLFLUSH" },
+ { CpuInfo::kX86FeatureCLFLUSH_OPT , "CLFLUSH_OPT" },
+ { CpuInfo::kX86FeatureCLWB , "CLWB" },
+ { CpuInfo::kX86FeaturePCOMMIT , "PCOMMIT" },
+ { CpuInfo::kX86FeaturePREFETCH , "PREFETCH" },
+ { CpuInfo::kX86FeaturePREFETCHWT1 , "PREFETCHWT1" },
+ { CpuInfo::kX86FeatureLAHF_SAHF , "LAHF/SAHF" },
+ { CpuInfo::kX86FeatureFXSR , "FXSR" },
+ { CpuInfo::kX86FeatureFXSR_OPT , "FXSR_OPT" },
+ { CpuInfo::kX86FeatureMMX , "MMX" },
+ { CpuInfo::kX86FeatureMMX2 , "MMX2" },
+ { CpuInfo::kX86Feature3DNOW , "3DNOW" },
+ { CpuInfo::kX86Feature3DNOW2 , "3DNOW2" },
+ { CpuInfo::kX86FeatureSSE , "SSE" },
+ { CpuInfo::kX86FeatureSSE2 , "SSE2" },
+ { CpuInfo::kX86FeatureSSE3 , "SSE3" },
+ { CpuInfo::kX86FeatureSSSE3 , "SSSE3" },
+ { CpuInfo::kX86FeatureSSE4A , "SSE4A" },
+ { CpuInfo::kX86FeatureSSE4_1 , "SSE4.1" },
+ { CpuInfo::kX86FeatureSSE4_2 , "SSE4.2" },
+ { CpuInfo::kX86FeatureMSSE , "Misaligned SSE" },
+ { CpuInfo::kX86FeatureMONITOR , "MONITOR/MWAIT" },
+ { CpuInfo::kX86FeatureMOVBE , "MOVBE" },
+ { CpuInfo::kX86FeaturePOPCNT , "POPCNT" },
+ { CpuInfo::kX86FeatureLZCNT , "LZCNT" },
+ { CpuInfo::kX86FeatureAESNI , "AESNI" },
+ { CpuInfo::kX86FeaturePCLMULQDQ , "PCLMULQDQ" },
+ { CpuInfo::kX86FeatureRDRAND , "RDRAND" },
+ { CpuInfo::kX86FeatureRDSEED , "RDSEED" },
+ { CpuInfo::kX86FeatureSMAP , "SMAP" },
+ { CpuInfo::kX86FeatureSMEP , "SMEP" },
+ { CpuInfo::kX86FeatureSHA , "SHA" },
+ { CpuInfo::kX86FeatureXSAVE , "XSAVE" },
+ { CpuInfo::kX86FeatureXSAVE_OS , "XSAVE (OS)" },
+ { CpuInfo::kX86FeatureAVX , "AVX" },
+ { CpuInfo::kX86FeatureAVX2 , "AVX2" },
+ { CpuInfo::kX86FeatureF16C , "F16C" },
+ { CpuInfo::kX86FeatureFMA3 , "FMA3" },
+ { CpuInfo::kX86FeatureFMA4 , "FMA4" },
+ { CpuInfo::kX86FeatureXOP , "XOP" },
+ { CpuInfo::kX86FeatureBMI , "BMI" },
+ { CpuInfo::kX86FeatureBMI2 , "BMI2" },
+ { CpuInfo::kX86FeatureADX , "ADX" },
+ { CpuInfo::kX86FeatureTBM , "TBM" },
+ { CpuInfo::kX86FeatureMPX , "MPX" },
+ { CpuInfo::kX86FeatureHLE , "HLE" },
+ { CpuInfo::kX86FeatureRTM , "RTM" },
+ { CpuInfo::kX86FeatureERMS , "ERMS" },
+ { CpuInfo::kX86FeatureFSGSBASE , "FSGSBASE" },
+ { CpuInfo::kX86FeatureAVX512_F , "AVX512F" },
+ { CpuInfo::kX86FeatureAVX512_CDI , "AVX512CDI" },
+ { CpuInfo::kX86FeatureAVX512_PFI , "AVX512PFI" },
+ { CpuInfo::kX86FeatureAVX512_ERI , "AVX512ERI" },
+ { CpuInfo::kX86FeatureAVX512_DQ , "AVX512DQ" },
+ { CpuInfo::kX86FeatureAVX512_BW , "AVX512BW" },
+ { CpuInfo::kX86FeatureAVX512_VL , "AVX512VL" },
+ { CpuInfo::kX86FeatureAVX512_IFMA , "AVX512IFMA" },
+ { CpuInfo::kX86FeatureAVX512_VBMI , "AVX512VBMI" }
+ };
+
+ INFO("X86 Specific:"); // X86-only identification values come first, the feature list follows.
+ INFO(" Processor Type : %u", cpu.getX86ProcessorType());
+ INFO(" Brand Index : %u", cpu.getX86BrandIndex());
+ INFO(" CL Flush Cache Line : %u", cpu.getX86FlushCacheLineSize());
+ INFO(" Max logical Processors : %u", cpu.getX86MaxLogicalProcessors());
+ INFO("");
+
+ INFO("X86 Features:");
+ dumpCpuFeatures(cpu, x86FeaturesList, ASMJIT_ARRAY_SIZE(x86FeaturesList));
+ INFO("");
+#endif
+}
+
+// ============================================================================
+// [DumpSizeOf]
+// ============================================================================
+
+// Prints one "<type text>: <sizeof>" line. The macro is variadic and forwards
+// all of its arguments to both the stringification and `sizeof`; it is only
+// defined for the duration of dumpSizeOf() and undefined right after it.
+#define DUMP_TYPE(...) \
+  INFO(" %-26s: %u", #__VA_ARGS__, static_cast<uint32_t>(sizeof(__VA_ARGS__)))
+
+// Logs `sizeof` of built-in types and of the AsmJit types grouped by area.
+// Every group starts with a heading and ends with an empty INFO line so that
+// consecutive groups stay visually separated in the output.
+static void dumpSizeOf(void) {
+  INFO("Size of built-ins:");
+  DUMP_TYPE(int8_t);
+  DUMP_TYPE(int16_t);
+  DUMP_TYPE(int32_t);
+  DUMP_TYPE(int64_t);
+  DUMP_TYPE(int);
+  DUMP_TYPE(long);
+  DUMP_TYPE(size_t);
+  DUMP_TYPE(intptr_t);
+  DUMP_TYPE(float);
+  DUMP_TYPE(double);
+  DUMP_TYPE(void*);
+  INFO("");
+
+  INFO("Size of Base:");
+  DUMP_TYPE(Assembler);
+  DUMP_TYPE(CodeBuffer);
+  DUMP_TYPE(CodeEmitter);
+  DUMP_TYPE(CodeHolder);
+  DUMP_TYPE(ConstPool);
+  DUMP_TYPE(LabelEntry);
+  DUMP_TYPE(RelocEntry);
+  DUMP_TYPE(Runtime);
+  DUMP_TYPE(SectionEntry);
+  DUMP_TYPE(StringBuilder);
+  DUMP_TYPE(Zone);
+  DUMP_TYPE(ZoneHeap);
+  DUMP_TYPE(ZoneHash<ZoneHashNode>);
+  DUMP_TYPE(ZoneList<void*>);
+  DUMP_TYPE(ZoneVector<void*>);
+  INFO("");
+
+  INFO("Size of Operand:");
+  DUMP_TYPE(Operand);
+  DUMP_TYPE(Reg);
+  DUMP_TYPE(Mem);
+  DUMP_TYPE(Imm);
+  DUMP_TYPE(Label);
+  INFO("");
+
+  INFO("Size of Func:");
+  DUMP_TYPE(CallConv);
+  DUMP_TYPE(FuncSignature);
+  DUMP_TYPE(FuncDetail);
+  DUMP_TYPE(FuncDetail::Value);
+  DUMP_TYPE(FuncArgsMapper);
+  DUMP_TYPE(FuncArgsMapper::Value);
+  DUMP_TYPE(FuncFrameInfo);
+  DUMP_TYPE(FuncFrameLayout);
+  INFO(""); // Separator was missing; the next heading used to follow directly.
+
+  INFO("Size of CodeBuilder:");
+  DUMP_TYPE(CodeBuilder);
+  DUMP_TYPE(CBNode);
+  DUMP_TYPE(CBInst);
+  DUMP_TYPE(CBJump);
+  DUMP_TYPE(CBData);
+  DUMP_TYPE(CBAlign);
+  DUMP_TYPE(CBLabel);
+  DUMP_TYPE(CBComment);
+  DUMP_TYPE(CBSentinel);
+  INFO(""); // Separator was missing; the next heading used to follow directly.
+
+#if !defined(ASMJIT_DISABLE_COMPILER)
+  INFO("Size of CodeCompiler:");
+  DUMP_TYPE(CodeCompiler);
+  DUMP_TYPE(CCFunc);
+  DUMP_TYPE(CCFuncRet);
+  DUMP_TYPE(CCFuncCall);
+  INFO("");
+#endif // !ASMJIT_DISABLE_COMPILER
+
+#if defined(ASMJIT_BUILD_X86)
+  INFO("Size of X86-Backend:");
+  DUMP_TYPE(X86Assembler);
+#if !defined(ASMJIT_DISABLE_COMPILER)
+  DUMP_TYPE(X86Compiler);
+#endif // !ASMJIT_DISABLE_COMPILER
+  DUMP_TYPE(X86Inst);
+  DUMP_TYPE(X86Inst::CommonData);
+  DUMP_TYPE(X86Inst::ISignature);
+  DUMP_TYPE(X86Inst::OSignature);
+  INFO("");
+#endif // ASMJIT_BUILD_X86
+}
+
+#undef DUMP_TYPE
+
+// ============================================================================
+// [Main]
+// ============================================================================
+
+static void onBeforeRun(void) { // Callback handed to BrokenAPI::run() by main(); prints the two diagnostic reports.
+ dumpCpu(); // Host CPU identification and feature report.
+ dumpSizeOf(); // sizeof() report for built-in and AsmJit types.
+}
+
+// Entry point of the unit-test runner: prints a banner, then hands the command
+// line and the `onBeforeRun` callback to BrokenAPI and returns its result.
+int main(int argc, const char* argv[]) {
+  INFO("AsmJit Unit-Test\n\n");
+
+  const int exitCode = BrokenAPI::run(argc, argv, onBeforeRun);
+  return exitCode;
+}
diff --git a/test/asmjit_test_x86_asm.cpp b/test/asmjit_test_x86_asm.cpp
new file mode 100644
index 0000000..6b6e990
--- /dev/null
+++ b/test/asmjit_test_x86_asm.cpp
@@ -0,0 +1,95 @@
+// [AsmJit]
+// Complete x86/x64 JIT and Remote Assembler for C++.
+//
+// [License]
+// Zlib - See LICENSE.md file in the package.
+
+// [Dependencies]
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+
+#include "./asmjit.h"
+
+using namespace asmjit;
+
+// Signature of the generated function.
+typedef void (*SumIntsFunc)(int* dst, const int* a, const int* b);
+
+// Emits the body of a `SumIntsFunc`: loads four ints from each source, adds them and stores the result to `dst`.
+// Written against `X86Emitter`, so it works for both X86Assembler and X86Builder and keeps the code generic.
+static void makeFunc(X86Emitter* emitter) {
+ // Decide which registers will be mapped to function arguments. Try changing
+ // registers of `dst`, `src_a`, and `src_b` and see what happens in function's
+ // prolog and epilog.
+ X86Gp dst = emitter->zax();
+ X86Gp src_a = emitter->zcx();
+ X86Gp src_b = emitter->zdx();
+
+ // Decide which vector registers to use. We use these to keep the code generic,
+ // you can switch to any other registers when needed.
+ X86Xmm vec0 = x86::xmm0;
+ X86Xmm vec1 = x86::xmm1;
+
+ // Create and initialize `FuncDetail` and `FuncFrameInfo`. Both are
+ // needed to create a function and they hold different kinds of data.
+ FuncDetail func;
+ func.init(FuncSignature3<void, int*, const int*, const int*>(CallConv::kIdHost)); // Same signature as `SumIntsFunc`, host calling convention.
+
+ FuncFrameInfo ffi;
+ ffi.setDirtyRegs(X86Reg::kKindVec, // Make XMM0 and XMM1 dirty. VEC kind
+ Utils::mask(0, 1)); // describes XMM|YMM|ZMM registers.
+
+ FuncArgsMapper args(&func); // Create function arguments mapper.
+ args.assignAll(dst, src_a, src_b); // Assign our registers to arguments.
+ args.updateFrameInfo(ffi); // Reflect our args in FuncFrameInfo.
+
+ FuncFrameLayout layout; // Create the FuncFrameLayout, which
+ layout.init(func, ffi); // contains metadata of prolog/epilog.
+
+ // Emit function prolog and allocate arguments to registers.
+ FuncUtils::emitProlog(emitter, layout);
+ FuncUtils::allocArgs(emitter, layout, args);
+
+ emitter->movdqu(vec0, x86::ptr(src_a)); // Load 4 ints from [src_a] to XMM0.
+ emitter->movdqu(vec1, x86::ptr(src_b)); // Load 4 ints from [src_b] to XMM1.
+ emitter->paddd(vec0, vec1); // Add 4 ints in XMM1 to XMM0.
+ emitter->movdqu(x86::ptr(dst), vec0); // Store the result to [dst].
+
+ // Emit function epilog and return.
+ FuncUtils::emitEpilog(emitter, layout);
+}
+
+// Generates the SumInts function with X86Assembler, runs it once on fixed
+// input and returns 0 only when the four sums match the expected values.
+int main(int argc, char* argv[]) {
+  JitRuntime runtime;                    // JIT runtime the function is added to.
+
+  CodeHolder holder;                     // Holds the code being generated.
+  holder.init(runtime.getCodeInfo());    // Initialize it to match `runtime`.
+  X86Assembler assembler(&holder);       // Assembler attached to `holder`.
+
+  FileLogger stderrLogger(stderr);       // Log emitted code to stderr.
+  holder.setLogger(&stderrLogger);
+
+  makeFunc(assembler.asEmitter());
+
+  // Add the generated code to the runtime; a non-zero Error means failure.
+  SumIntsFunc sumInts;
+  if (runtime.add(&sumInts, &holder))
+    return 1;
+
+  // Execute the generated function on known input.
+  int lhs[4] = { 4, 3, 2, 1 };
+  int rhs[4] = { 1, 5, 2, 8 };
+  int sum[4];
+  sumInts(sum, lhs, rhs);
+
+  // Prints {5 8 4 9}
+  printf("{%d %d %d %d}\n", sum[0], sum[1], sum[2], sum[3]);
+
+  runtime.release(sumInts);
+
+  const bool matches = sum[0] == 5 && sum[1] == 8 && sum[2] == 4 && sum[3] == 9;
+  return matches ? 0 : 1;
+}
diff --git a/test/asmjit_test_x86_cc.cpp b/test/asmjit_test_x86_cc.cpp
new file mode 100644
index 0000000..4aaee06
--- /dev/null
+++ b/test/asmjit_test_x86_cc.cpp
@@ -0,0 +1,3429 @@
+// [AsmJit]
+// Complete x86/x64 JIT and Remote Assembler for C++.
+//
+// [License]
+// Zlib - See LICENSE.md file in the package.
+
+// [Dependencies]
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+
+#include "./asmjit.h"
+#include "./asmjit_test_misc.h"
+
+using namespace asmjit;
+
+// ============================================================================
+// [MyErrorHandler]
+// ============================================================================
+
+class MyErrorHandler : public ErrorHandler { // Error handler that writes every reported message to stderr.
+public:
+ virtual bool handleError(Error err, const char* message, CodeEmitter* origin) { // `err` and `origin` are not used by this handler.
+ fprintf(stderr, "ERROR: %s\n", message);
+ return false; // NOTE(review): presumably signals "not handled" to AsmJit - confirm against the ErrorHandler contract.
+ }
+};
+
+// ============================================================================
+// [X86Test]
+// ============================================================================
+
+//! Interface used to test CodeCompiler: one named test that emits a function and then checks the generated code.
+class X86Test {
+public:
+ X86Test(const char* name = NULL) { _name.setString(name); } // Subclasses may pass NULL and fill `_name` later.
+ virtual ~X86Test() {} // Virtual: X86TestManager deletes tests through X86Test*.
+
+ ASMJIT_INLINE const char* getName() const { return _name.getData(); }
+
+ virtual void compile(X86Compiler& c) = 0; // Emits the function under test into `c`.
+ virtual bool run(void* func, StringBuilder& result, StringBuilder& expect) = 0; // Runs `func`; returns true on success and fills `result`/`expect` for the failure report.
+
+ StringBuilder _name; // Test name printed in the [Success]/[Failure] lines.
+};
+
+// ============================================================================
+// [X86TestManager]
+// ============================================================================
+
+class X86TestManager { // Collects X86Test instances, compiles and runs each one, and reports the results.
+public:
+ // --------------------------------------------------------------------------
+ // [Construction / Destruction]
+ // --------------------------------------------------------------------------
+
+ X86TestManager();
+ ~X86TestManager(); // Deletes every test that was registered through add().
+
+ // --------------------------------------------------------------------------
+ // [Methods]
+ // --------------------------------------------------------------------------
+
+ inline Error add(X86Test* test) { return _tests.append(&_zoneHeap, test); } // Registers `test`; returns the Error produced by the vector append.
+
+ int run(); // Runs all registered tests and returns `_returnCode` (0, or 1 if any test failed).
+
+ // --------------------------------------------------------------------------
+ // [Members]
+ // --------------------------------------------------------------------------
+
+ Zone _zone; // Backing zone for `_zoneHeap`.
+ ZoneHeap _zoneHeap; // Allocator used by `_tests`.
+ ZoneVector<X86Test*> _tests; // Registered tests, deleted by the destructor.
+
+ int _returnCode; // Set to 1 by run() when a test fails.
+ int _binSize; // Initialized to 0; not otherwise used in the code visible here.
+ bool _verbose; // When true run() logs generated code directly to the output file.
+ StringBuilder _output; // Text printed by run() after all tests have finished.
+};
+
+X86TestManager::X86TestManager() : // Starts with no tests, a zero return code and verbose output disabled.
+ _zone(8096 - Zone::kZoneOverhead), // NOTE(review): 8096 may be a typo for 8192 - confirm the intended zone block size.
+ _zoneHeap(&_zone),
+ _returnCode(0),
+ _binSize(0),
+ _verbose(false) {}
+
+// Destroys every registered test in registration order; the pointers handed
+// to add() are deleted here.
+X86TestManager::~X86TestManager() {
+  for (size_t index = 0, total = _tests.getLength(); index < total; index++)
+    delete _tests[index];
+}
+
+int X86TestManager::run() { // Compiles, executes and reports every registered test; returns 0 if all passed, 1 otherwise.
+ size_t i;
+ size_t count = _tests.getLength();
+
+ FILE* file = stdout; // All reporting goes to stdout.
+
+#if !defined(ASMJIT_DISABLE_LOGGING)
+ FileLogger fileLogger(file); // Used in verbose mode: logs straight to `file`.
+ fileLogger.addOptions(Logger::kOptionBinaryForm);
+
+ StringLogger stringLogger; // Used otherwise: buffers the log so it is printed only on failure.
+ stringLogger.addOptions(Logger::kOptionBinaryForm);
+#endif // ASMJIT_DISABLE_LOGGING
+
+ MyErrorHandler errorHandler;
+
+ for (i = 0; i < count; i++) {
+ JitRuntime runtime; // A fresh runtime and CodeHolder are created for every test.
+
+ CodeHolder code;
+ code.init(runtime.getCodeInfo());
+ code.setErrorHandler(&errorHandler);
+
+#if !defined(ASMJIT_DISABLE_LOGGING)
+ if (_verbose) {
+ fprintf(file, "\n");
+ code.setLogger(&fileLogger);
+ }
+ else {
+ stringLogger.clearString(); // Drop the log left over from the previous test.
+ code.setLogger(&stringLogger);
+ }
+#endif // ASMJIT_DISABLE_LOGGING
+
+ X86Compiler cc(&code);
+ X86Test* test = _tests[i];
+ test->compile(cc);
+
+ Error err = cc.finalize();
+ void* func; // Only assigned and used when finalize() and runtime.add() succeed.
+
+ if (err == kErrorOk)
+ err = runtime.add(&func, &code);
+ if (_verbose) fflush(file);
+
+ if (err == kErrorOk) {
+ StringBuilder result;
+ StringBuilder expect;
+
+ if (test->run(func, result, expect)) {
+ fprintf(file, "[Success] %s.\n", test->getName());
+ }
+ else {
+#if !defined(ASMJIT_DISABLE_LOGGING)
+ if (!_verbose)
+ fprintf(file, "\n%s", stringLogger.getString()); // Show the buffered code log for the failing test.
+#endif // ASMJIT_DISABLE_LOGGING
+
+ fprintf(file, "-------------------------------------------------------------------------------\n");
+ fprintf(file, "[Failure] %s.\n", test->getName());
+ fprintf(file, "-------------------------------------------------------------------------------\n");
+ fprintf(file, "Result : %s\n", result.getData());
+ fprintf(file, "Expected: %s\n", expect.getData());
+ fprintf(file, "===============================================================================\n");
+
+ _returnCode = 1;
+ }
+
+ runtime.release(func);
+ }
+ else { // Code generation or runtime.add() failed: report the error string instead of running the test.
+#if !defined(ASMJIT_DISABLE_LOGGING)
+ if (!_verbose)
+ fprintf(file, "%s\n", stringLogger.getString());
+#endif // ASMJIT_DISABLE_LOGGING
+
+ fprintf(file, "-------------------------------------------------------------------------------\n");
+ fprintf(file, "[Failure] %s (%s).\n", test->getName(), DebugUtils::errorAsString(err));
+ fprintf(file, "===============================================================================\n");
+
+ _returnCode = 1;
+ }
+
+ fflush(file);
+ }
+
+ fputs("\n", file);
+ fputs(_output.getData(), file); // Print whatever was collected in `_output`.
+ fflush(file);
+
+ return _returnCode;
+}
+
+// ============================================================================
+// [X86Test_AlignBase]
+// ============================================================================
+
+class X86Test_AlignBase : public X86Test { // Checks stack-slot alignment and argument passing for one (numArgs, numVars, alignment, naked) combination.
+public:
+ X86Test_AlignBase(uint32_t numArgs, uint32_t numVars, uint32_t alignment, bool naked) :
+ _numArgs(numArgs),
+ _numVars(numVars),
+ _alignment(alignment),
+ _naked(naked) {
+
+ _name.setFormat("[Align] NumArgs=%u NumVars=%u Alignment=%u Naked=%c",
+ numArgs, numVars, alignment, naked ? 'Y' : 'N');
+ }
+
+ static void add(X86TestManager& mgr) { // Registers every combination of 0..8 args, 0..4 vars, alignment 16/32 and naked on/off.
+ for (uint32_t i = 0; i <= 8; i++) {
+ for (uint32_t j = 0; j <= 4; j++) {
+ for (uint32_t a = 16; a <= 32; a += 16) {
+ mgr.add(new X86Test_AlignBase(i, j, a, false));
+ mgr.add(new X86Test_AlignBase(i, j, a, true));
+ }
+ }
+ }
+ }
+
+ virtual void compile(X86Compiler& cc) { // Emits `int f(int x _numArgs)` returning (stack address & (_alignment - 1)) | sum of arguments.
+ switch (_numArgs) {
+ case 0: cc.addFunc(FuncSignature0<int>(CallConv::kIdHost)); break;
+ case 1: cc.addFunc(FuncSignature1<int, int>(CallConv::kIdHost)); break;
+ case 2: cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost)); break;
+ case 3: cc.addFunc(FuncSignature3<int, int, int, int>(CallConv::kIdHost)); break;
+ case 4: cc.addFunc(FuncSignature4<int, int, int, int, int>(CallConv::kIdHost)); break;
+ case 5: cc.addFunc(FuncSignature5<int, int, int, int, int, int>(CallConv::kIdHost)); break;
+ case 6: cc.addFunc(FuncSignature6<int, int, int, int, int, int, int>(CallConv::kIdHost)); break;
+ case 7: cc.addFunc(FuncSignature7<int, int, int, int, int, int, int, int>(CallConv::kIdHost)); break;
+ case 8: cc.addFunc(FuncSignature8<int, int, int, int, int, int, int, int, int>(CallConv::kIdHost)); break;
+ }
+
+ if (!_naked)
+ cc.getFunc()->getFrameInfo().enablePreservedFP(); // Non-naked variants request a preserved frame pointer.
+
+ X86Gp gpVar = cc.newIntPtr("gpVar");
+ X86Gp gpSum = cc.newInt32("gpSum");
+ X86Mem stack = cc.newStack(_alignment, _alignment); // Stack slot of `_alignment` bytes aligned to `_alignment`.
+
+ // Alloc, use and spill preserved registers.
+ if (_numVars) {
+ uint32_t gpCount = cc.getGpCount();
+ uint32_t varIndex = 0;
+ uint32_t physId = 0;
+ uint32_t regMask = 0x1;
+ uint32_t preservedMask = cc.getFunc()->getDetail().getPreservedRegs(Reg::kKindGp);
+
+ do { // Walk physical GP ids until `_numVars` preserved registers (excluding SP and BP) were used.
+ if ((preservedMask & regMask) != 0 && (physId != X86Gp::kIdSp && physId != X86Gp::kIdBp)) {
+ X86Gp tmp = cc.newInt32("gpTmp%u", physId);
+ cc.alloc(tmp, physId);
+ cc.xor_(tmp, tmp);
+ cc.spill(tmp);
+ varIndex++;
+ }
+
+ physId++;
+ regMask <<= 1;
+ } while (varIndex < _numVars && physId < gpCount);
+ }
+
+ // Do a sum of arguments to verify a possible relocation when misaligned.
+ if (_numArgs) {
+ cc.xor_(gpSum, gpSum);
+ for (uint32_t argIndex = 0; argIndex < _numArgs; argIndex++) {
+ X86Gp gpArg = cc.newInt32("gpArg%u", argIndex);
+
+ cc.setArg(argIndex, gpArg);
+ cc.add(gpSum, gpArg);
+ }
+ }
+
+ // Check alignment of the stack slot: the address bits below `_alignment` must all be zero.
+ cc.lea(gpVar, stack);
+ cc.and_(gpVar, _alignment - 1);
+
+ // Add a sum of arguments to check whether they are correct.
+ if (_numArgs)
+ cc.or_(gpVar.r32(), gpSum);
+
+ cc.ret(gpVar);
+ cc.endFunc();
+ }
+
+ virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) { // Calls the function with 1.._numArgs and expects the plain sum, i.e. zero misalignment bits.
+ typedef int (*Func0)();
+ typedef int (*Func1)(int);
+ typedef int (*Func2)(int, int);
+ typedef int (*Func3)(int, int, int);
+ typedef int (*Func4)(int, int, int, int);
+ typedef int (*Func5)(int, int, int, int, int);
+ typedef int (*Func6)(int, int, int, int, int, int);
+ typedef int (*Func7)(int, int, int, int, int, int, int);
+ typedef int (*Func8)(int, int, int, int, int, int, int, int);
+
+ unsigned int resultRet = 0;
+ unsigned int expectRet = 0;
+
+ switch (_numArgs) {
+ case 0:
+ resultRet = ptr_as_func<Func0>(_func)();
+ expectRet = 0;
+ break;
+ case 1:
+ resultRet = ptr_as_func<Func1>(_func)(1);
+ expectRet = 1;
+ break;
+ case 2:
+ resultRet = ptr_as_func<Func2>(_func)(1, 2);
+ expectRet = 1 + 2;
+ break;
+ case 3:
+ resultRet = ptr_as_func<Func3>(_func)(1, 2, 3);
+ expectRet = 1 + 2 + 3;
+ break;
+ case 4:
+ resultRet = ptr_as_func<Func4>(_func)(1, 2, 3, 4);
+ expectRet = 1 + 2 + 3 + 4;
+ break;
+ case 5:
+ resultRet = ptr_as_func<Func5>(_func)(1, 2, 3, 4, 5);
+ expectRet = 1 + 2 + 3 + 4 + 5;
+ break;
+ case 6:
+ resultRet = ptr_as_func<Func6>(_func)(1, 2, 3, 4, 5, 6);
+ expectRet = 1 + 2 + 3 + 4 + 5 + 6;
+ break;
+ case 7:
+ resultRet = ptr_as_func<Func7>(_func)(1, 2, 3, 4, 5, 6, 7);
+ expectRet = 1 + 2 + 3 + 4 + 5 + 6 + 7;
+ break;
+ case 8:
+ resultRet = ptr_as_func<Func8>(_func)(1, 2, 3, 4, 5, 6, 7, 8);
+ expectRet = 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8;
+ break;
+ }
+
+ result.setFormat("ret={%u, %u}", resultRet >> 28, resultRet & 0x0FFFFFFFU); // Printed as {top 4 bits, low 28 bits}.
+ expect.setFormat("ret={%u, %u}", expectRet >> 28, expectRet & 0x0FFFFFFFU);
+
+ return resultRet == expectRet;
+ }
+
+ uint32_t _numArgs; // Number of int arguments of the generated function (0..8).
+ uint32_t _numVars; // Number of preserved GP registers to allocate and spill.
+ uint32_t _alignment; // Size and alignment requested for the stack slot.
+
+ bool _naked; // When false, compile() enables the preserved frame pointer.
+};
+
+// ============================================================================
+// [X86Test_AlignNone]
+// ============================================================================
+
+// Test that emits `align` directives with alignment 0 and 1 into an otherwise
+// empty function and only verifies that the generated function can be called.
+class X86Test_AlignNone : public X86Test {
+public:
+  X86Test_AlignNone() : X86Test("[Align] None") {}
+
+  static void add(X86TestManager& mgr) {
+    X86Test* test = new X86Test_AlignNone();
+    mgr.add(test);
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<void>(CallConv::kIdHost));
+
+    // Both alignments are emitted back to back, 0 first.
+    cc.align(kAlignCode, 0);
+    cc.align(kAlignCode, 1);
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(void);
+
+    // Nothing to compare; returning from the call is the whole test.
+    ptr_as_func<Func>(_func)();
+    return true;
+  }
+};
+
+// ============================================================================
+// [X86Test_JumpCross]
+// ============================================================================
+
+class X86Test_JumpCross : public X86Test { // Emits jumps that cross each other: entry -> L2 -> L1 -> L3 -> ret.
+public:
+ X86Test_JumpCross() : X86Test("[Jump] Cross jump") {}
+
+ static void add(X86TestManager& mgr) {
+ mgr.add(new X86Test_JumpCross());
+ }
+
+ virtual void compile(X86Compiler& cc) {
+ cc.addFunc(FuncSignature0<void>(CallConv::kIdHost));
+
+ Label L1 = cc.newLabel();
+ Label L2 = cc.newLabel();
+ Label L3 = cc.newLabel();
+
+ cc.jmp(L2); // Forward jump over the L1 block.
+
+ cc.bind(L1);
+ cc.jmp(L3); // Forward jump over the L2 block to the exit.
+
+ cc.bind(L2);
+ cc.jmp(L1); // Backward jump to L1.
+
+ cc.bind(L3);
+
+ cc.ret();
+ cc.endFunc();
+ }
+
+ virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) { // Passes as long as the call returns; nothing is compared.
+ typedef void (*Func)(void);
+ Func func = ptr_as_func<Func>(_func);
+
+ func();
+ return true;
+ }
+};
+
+// ============================================================================
+// [X86Test_JumpMany]
+// ============================================================================
+
+// Test that emits a long chain of trivial jumps (each one targets the label
+// bound immediately after it) and expects the function to return zero.
+class X86Test_JumpMany : public X86Test {
+public:
+  X86Test_JumpMany() : X86Test("[Jump] Many jumps") {}
+
+  static void add(X86TestManager& mgr) {
+    X86Test* test = new X86Test_JumpMany();
+    mgr.add(test);
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    // 1000 jump/label pairs in a row.
+    for (uint32_t remaining = 1000; remaining != 0; remaining--) {
+      Label next = cc.newLabel();
+      cc.jmp(next);
+      cc.bind(next);
+    }
+
+    // Return value is zeroed with xor and handed back to the caller.
+    X86Gp ret = cc.newInt32("ret");
+    cc.xor_(ret, ret);
+    cc.ret(ret);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(void);
+
+    const int expectRet = 0;
+    const int resultRet = ptr_as_func<Func>(_func)();
+
+    result.setFormat("ret={%d}", resultRet);
+    expect.setFormat("ret={%d}", expectRet);
+
+    return resultRet == expectRet;
+  }
+};
+
+// ============================================================================
+// [X86Test_JumpUnreachable1]
+// ============================================================================
+
+// Emits a function whose very first instruction jumps to the exit label.
+// Everything emitted between that jump and the exit label is only entered
+// through labels that nothing jumps to (L_4, L_5, L_6, L_7).
+class X86Test_JumpUnreachable1 : public X86Test {
+public:
+  X86Test_JumpUnreachable1() : X86Test("[Jump] Unreachable #1") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_JumpUnreachable1());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<void>(CallConv::kIdHost));
+
+    // Labels are created in the same order as before: exit first, then 2..7.
+    Label L_Exit = cc.newLabel();
+    Label L_Two = cc.newLabel();
+    Label L_Three = cc.newLabel();
+    Label L_Four = cc.newLabel();
+    Label L_Five = cc.newLabel();
+    Label L_Six = cc.newLabel();
+    Label L_Seven = cc.newLabel();
+
+    X86Gp lhs = cc.newUInt32("v0");
+    X86Gp rhs = cc.newUInt32("v1");
+
+    // Two labels bound to the same position at the function entry.
+    cc.bind(L_Two);
+    cc.bind(L_Three);
+    cc.jmp(L_Exit);
+
+    // No jump targets L_Five or L_Six.
+    cc.bind(L_Five);
+    cc.mov(lhs, 0);
+
+    cc.bind(L_Six);
+    cc.jmp(L_Three);
+    cc.mov(rhs, 1);
+    cc.jmp(L_Exit);
+
+    // No jump targets L_Four or L_Seven either.
+    cc.bind(L_Four);
+    cc.jmp(L_Two);
+    cc.bind(L_Seven);
+    cc.add(lhs, rhs);
+
+    cc.align(kAlignCode, 16);
+    cc.bind(L_Exit);
+    cc.ret();
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(void);
+
+    Func fn = ptr_as_func<Func>(_func);
+    fn();
+
+    // The function returns nothing, so both sides report an empty result.
+    result.appendString("ret={}");
+    expect.appendString("ret={}");
+
+    return true;
+  }
+};
+
+// ============================================================================
+// [X86Test_JumpUnreachable2]
+// ============================================================================
+
+// Emits a function that jumps straight to its exit label; the block bound to
+// the second label is only ever targeted by a jump located inside that block.
+class X86Test_JumpUnreachable2 : public X86Test {
+public:
+  X86Test_JumpUnreachable2() : X86Test("[Jump] Unreachable #2") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_JumpUnreachable2());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<void>(CallConv::kIdHost));
+
+    Label L_Exit = cc.newLabel();
+    Label L_Dead = cc.newLabel();
+
+    X86Gp lhs = cc.newUInt32("v0");
+    X86Gp rhs = cc.newUInt32("v1");
+
+    cc.jmp(L_Exit);
+
+    // Self-looping block that follows the unconditional jump above.
+    cc.bind(L_Dead);
+    cc.mov(lhs, 1);
+    cc.mov(rhs, 2);
+    cc.cmp(lhs, rhs);
+    cc.jz(L_Dead);
+    cc.jmp(L_Exit);
+
+    cc.bind(L_Exit);
+    cc.ret();
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(void);
+
+    Func fn = ptr_as_func<Func>(_func);
+    fn();
+
+    // The function returns nothing, so both sides report an empty result.
+    result.appendString("ret={}");
+    expect.appendString("ret={}");
+
+    return true;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocBase]
+// ============================================================================
+
+// Basic allocation test: loads the constants 1..4 into four virtual registers
+// and accumulates them into a fifth one that is returned.
+class X86Test_AllocBase : public X86Test {
+public:
+  X86Test_AllocBase() : X86Test("[Alloc] Base") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocBase());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    X86Gp acc = cc.newInt32("v0");
+    X86Gp one = cc.newInt32("v1");
+    X86Gp two = cc.newInt32("v2");
+    X86Gp three = cc.newInt32("v3");
+    X86Gp four = cc.newInt32("v4");
+
+    // acc = 0.
+    cc.xor_(acc, acc);
+
+    cc.mov(one, 1);
+    cc.mov(two, 2);
+    cc.mov(three, 3);
+    cc.mov(four, 4);
+
+    // acc += 1 + 2 + 3 + 4.
+    cc.add(acc, one);
+    cc.add(acc, two);
+    cc.add(acc, three);
+    cc.add(acc, four);
+
+    cc.ret(acc);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(void);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const int actual = fn();
+    const int wanted = 1 + 2 + 3 + 4;
+
+    result.setFormat("ret=%d", actual);
+    expect.setFormat("ret=%d", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocManual]
+// ============================================================================
+
+// Explicitly spills two zeroed virtual registers and then uses them inside a
+// 32-iteration loop that computes 1 + 2 + ... + 32.
+class X86Test_AllocManual : public X86Test {
+public:
+  X86Test_AllocManual() : X86Test("[Alloc] Manual alloc/spill") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocManual());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    X86Gp sum = cc.newInt32("v0");
+    X86Gp step = cc.newInt32("v1");
+    X86Gp left = cc.newInt32("cnt");
+
+    // Zero both registers and spill them before the loop starts.
+    cc.xor_(sum, sum);
+    cc.xor_(step, step);
+    cc.spill(sum);
+    cc.spill(step);
+
+    Label L_Loop = cc.newLabel();
+    cc.mov(left, 32);
+    cc.bind(L_Loop);
+
+    // step = 1, 2, ..., 32; sum accumulates each step.
+    cc.inc(step);
+    cc.add(sum, step);
+
+    cc.dec(left);
+    cc.jnz(L_Loop);
+
+    cc.ret(sum);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(void);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const int actual = fn();
+
+    // Sum of the integers 0..32 (inclusive), i.e. 528.
+    const int wanted = (32 * (32 + 1)) / 2;
+
+    result.setFormat("ret=%d", actual);
+    expect.setFormat("ret=%d", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocUseMem]
+// ============================================================================
+
+// Counts from the first argument up to the second one, comparing the counter
+// against the spilled end value through its memory operand (`iEnd.m()`).
+class X86Test_AllocUseMem : public X86Test {
+public:
+  X86Test_AllocUseMem() : X86Test("[Alloc] Alloc/use mem") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocUseMem());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    X86Gp cur = cc.newInt32("iIdx");
+    X86Gp end = cc.newInt32("iEnd");
+
+    X86Gp argCur = cc.newInt32("aIdx");
+    X86Gp argEnd = cc.newInt32("aEnd");
+
+    Label L_Loop = cc.newLabel();
+
+    cc.setArg(0, argCur);
+    cc.setArg(1, argEnd);
+
+    // Work on copies of the arguments; the end value is spilled explicitly.
+    cc.mov(cur, argCur);
+    cc.mov(end, argEnd);
+    cc.spill(end);
+
+    // do { cur++; } while (cur != end);
+    cc.bind(L_Loop);
+    cc.inc(cur);
+    cc.cmp(cur, end.m());
+    cc.jne(L_Loop);
+
+    cc.ret(cur);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const int actual = fn(10, 20);
+    const int wanted = 20;
+
+    result.setFormat("ret=%d", actual);
+    expect.setFormat("ret=%d", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocMany1]
+// ============================================================================
+
+// Keeps `kCount` virtual registers holding 1..kCount alive while computing
+// their sum and their negated sum; both results are stored through the two
+// pointer arguments.
+class X86Test_AllocMany1 : public X86Test {
+public:
+  X86Test_AllocMany1() : X86Test("[Alloc] Many #1") {}
+
+  enum { kCount = 8 };
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocMany1());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<void, int*, int*>(CallConv::kIdHost));
+
+    X86Gp outSum = cc.newIntPtr("a0");
+    X86Gp outNeg = cc.newIntPtr("a1");
+
+    cc.setArg(0, outSum);
+    cc.setArg(1, outNeg);
+
+    // Accumulator plus the block of value registers.
+    X86Gp acc = cc.newInt32("t");
+    X86Gp vals[kCount];
+
+    uint32_t n;
+    for (n = 0; n < kCount; n++) {
+      vals[n] = cc.newInt32("x%u", n);
+    }
+
+    // Initialize with reg/imm moves to see if the register allocator works.
+    for (n = 0; n < kCount; n++) {
+      cc.mov(vals[n], static_cast<int>(n + 1));
+    }
+
+    // acc = x0 + x1 + ... ; stored through the first argument.
+    cc.xor_(acc, acc);
+    for (n = 0; n < kCount; n++) {
+      cc.add(acc, vals[n]);
+    }
+    cc.mov(x86::dword_ptr(outSum), acc);
+
+    // acc = 0 - x0 - x1 - ... ; stored through the second argument.
+    cc.xor_(acc, acc);
+    for (n = 0; n < kCount; n++) {
+      cc.sub(acc, vals[n]);
+    }
+    cc.mov(x86::dword_ptr(outNeg), acc);
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(int*, int*);
+    Func fn = ptr_as_func<Func>(_func);
+
+    // 1 + 2 + ... + 8 == 36.
+    const int wantedX = 36;
+    const int wantedY = -36;
+
+    int actualX;
+    int actualY;
+    fn(&actualX, &actualY);
+
+    result.setFormat("ret={x=%d, y=%d}", actualX, actualY);
+    expect.setFormat("ret={x=%d, y=%d}", wantedX, wantedY);
+
+    return actualX == wantedX && actualY == wantedY;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocMany2]
+// ============================================================================
+
+// Keeps `kCount` virtual registers alive across a loop. Register `i` starts at
+// zero and gets `i` added on each of the `kIterations` passes, so the value
+// stored to `a[i]` is expected to be `i * kIterations`.
+class X86Test_AllocMany2 : public X86Test {
+public:
+  X86Test_AllocMany2() : X86Test("[Alloc] Many #2") {}
+
+  enum {
+    kCount = 32,      // Number of virtual registers / output slots.
+    kIterations = 32  // Number of passes through the accumulation loop.
+  };
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocMany2());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature1<void, int*>(CallConv::kIdHost));
+
+    X86Gp var[kCount];
+    X86Gp a = cc.newIntPtr("a");
+
+    cc.setArg(0, a);
+
+    int i;
+    for (i = 0; i < kCount; i++) {
+      var[i] = cc.newInt32("var[%d]", i);
+    }
+
+    // Zero every register.
+    for (i = 0; i < kCount; i++) {
+      cc.xor_(var[i], var[i]);
+    }
+
+    X86Gp v0 = cc.newInt32("v0");
+    Label L = cc.newLabel();
+
+    cc.mov(v0, static_cast<int>(kIterations));
+    cc.bind(L);
+
+    // Loop body: var[i] += i for all registers.
+    for (i = 0; i < kCount; i++) {
+      cc.add(var[i], i);
+    }
+
+    cc.dec(v0);
+    cc.jnz(L);
+
+    // Store all registers to the output buffer.
+    for (i = 0; i < kCount; i++) {
+      cc.mov(x86::dword_ptr(a, i * 4), var[i]);
+    }
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(int*);
+    Func func = ptr_as_func<Func>(_func);
+
+    int i;
+    int resultBuf[kCount];
+    int expectBuf[kCount];
+
+    for (i = 0; i < kCount; i++) {
+      expectBuf[i] = i * kIterations;
+    }
+
+    bool success = true;
+    func(resultBuf);
+
+    for (i = 0; i < kCount; i++) {
+      result.appendFormat("%d", resultBuf[i]);
+      // Report the expectation of the element being compared; the previous
+      // code always printed expectBuf[1] here.
+      expect.appendFormat("%d", expectBuf[i]);
+
+      success &= (resultBuf[i] == expectBuf[i]);
+    }
+
+    return success;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocImul1]
+// ============================================================================
+
+// Multiplies the two integer arguments with the three-operand `imul` form
+// (hi, lo, src) and stores both halves through the two pointer arguments.
+class X86Test_AllocImul1 : public X86Test {
+public:
+  X86Test_AllocImul1() : X86Test("[Alloc] IMUL #1") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocImul1());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature4<void, int*, int*, int, int>(CallConv::kIdHost));
+
+    X86Gp outHi = cc.newIntPtr("dstHi");
+    X86Gp outLo = cc.newIntPtr("dstLo");
+
+    X86Gp hi = cc.newInt32("vHi");
+    X86Gp lo = cc.newInt32("vLo");
+    X86Gp factor = cc.newInt32("src");
+
+    cc.setArg(0, outHi);
+    cc.setArg(1, outLo);
+    cc.setArg(2, lo);
+    cc.setArg(3, factor);
+
+    // `hi` is only written by the multiplication; `lo` is both input and output.
+    cc.imul(hi, lo, factor);
+
+    cc.mov(x86::dword_ptr(outHi), hi);
+    cc.mov(x86::dword_ptr(outLo), lo);
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(int*, int*, int, int);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const int lhs = 4;
+    const int rhs = 4;
+
+    // The product fits into the low half, so the high half must be zero.
+    const int wantedHi = 0;
+    const int wantedLo = lhs * rhs;
+
+    int actualHi;
+    int actualLo;
+    fn(&actualHi, &actualLo, lhs, rhs);
+
+    result.setFormat("hi=%d, lo=%d", actualHi, actualLo);
+    expect.setFormat("hi=%d, lo=%d", wantedHi, wantedLo);
+
+    return actualHi == wantedHi && actualLo == wantedLo;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocImul2]
+// ============================================================================
+
+// Emits the same load / three-operand `imul` / accumulate sequence four times,
+// each time with fresh virtual registers, adding the high half to `dst[0]` and
+// the low half to `dst[1]`.
+class X86Test_AllocImul2 : public X86Test {
+public:
+  X86Test_AllocImul2() : X86Test("[Alloc] IMUL #2") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocImul2());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<void, int*, const int*>(CallConv::kIdHost));
+
+    X86Gp out = cc.newIntPtr("dst");
+    X86Gp in = cc.newIntPtr("src");
+
+    cc.setArg(0, out);
+    cc.setArg(1, in);
+
+    for (unsigned int round = 0; round < 4; round++) {
+      X86Gp lo = cc.newInt32("x");
+      X86Gp factor = cc.newInt32("y");
+      X86Gp hi = cc.newInt32("hi");
+
+      // lo = src[0], factor = src[1].
+      cc.mov(lo, x86::dword_ptr(in, 0));
+      cc.mov(factor, x86::dword_ptr(in, 4));
+
+      cc.imul(hi, lo, factor);
+
+      // dst[0] += hi, dst[1] += lo.
+      cc.add(x86::dword_ptr(out, 0), hi);
+      cc.add(x86::dword_ptr(out, 4), lo);
+    }
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(int*, const int*);
+    Func fn = ptr_as_func<Func>(_func);
+
+    int input[2] = { 4, 9 };
+    int actual[2] = { 0, 0 };
+
+    // Four rounds, each adding 4 * 9 to the second slot and 0 to the first.
+    int wanted[2] = { 0, (4 * 9) * 4 };
+
+    fn(actual, input);
+
+    result.setFormat("ret={%d, %d}", actual[0], actual[1]);
+    expect.setFormat("ret={%d, %d}", wanted[0], wanted[1]);
+
+    return actual[0] == wanted[0] && actual[1] == wanted[1];
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocIdiv1]
+// ============================================================================
+
+// Divides the first argument by the second one using the three-operand `idiv`
+// form and returns the register that was passed as the second operand.
+class X86Test_AllocIdiv1 : public X86Test {
+public:
+  X86Test_AllocIdiv1() : X86Test("[Alloc] IDIV #1") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocIdiv1());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    X86Gp dividend = cc.newInt32("a");
+    X86Gp divisor = cc.newInt32("b");
+    X86Gp upper = cc.newInt32("dummy");
+
+    cc.setArg(0, dividend);
+    cc.setArg(1, divisor);
+
+    // The first `idiv` operand is zeroed before the division.
+    cc.xor_(upper, upper);
+    cc.idiv(upper, dividend, divisor);
+
+    cc.ret(dividend);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const int num = 2999;
+    const int den = 245;
+
+    const int actual = fn(num, den);
+    const int wanted = num / den;
+
+    result.setFormat("result=%d", actual);
+    expect.setFormat("result=%d", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocSetz]
+// ============================================================================
+
+// Compares the two integer arguments and writes the `setz` result (1 when
+// equal, 0 otherwise) to the byte addressed by the third argument.
+class X86Test_AllocSetz : public X86Test {
+public:
+  X86Test_AllocSetz() : X86Test("[Alloc] SETZ") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocSetz());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature3<void, int, int, char*>(CallConv::kIdHost));
+
+    X86Gp lhs = cc.newInt32("src0");
+    X86Gp rhs = cc.newInt32("src1");
+    X86Gp out = cc.newIntPtr("dst0");
+
+    cc.setArg(0, lhs);
+    cc.setArg(1, rhs);
+    cc.setArg(2, out);
+
+    cc.cmp(lhs, rhs);
+    cc.setz(x86::byte_ptr(out));
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(int, int, char*);
+    Func fn = ptr_as_func<Func>(_func);
+
+    // Input pairs and the flag expected for each: 1 when equal, 0 otherwise.
+    static const int inputs[4][2] = { { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 1 } };
+    char wanted[4] = { 1, 0, 0, 1 };
+    char actual[4];
+
+    unsigned int k;
+    for (k = 0; k < 4; k++) {
+      fn(inputs[k][0], inputs[k][1], &actual[k]);
+    }
+
+    result.setFormat("out={%d, %d, %d, %d}", actual[0], actual[1], actual[2], actual[3]);
+    expect.setFormat("out={%d, %d, %d, %d}", wanted[0], wanted[1], wanted[2], wanted[3]);
+
+    bool same = true;
+    for (k = 0; k < 4; k++) {
+      same = same && (actual[k] == wanted[k]);
+    }
+    return same;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocShlRor]
+// ============================================================================
+
+// Shifts a value left and then rotates it right, with both counts supplied in
+// virtual registers, and stores the result through the pointer argument.
+class X86Test_AllocShlRor : public X86Test {
+public:
+  X86Test_AllocShlRor() : X86Test("[Alloc] SHL/ROR") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocShlRor());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature4<void, int*, int, int, int>(CallConv::kIdHost));
+
+    X86Gp out = cc.newIntPtr("dst");
+    X86Gp value = cc.newInt32("var");
+    X86Gp shlCount = cc.newInt32("vShlParam");
+    X86Gp rorCount = cc.newInt32("vRorParam");
+
+    cc.setArg(0, out);
+    cc.setArg(1, value);
+    cc.setArg(2, shlCount);
+    cc.setArg(3, rorCount);
+
+    // value = ror(value << shlCount, rorCount).
+    cc.shl(value, shlCount);
+    cc.ror(value, rorCount);
+
+    cc.mov(x86::dword_ptr(out), value);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(int*, int, int, int);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const int input = 0x000000FF;
+
+    // 0xFF << 16 == 0x00FF0000; rotated right by 8 this becomes 0x0000FF00.
+    const int wanted = 0x0000FF00;
+
+    int actual;
+    fn(&actual, input, 16, 8);
+
+    result.setFormat("ret=%d", actual);
+    expect.setFormat("ret=%d", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocGpLo]
+// ============================================================================
+
+// Loads `kCount` 32-bit values into virtual registers, mixes them with `add`
+// and 8-bit `movzx` truncations (using the low-byte view of each register),
+// and returns their sum. `run()` repeats the same computation in plain C++ to
+// obtain the expected value.
+class X86Test_AllocGpLo : public X86Test {
+public:
+  X86Test_AllocGpLo() : X86Test("[Alloc] GPB-LO") {}
+
+  enum { kCount = 32 };
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocGpLo());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature1<uint32_t, uint32_t*>(CallConv::kIdHost));
+
+    X86Gp rPtr = cc.newUIntPtr("rPtr");
+    X86Gp rSum = cc.newUInt32("rSum");
+
+    cc.setArg(0, rPtr);
+
+    X86Gp rVar[kCount];
+    uint32_t i;
+
+    for (i = 0; i < kCount; i++) {
+      rVar[i] = cc.newUInt32("rVar[%u]", i);
+    }
+
+    // Init pseudo-regs with values from our array.
+    for (i = 0; i < kCount; i++) {
+      cc.mov(rVar[i], x86::dword_ptr(rPtr, i * 4));
+    }
+
+    for (i = 2; i < kCount; i++) {
+      // Add and truncate to 8 bit; no purpose, just mess with jit.
+      cc.add(rVar[i], rVar[i - 1]);
+      cc.movzx(rVar[i], rVar[i].r8());
+      cc.movzx(rVar[i - 2], rVar[i - 1].r8());
+      cc.movzx(rVar[i - 1], rVar[i - 2].r8());
+    }
+
+    // Sum up all computed values.
+    cc.mov(rSum, 0);
+    for (i = 0; i < kCount; i++) {
+      cc.add(rSum, rVar[i]);
+    }
+
+    // Return the sum.
+    cc.ret(rSum);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    // The return type matches the `uint32_t` declared in the signature passed
+    // to `addFunc()`; it was previously declared as `int` here.
+    typedef uint32_t (*Func)(uint32_t*);
+    Func func = ptr_as_func<Func>(_func);
+
+    unsigned int i;
+
+    uint32_t buf[kCount];
+    uint32_t resultRet;
+    uint32_t expectRet;
+
+    // Compute the expected sum with the same sequence of operations that
+    // `compile()` emits.
+    expectRet = 0;
+    for (i = 0; i < kCount; i++) {
+      buf[i] = 1;
+    }
+
+    for (i = 2; i < kCount; i++) {
+      buf[i] += buf[i - 1];
+      buf[i] = buf[i] & 0xFF;
+      buf[i - 2] = buf[i - 1] & 0xFF;
+      buf[i - 1] = buf[i - 2] & 0xFF;
+    }
+
+    for (i = 0; i < kCount; i++) {
+      expectRet += buf[i];
+    }
+
+    // Reset the buffer because the reference computation above modified it.
+    for (i = 0; i < kCount; i++) {
+      buf[i] = 1;
+    }
+    resultRet = func(buf);
+
+    // Both values are unsigned, so they are formatted with `%u`.
+    result.setFormat("ret=%u", static_cast<unsigned int>(resultRet));
+    expect.setFormat("ret=%u", static_cast<unsigned int>(expectRet));
+
+    return resultRet == expectRet;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocRepMovsb]
+// ============================================================================
+
+// Copies `cnt` bytes from `src` to `dst` with a `rep`-prefixed byte `movs`.
+class X86Test_AllocRepMovsb : public X86Test {
+public:
+  X86Test_AllocRepMovsb() : X86Test("[Alloc] REP MOVS") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocRepMovsb());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature3<void, void*, void*, size_t>(CallConv::kIdHost));
+
+    X86Gp to = cc.newIntPtr("dst");
+    X86Gp from = cc.newIntPtr("src");
+    X86Gp count = cc.newIntPtr("cnt");
+
+    cc.setArg(0, to);
+    cc.setArg(1, from);
+    cc.setArg(2, count);
+
+    // The repeat count is passed to `rep()`; the operands are byte-sized.
+    cc.rep(count).movs(x86::byte_ptr(to), x86::byte_ptr(from));
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(void*, void*, size_t);
+    Func fn = ptr_as_func<Func>(_func);
+
+    char copied[20] = { 0 };
+    char text[20] = "Hello AsmJit!";
+
+    // Copy the string including its terminating NUL byte.
+    fn(copied, text, strlen(text) + 1);
+
+    result.setFormat("ret=\"%s\"", copied);
+    expect.setFormat("ret=\"%s\"", text);
+
+    const bool same = ::memcmp(copied, text, strlen(text) + 1) == 0;
+    return same;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocIfElse1]
+// ============================================================================
+
+// Simple if/else: returns 2 when the first argument is greater than the
+// second one and 1 otherwise.
+class X86Test_AllocIfElse1 : public X86Test {
+public:
+  X86Test_AllocIfElse1() : X86Test("[Alloc] If-Else #1") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocIfElse1());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    X86Gp v1 = cc.newInt32("v1");
+    X86Gp v2 = cc.newInt32("v2");
+
+    Label L_1 = cc.newLabel();
+    Label L_2 = cc.newLabel();
+
+    cc.setArg(0, v1);
+    cc.setArg(1, v2);
+
+    cc.cmp(v1, v2);
+    cc.jg(L_1);
+
+    // v1 <= v2.
+    cc.mov(v1, 1);
+    cc.jmp(L_2);
+
+    // v1 > v2.
+    cc.bind(L_1);
+    cc.mov(v1, 2);
+
+    cc.bind(L_2);
+    cc.ret(v1);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+    Func func = ptr_as_func<Func>(_func);
+
+    int a = func(0, 1);
+    int b = func(1, 0);
+
+    // The expected values belong in `expect`; they were previously appended
+    // to `result`, which left `expect` empty.
+    result.appendFormat("ret={%d, %d}", a, b);
+    expect.appendFormat("ret={%d, %d}", 1, 2);
+
+    return a == 1 && b == 2;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocIfElse2]
+// ============================================================================
+
+// Same if/else as the first variant (2 when the first argument is greater,
+// 1 otherwise), but both branches reach the exit through an extra label that
+// is emitted before the comparison.
+class X86Test_AllocIfElse2 : public X86Test {
+public:
+  X86Test_AllocIfElse2() : X86Test("[Alloc] If-Else #2") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocIfElse2());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    X86Gp v1 = cc.newInt32("v1");
+    X86Gp v2 = cc.newInt32("v2");
+
+    Label L_1 = cc.newLabel();
+    Label L_2 = cc.newLabel();
+    Label L_3 = cc.newLabel();
+    Label L_4 = cc.newLabel();
+
+    cc.setArg(0, v1);
+    cc.setArg(1, v2);
+
+    // L_2 only forwards to the exit (L_4); the entry skips over it.
+    cc.jmp(L_1);
+    cc.bind(L_2);
+    cc.jmp(L_4);
+    cc.bind(L_1);
+
+    cc.cmp(v1, v2);
+    cc.jg(L_3);
+
+    // v1 <= v2.
+    cc.mov(v1, 1);
+    cc.jmp(L_2);
+
+    // v1 > v2.
+    cc.bind(L_3);
+    cc.mov(v1, 2);
+    cc.jmp(L_2);
+
+    cc.bind(L_4);
+
+    cc.ret(v1);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+    Func func = ptr_as_func<Func>(_func);
+
+    int a = func(0, 1);
+    int b = func(1, 0);
+
+    // The expected values belong in `expect`; they were previously appended
+    // to `result`, which left `expect` empty.
+    result.appendFormat("ret={%d, %d}", a, b);
+    expect.appendFormat("ret={%d, %d}", 1, 2);
+
+    return a == 1 && b == 2;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocIfElse3]
+// ============================================================================
+
+// If/else where the "not greater" branch contains a loop: the loop copies the
+// counter values 0 and 1 into `v1`, so that branch returns 1; the other
+// branch returns 2.
+class X86Test_AllocIfElse3 : public X86Test {
+public:
+  X86Test_AllocIfElse3() : X86Test("[Alloc] If-Else #3") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocIfElse3());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    X86Gp v1 = cc.newInt32("v1");
+    X86Gp v2 = cc.newInt32("v2");
+    X86Gp counter = cc.newInt32("counter");
+
+    Label L_1 = cc.newLabel();
+    Label L_Loop = cc.newLabel();
+    Label L_Exit = cc.newLabel();
+
+    cc.setArg(0, v1);
+    cc.setArg(1, v2);
+
+    cc.cmp(v1, v2);
+    cc.jg(L_1);
+
+    // v1 <= v2: `counter` is initialized on this path only.
+    cc.mov(counter, 0);
+
+    // do { v1 = counter; counter++; } while (counter <= 1);
+    cc.bind(L_Loop);
+    cc.mov(v1, counter);
+
+    cc.inc(counter);
+    cc.cmp(counter, 1);
+    cc.jle(L_Loop);
+    cc.jmp(L_Exit);
+
+    // v1 > v2.
+    cc.bind(L_1);
+    cc.mov(v1, 2);
+
+    cc.bind(L_Exit);
+    cc.ret(v1);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+    Func func = ptr_as_func<Func>(_func);
+
+    int a = func(0, 1);
+    int b = func(1, 0);
+
+    // The expected values belong in `expect`; they were previously appended
+    // to `result`, which left `expect` empty.
+    result.appendFormat("ret={%d, %d}", a, b);
+    expect.appendFormat("ret={%d, %d}", 1, 2);
+
+    return a == 1 && b == 2;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocIfElse4]
+// ============================================================================
+
+// If/else where both branches contain a loop over a counter that is
+// initialized before the branch: the first loop ends with `v1 == 1`, the
+// second one with `v1 == 2`.
+class X86Test_AllocIfElse4 : public X86Test {
+public:
+  X86Test_AllocIfElse4() : X86Test("[Alloc] If-Else #4") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocIfElse4());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    X86Gp v1 = cc.newInt32("v1");
+    X86Gp v2 = cc.newInt32("v2");
+    X86Gp counter = cc.newInt32("counter");
+
+    Label L_1 = cc.newLabel();
+    Label L_Loop1 = cc.newLabel();
+    Label L_Loop2 = cc.newLabel();
+    Label L_Exit = cc.newLabel();
+
+    // The counter is initialized before the arguments are assigned.
+    cc.mov(counter, 0);
+
+    cc.setArg(0, v1);
+    cc.setArg(1, v2);
+
+    cc.cmp(v1, v2);
+    cc.jg(L_1);
+
+    // v1 <= v2: do { v1 = counter; counter++; } while (counter <= 1);
+    cc.bind(L_Loop1);
+    cc.mov(v1, counter);
+
+    cc.inc(counter);
+    cc.cmp(counter, 1);
+    cc.jle(L_Loop1);
+    cc.jmp(L_Exit);
+
+    // v1 > v2: do { v1 = counter; counter++; } while (counter <= 2);
+    cc.bind(L_1);
+    cc.bind(L_Loop2);
+    cc.mov(v1, counter);
+    cc.inc(counter);
+    cc.cmp(counter, 2);
+    cc.jle(L_Loop2);
+
+    cc.bind(L_Exit);
+    cc.ret(v1);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+    Func func = ptr_as_func<Func>(_func);
+
+    int a = func(0, 1);
+    int b = func(1, 0);
+
+    // The expected values belong in `expect`; they were previously appended
+    // to `result`, which left `expect` empty.
+    result.appendFormat("ret={%d, %d}", a, b);
+    expect.appendFormat("ret={%d, %d}", 1, 2);
+
+    return a == 1 && b == 2;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocInt8]
+// ============================================================================
+
+// Takes a `char` argument in an 8-bit virtual register, sign-extends it to
+// 32 bits with `movsx`, and returns the widened value.
+class X86Test_AllocInt8 : public X86Test {
+public:
+  X86Test_AllocInt8() : X86Test("[Alloc] Int8") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocInt8());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    // Both registers are created before the function itself is added.
+    X86Gp narrow = cc.newInt8("x");
+    X86Gp wide = cc.newInt32("y");
+
+    cc.addFunc(FuncSignature1<int, char>(CallConv::kIdHost));
+    cc.setArg(0, narrow);
+
+    cc.movsx(wide, narrow);
+    cc.ret(wide);
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(char);
+    Func fn = ptr_as_func<Func>(_func);
+
+    // A negative input verifies that the value is sign-extended.
+    const int actual = fn(-13);
+    const int wanted = -13;
+
+    result.setFormat("ret=%d", actual);
+    expect.setFormat("ret=%d", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocArgsIntPtr]
+// ============================================================================
+
+// Receives eight pointer arguments, offsets argument `i` by `i + 1` bytes and
+// then adds `i + 1` to the byte it points to. `run()` passes the same buffer
+// for every argument, so byte `k` of the buffer ends up holding `k`.
+class X86Test_AllocArgsIntPtr : public X86Test {
+public:
+  X86Test_AllocArgsIntPtr() : X86Test("[Alloc] Args IntPtr") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocArgsIntPtr());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature8<void, void*, void*, void*, void*, void*, void*, void*, void*>(CallConv::kIdHost));
+
+    X86Gp args[8];
+    uint32_t n;
+
+    for (n = 0; n < 8; n++) {
+      args[n] = cc.newIntPtr("var%u", n);
+      cc.setArg(n, args[n]);
+    }
+
+    // Advance pointer `n` by `n + 1` bytes.
+    for (n = 0; n < 8; n++) {
+      cc.add(args[n], static_cast<int>(n + 1));
+    }
+
+    // Write through every pointer so the outcome can be verified from the
+    // buffer contents without inspecting the generated assembly.
+    for (n = 0; n < 8; n++) {
+      cc.add(x86::byte_ptr(args[n]), static_cast<int>(n + 1));
+    }
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(void*, void*, void*, void*, void*, void*, void*, void*);
+    Func fn = ptr_as_func<Func>(_func);
+
+    uint8_t actual[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    uint8_t wanted[9] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+
+    // Every argument aliases the same buffer.
+    fn(actual, actual, actual, actual,
+       actual, actual, actual, actual);
+
+    result.setFormat("buf={%d, %d, %d, %d, %d, %d, %d, %d, %d}",
+      actual[0], actual[1], actual[2], actual[3],
+      actual[4], actual[5], actual[6], actual[7],
+      actual[8]);
+    expect.setFormat("buf={%d, %d, %d, %d, %d, %d, %d, %d, %d}",
+      wanted[0], wanted[1], wanted[2], wanted[3],
+      wanted[4], wanted[5], wanted[6], wanted[7],
+      wanted[8]);
+
+    return ::memcmp(actual, wanted, 9) == 0;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocArgsFloat]
+// ============================================================================
+
+// Receives seven `float` arguments plus an output pointer, sums the floats
+// into the first one with `addss`, and stores the sum through the pointer.
+class X86Test_AllocArgsFloat : public X86Test {
+public:
+  X86Test_AllocArgsFloat() : X86Test("[Alloc] Args Float") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocArgsFloat());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature8<void, float, float, float, float, float, float, float, void*>(CallConv::kIdHost));
+
+    X86Gp out = cc.newIntPtr("p");
+    X86Xmm vals[7];
+
+    uint32_t n;
+    for (n = 0; n < 7; n++) {
+      vals[n] = cc.newXmmSs("xv%u", n);
+      cc.setArg(n, vals[n]);
+    }
+
+    cc.setArg(7, out);
+
+    // vals[0] += vals[1] + ... + vals[6], added one at a time in order.
+    for (n = 1; n < 7; n++) {
+      cc.addss(vals[0], vals[n]);
+    }
+
+    cc.movss(x86::ptr(out), vals[0]);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(float, float, float, float, float, float, float, float*);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const float wanted = 1.0f + 2.0f + 3.0f + 4.0f + 5.0f + 6.0f + 7.0f;
+
+    float actual;
+    fn(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, &actual);
+
+    result.setFormat("ret={%g}", actual);
+    expect.setFormat("ret={%g}", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocArgsDouble]
+// ============================================================================
+
+// Receives seven `double` arguments plus an output pointer, sums the doubles
+// into the first one with `addsd`, and stores the sum through the pointer.
+class X86Test_AllocArgsDouble : public X86Test {
+public:
+  X86Test_AllocArgsDouble() : X86Test("[Alloc] Args Double") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocArgsDouble());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature8<void, double, double, double, double, double, double, double, void*>(CallConv::kIdHost));
+
+    X86Gp out = cc.newIntPtr("p");
+    X86Xmm vals[7];
+
+    uint32_t n;
+    for (n = 0; n < 7; n++) {
+      vals[n] = cc.newXmmSd("xv%u", n);
+      cc.setArg(n, vals[n]);
+    }
+
+    cc.setArg(7, out);
+
+    // vals[0] += vals[1] + ... + vals[6], added one at a time in order.
+    for (n = 1; n < 7; n++) {
+      cc.addsd(vals[0], vals[n]);
+    }
+
+    cc.movsd(x86::ptr(out), vals[0]);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*Func)(double, double, double, double, double, double, double, double*);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const double wanted = 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0 + 7.0;
+
+    double actual;
+    fn(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, &actual);
+
+    result.setFormat("ret={%g}", actual);
+    expect.setFormat("ret={%g}", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocRetFloat]
+// ============================================================================
+
+// Adds two `float` arguments with `addss` and returns the sum by value.
+class X86Test_AllocRetFloat : public X86Test {
+public:
+  X86Test_AllocRetFloat() : X86Test("[Alloc] Ret Float") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocRetFloat());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<float, float, float>(CallConv::kIdHost));
+
+    X86Xmm lhs = cc.newXmmSs("a");
+    X86Xmm rhs = cc.newXmmSs("b");
+
+    cc.setArg(0, lhs);
+    cc.setArg(1, rhs);
+
+    // lhs += rhs; the sum is returned in `lhs`.
+    cc.addss(lhs, rhs);
+    cc.ret(lhs);
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef float (*Func)(float, float);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const float actual = fn(1.0f, 2.0f);
+    const float wanted = 1.0f + 2.0f;
+
+    result.setFormat("ret={%g}", actual);
+    expect.setFormat("ret={%g}", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocRetDouble]
+// ============================================================================
+
+// Adds two `double` arguments with `addsd` and returns the sum by value.
+class X86Test_AllocRetDouble : public X86Test {
+public:
+  X86Test_AllocRetDouble() : X86Test("[Alloc] Ret Double") {}
+
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocRetDouble());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<double, double, double>(CallConv::kIdHost));
+
+    X86Xmm lhs = cc.newXmmSd("a");
+    X86Xmm rhs = cc.newXmmSd("b");
+
+    cc.setArg(0, lhs);
+    cc.setArg(1, rhs);
+
+    // lhs += rhs; the sum is returned in `lhs`.
+    cc.addsd(lhs, rhs);
+    cc.ret(lhs);
+
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef double (*Func)(double, double);
+    Func fn = ptr_as_func<Func>(_func);
+
+    const double actual = fn(1.0, 2.0);
+    const double wanted = 1.0 + 2.0;
+
+    result.setFormat("ret={%g}", actual);
+    expect.setFormat("ret={%g}", wanted);
+
+    return actual == wanted;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocStack1]
+// ============================================================================
+
+// Compiles `int f(void)` that writes the bytes 0..255 into a kSize-byte stack
+// area, reads them back and returns their sum (0 + 1 + ... + 255 == 32640).
+class X86Test_AllocStack1 : public X86Test {
+public:
+  X86Test_AllocStack1() : X86Test("[Alloc] Stack #1") {}
+
+  enum { kSize = 256 };
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocStack1());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    // Stack area accessed through a 1-byte wide memory operand.
+    X86Mem stackBase = cc.newStack(kSize, 1);
+    stackBase.setSize(1);
+
+    X86Gp idx = cc.newIntPtr("i");
+    X86Gp sum = cc.newInt32("a");
+    X86Gp cur = cc.newInt32("b");
+
+    Label L_Fill = cc.newLabel();
+    Label L_Sum = cc.newLabel();
+
+    // Same stack operand, additionally indexed by `idx`. Building the operand
+    // emits no code, so it can be prepared before the first instruction.
+    X86Mem stackAtIdx = stackBase.clone();
+    stackAtIdx.setIndex(idx, 0);
+
+    // Pass 1: stack[idx] = low byte of idx, for idx = 0 .. 255.
+    cc.xor_(idx, idx);
+
+    cc.bind(L_Fill);
+    cc.mov(stackAtIdx, idx.r8());
+    cc.inc(idx);
+    cc.cmp(idx, 255);
+    cc.jle(L_Fill);
+
+    // Pass 2: sum += zero-extended stack[idx], for idx = 0 .. 255.
+    cc.xor_(idx, idx);
+    cc.xor_(sum, sum);
+
+    cc.bind(L_Sum);
+    cc.movzx(cur, stackAtIdx);
+    cc.add(sum, cur);
+    cc.inc(idx);
+    cc.cmp(idx, 255);
+    cc.jle(L_Sum);
+
+    cc.ret(sum);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*SumFn)(void);
+
+    const int expected = 32640;
+    const int actual = ptr_as_func<SumFn>(_func)();
+
+    expect.setInt(expected);
+    result.setInt(actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocStack2]
+// ============================================================================
+
+// Compiles `int f(void)` that requests two separate stack areas, copies the
+// same 32-byte token into both via memcpy(), compares them via memcmp() and
+// finally checks that the two areas have different addresses. The function
+// returns 0 when everything is as expected.
+class X86Test_AllocStack2 : public X86Test {
+public:
+  X86Test_AllocStack2() : X86Test("[Alloc] Stack #2") {}
+
+  enum { kSize = 256 };
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocStack2());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    const int kTokenSize = 32;
+
+    X86Mem slotA = cc.newStack(kTokenSize, 32);
+    X86Mem slotB = cc.newStack(kTokenSize, 32);
+
+    X86Gp ptrA = cc.newIntPtr("p1");
+    X86Gp ptrB = cc.newIntPtr("p2");
+
+    X86Gp status = cc.newInt32("ret");
+    Label L_Done = cc.newLabel();
+
+    // 31 characters plus the terminating NUL fill kTokenSize bytes exactly.
+    static const char token[kTokenSize] = "-+:|abcdefghijklmnopqrstuvwxyz|";
+    CCFuncCall* node;
+
+    cc.lea(ptrA, slotA);
+    cc.lea(ptrB, slotB);
+
+    // Writing the whole token into each area would trample neighbouring data
+    // if the stack had been laid out incorrectly.
+    node = cc.call(imm_ptr((void*)memcpy), FuncSignature3<void*, void*, void*, size_t>(CallConv::kIdHostCDecl));
+    node->setArg(0, ptrA);
+    node->setArg(1, imm_ptr(token));
+    node->setArg(2, imm(kTokenSize));
+    node->setRet(0, ptrA);
+
+    node = cc.call(imm_ptr((void*)memcpy), FuncSignature3<void*, void*, void*, size_t>(CallConv::kIdHostCDecl));
+    node->setArg(0, ptrB);
+    node->setArg(1, imm_ptr(token));
+    node->setArg(2, imm(kTokenSize));
+    node->setRet(0, ptrB);
+
+    node = cc.call(imm_ptr((void*)memcmp), FuncSignature3<int, void*, void*, size_t>(CallConv::kIdHostCDecl));
+    node->setArg(0, ptrA);
+    node->setArg(1, ptrB);
+    node->setArg(2, imm(kTokenSize));
+    node->setRet(0, status);
+
+    // A non-zero memcmp() result is returned as-is. A zero result alone is not
+    // conclusive, because it would also be zero if both areas shared a single
+    // address.
+    cc.cmp(status, 0);
+    cc.jnz(L_Done);
+
+    // status = (ptrA == ptrB) ? 1 : 0; the addresses have to differ.
+    cc.xor_(status, status);
+    cc.cmp(ptrA, ptrB);
+    cc.setz(status.r8());
+
+    cc.bind(L_Done);
+    cc.ret(status);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*CheckFn)(void);
+
+    // Zero means: contents equal and stack addresses distinct.
+    const int expected = 0;
+    const int actual = ptr_as_func<CheckFn>(_func)();
+
+    expect.setInt(expected);
+    result.setInt(actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocMemcpy]
+// ============================================================================
+
+// Compiles `void f(uint32_t* dst, const uint32_t* src, size_t cnt)` that
+// copies `cnt` dwords from `src` to `dst`; run() copies kCount dwords and
+// compares the destination with the source.
+class X86Test_AllocMemcpy : public X86Test {
+public:
+  X86Test_AllocMemcpy() : X86Test("[Alloc] Memcpy") {}
+
+  enum { kCount = 32 };
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocMemcpy());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    X86Gp dstPtr = cc.newIntPtr("dst");
+    X86Gp srcPtr = cc.newIntPtr("src");
+    X86Gp remaining = cc.newUIntPtr("cnt");
+
+    // Labels used by the function body.
+    Label L_Copy = cc.newLabel();
+    Label L_Done = cc.newLabel();
+
+    cc.addFunc(FuncSignature3<void, uint32_t*, const uint32_t*, size_t>(CallConv::kIdHost));
+    cc.setArg(0, dstPtr);
+    cc.setArg(1, srcPtr);
+    cc.setArg(2, remaining);
+
+    // Ask for all three variables to be allocated up front; the intent is to
+    // keep them in physical registers.
+    cc.alloc(dstPtr);
+    cc.alloc(srcPtr);
+    cc.alloc(remaining);
+
+    // Nothing to copy when the count is zero.
+    cc.test(remaining, remaining);
+    cc.jz(L_Done);
+
+    cc.bind(L_Copy);
+
+    // Move one dword (4 bytes) from *src to *dst through a temporary.
+    X86Gp dword = cc.newInt32("tmp");
+    cc.mov(dword, x86::dword_ptr(srcPtr));
+    cc.mov(x86::dword_ptr(dstPtr), dword);
+
+    // Advance both pointers by the size of one dword.
+    cc.add(srcPtr, 4);
+    cc.add(dstPtr, 4);
+
+    // Keep looping while the remaining count is non-zero.
+    cc.dec(remaining);
+    cc.jnz(L_Copy);
+
+    cc.bind(L_Done);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*CopyFn)(uint32_t*, const uint32_t*, size_t);
+    CopyFn copy = ptr_as_func<CopyFn>(_func);
+
+    uint32_t dstBuffer[kCount];
+    uint32_t srcBuffer[kCount];
+    uint32_t n;
+
+    // Source holds 0, 1, 2, ...; destination starts zeroed.
+    for (n = 0; n < kCount; n++) {
+      srcBuffer[n] = n;
+      dstBuffer[n] = 0;
+    }
+
+    copy(dstBuffer, srcBuffer, kCount);
+
+    expect.setString("buf={");
+    result.setString("buf={");
+
+    for (n = 0; n < kCount; n++) {
+      if (n > 0) {
+        expect.appendString(", ");
+        result.appendString(", ");
+      }
+
+      expect.appendFormat("%u", static_cast<unsigned int>(srcBuffer[n]));
+      result.appendFormat("%u", static_cast<unsigned int>(dstBuffer[n]));
+    }
+
+    expect.appendString("}");
+    result.appendString("}");
+
+    return ::memcmp(dstBuffer, srcBuffer, kCount * sizeof(uint32_t)) == 0;
+  }
+};
+
+// ============================================================================
+// [X86Test_AllocAlphaBlend]
+// ============================================================================
+
+// Runs the code produced by asmtest::generateAlphaBlend() over kCount pixels
+// and compares every output pixel with the scalar reference blendSrcOver().
+class X86Test_AllocAlphaBlend : public X86Test {
+public:
+  X86Test_AllocAlphaBlend() : X86Test("[Alloc] AlphaBlend") {}
+
+  enum { kCount = 17 };
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_AllocAlphaBlend());
+  }
+
+  // Scalar reference: scales each byte of `d` by the inverted top byte of `s`
+  // (with rounding) and adds `s`. Bytes 0/2 and bytes 1/3 are processed as two
+  // packed pairs.
+  static uint32_t blendSrcOver(uint32_t d, uint32_t s) {
+    const uint32_t kPairMask = 0x00FF00FFU;
+    const uint32_t kHalf = 0x00800080U;
+
+    const uint32_t invAlpha = (~s) >> 24;
+
+    uint32_t evenBytes = (d & kPairMask) * invAlpha;
+    uint32_t oddBytes = ((d >> 8) & kPairMask) * invAlpha;
+
+    // Rounded division by 255 of both 16-bit lanes; the even pair is shifted
+    // back down, the odd pair already sits in its final position.
+    evenBytes = ((evenBytes + ((evenBytes >> 8) & kPairMask) + kHalf) & 0xFF00FF00U) >> 8;
+    oddBytes = (oddBytes + ((oddBytes >> 8) & kPairMask) + kHalf) & 0xFF00FF00U;
+
+    return evenBytes + oddBytes + s;
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    asmtest::generateAlphaBlend(cc);
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef void (*BlendFn)(void*, const void*, size_t);
+    BlendFn blend = ptr_as_func<BlendFn>(_func);
+
+    static const uint32_t dstConstData[] = { 0x00000000, 0x10101010, 0x20100804, 0x30200003, 0x40204040, 0x5000004D, 0x60302E2C, 0x706F6E6D, 0x807F4F2F, 0x90349001, 0xA0010203, 0xB03204AB, 0xC023AFBD, 0xD0D0D0C0, 0xE0AABBCC, 0xFFFFFFFF, 0xF8F4F2F1 };
+    static const uint32_t srcConstData[] = { 0xE0E0E0E0, 0xA0008080, 0x341F1E1A, 0xFEFEFEFE, 0x80302010, 0x49490A0B, 0x998F7798, 0x00000000, 0x01010101, 0xA0264733, 0xBAB0B1B9, 0xFF000000, 0xDAB0A0C1, 0xE0BACFDA, 0x99887766, 0xFFFFFF80, 0xEE0A5FEC };
+
+    // Three spare dwords leave room to move the start up to a 16-byte
+    // boundary; the buffers handed to the generated code have to be aligned.
+    uint32_t dstStorage[kCount + 3];
+    uint32_t srcStorage[kCount + 3];
+
+    uint32_t* dstBuffer = (uint32_t*)Utils::alignTo<intptr_t>((intptr_t)dstStorage, 16);
+    uint32_t* srcBuffer = (uint32_t*)Utils::alignTo<intptr_t>((intptr_t)srcStorage, 16);
+
+    ::memcpy(dstBuffer, dstConstData, sizeof(dstConstData));
+    ::memcpy(srcBuffer, srcConstData, sizeof(srcConstData));
+
+    // The reference has to be computed first, the generated code overwrites
+    // the destination in place.
+    uint32_t expBuffer[kCount];
+    uint32_t n;
+
+    for (n = 0; n < kCount; n++)
+      expBuffer[n] = blendSrcOver(dstBuffer[n], srcBuffer[n]);
+
+    blend(dstBuffer, srcBuffer, kCount);
+
+    expect.setString("buf={");
+    result.setString("buf={");
+
+    for (n = 0; n < kCount; n++) {
+      if (n > 0) {
+        expect.appendString(", ");
+        result.appendString(", ");
+      }
+
+      expect.appendFormat("%08X", static_cast<unsigned int>(expBuffer[n]));
+      result.appendFormat("%08X", static_cast<unsigned int>(dstBuffer[n]));
+    }
+
+    expect.appendString("}");
+    result.appendString("}");
+
+    return ::memcmp(expBuffer, dstBuffer, kCount * sizeof(uint32_t)) == 0;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallBase]
+// ============================================================================
+
+// Compiles `int f(int, int, int)` that doubles each argument and then calls
+// calledFunc() with the arguments in reversed order, returning its result.
+class X86Test_CallBase : public X86Test {
+public:
+  X86Test_CallBase() : X86Test("[Call] CDecl") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallBase());
+  }
+
+  // Host function called from the generated code.
+  static int calledFunc(int a, int b, int c) { return (a + b) * c; }
+
+  virtual void compile(X86Compiler& cc) {
+    X86Gp x = cc.newInt32("v0");
+    X86Gp y = cc.newInt32("v1");
+    X86Gp z = cc.newInt32("v2");
+
+    cc.addFunc(FuncSignature3<int, int, int, int>(CallConv::kIdHost));
+    cc.setArg(0, x);
+    cc.setArg(1, y);
+    cc.setArg(2, z);
+
+    // Some arbitrary work before the call: multiply every argument by two.
+    cc.shl(x, 1);
+    cc.shl(y, 1);
+    cc.shl(z, 1);
+
+    // The callee address goes through a register.
+    X86Gp target = cc.newIntPtr("fn");
+    cc.mov(target, imm_ptr(calledFunc));
+
+    // Arguments are passed reversed: calledFunc(z, y, x); result lands in x.
+    CCFuncCall* node = cc.call(target, FuncSignature3<int, int, int, int>(CallConv::kIdHost));
+    node->setArg(0, z);
+    node->setArg(1, y);
+    node->setArg(2, x);
+    node->setRet(0, x);
+
+    cc.ret(x);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*TestFn)(int, int, int);
+
+    // (2*1 + 2*2) * (2*3) == 36.
+    const int expected = 36;
+    const int actual = ptr_as_func<TestFn>(_func)(3, 2, 1);
+
+    expect.setFormat("ret=%d", expected);
+    result.setFormat("ret=%d", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallFast]
+// ============================================================================
+
+// Compiles `int f(int)` that feeds its argument through calledFunc() twice
+// using the host fastcall convention, i.e. computes (x * x) * (x * x).
+class X86Test_CallFast : public X86Test {
+public:
+  X86Test_CallFast() : X86Test("[Call] Fastcall") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallFast());
+  }
+
+  // Callee used by the generated code. The test is mostly about arguments
+  // passed in registers, which is why the fastcall convention is requested
+  // (relevant when running 32-bit).
+  static int ASMJIT_FASTCALL calledFunc(int a) { return a * a; }
+
+  virtual void compile(X86Compiler& cc) {
+    X86Gp value = cc.newInt32("var");
+    X86Gp target = cc.newIntPtr("fn");
+
+    cc.addFunc(FuncSignature1<int, int>(CallConv::kIdHost));
+    cc.setArg(0, value);
+
+    cc.mov(target, imm_ptr(calledFunc));
+
+    // First call: value = calledFunc(value).
+    CCFuncCall* first = cc.call(target, FuncSignature1<int, int>(CallConv::kIdHostFastCall));
+    first->setArg(0, value);
+    first->setRet(0, value);
+
+    // Second call on the result of the first one.
+    CCFuncCall* second = cc.call(target, FuncSignature1<int, int>(CallConv::kIdHostFastCall));
+    second->setArg(0, value);
+    second->setRet(0, value);
+
+    cc.ret(value);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*TestFn)(int);
+
+    const int expected = (9 * 9) * (9 * 9);
+    const int actual = ptr_as_func<TestFn>(_func)(9);
+
+    expect.setFormat("ret=%d", expected);
+    result.setFormat("ret=%d", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallManyArgs]
+// ============================================================================
+
+// Compiles `int f(void)` that loads ten constants into ten variables and
+// passes all of them to calledFunc(), returning its result.
+class X86Test_CallManyArgs : public X86Test {
+public:
+  X86Test_CallManyArgs() : X86Test("[Call] Many Args") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallManyArgs());
+  }
+
+  // Host callee: product of the first five arguments plus the product of the
+  // last five.
+  static int calledFunc(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j) {
+    const int firstHalf = a * b * c * d * e;
+    const int secondHalf = f * g * h * i * j;
+    return firstHalf + secondHalf;
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    // One variable for the callee address and one per argument.
+    X86Gp target = cc.newIntPtr("fn");
+    X86Gp arg0 = cc.newInt32("va");
+    X86Gp arg1 = cc.newInt32("vb");
+    X86Gp arg2 = cc.newInt32("vc");
+    X86Gp arg3 = cc.newInt32("vd");
+    X86Gp arg4 = cc.newInt32("ve");
+    X86Gp arg5 = cc.newInt32("vf");
+    X86Gp arg6 = cc.newInt32("vg");
+    X86Gp arg7 = cc.newInt32("vh");
+    X86Gp arg8 = cc.newInt32("vi");
+    X86Gp arg9 = cc.newInt32("vj");
+
+    cc.mov(target, imm_ptr(calledFunc));
+    cc.mov(arg0, 0x03);
+    cc.mov(arg1, 0x12);
+    cc.mov(arg2, 0xA0);
+    cc.mov(arg3, 0x0B);
+    cc.mov(arg4, 0x2F);
+    cc.mov(arg5, 0x02);
+    cc.mov(arg6, 0x0C);
+    cc.mov(arg7, 0x12);
+    cc.mov(arg8, 0x18);
+    cc.mov(arg9, 0x1E);
+
+    // The return value reuses the first argument's variable.
+    CCFuncCall* node = cc.call(target, FuncSignature10<int, int, int, int, int, int, int, int, int, int, int>(CallConv::kIdHost));
+    node->setArg(0, arg0);
+    node->setArg(1, arg1);
+    node->setArg(2, arg2);
+    node->setArg(3, arg3);
+    node->setArg(4, arg4);
+    node->setArg(5, arg5);
+    node->setArg(6, arg6);
+    node->setArg(7, arg7);
+    node->setArg(8, arg8);
+    node->setArg(9, arg9);
+    node->setRet(0, arg0);
+
+    cc.ret(arg0);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*TestFn)(void);
+
+    const int actual = ptr_as_func<TestFn>(_func)();
+    const int expected = calledFunc(0x03, 0x12, 0xA0, 0x0B, 0x2F, 0x02, 0x0C, 0x12, 0x18, 0x1E);
+
+    expect.setFormat("ret=%d", expected);
+    result.setFormat("ret=%d", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallDuplicateArgs]
+// ============================================================================
+
+// Compiles `int f(void)` that passes one and the same variable (holding 3) as
+// all ten arguments of calledFunc() and also receives the result in it.
+class X86Test_CallDuplicateArgs : public X86Test {
+public:
+  X86Test_CallDuplicateArgs() : X86Test("[Call] Duplicate Args") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallDuplicateArgs());
+  }
+
+  // Host callee: product of the first five arguments plus the product of the
+  // last five.
+  static int calledFunc(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j) {
+    const int firstHalf = a * b * c * d * e;
+    const int secondHalf = f * g * h * i * j;
+    return firstHalf + secondHalf;
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    X86Gp target = cc.newIntPtr("fn");
+    X86Gp shared = cc.newInt32("a");
+
+    cc.mov(target, imm_ptr(calledFunc));
+    cc.mov(shared, 3);
+
+    CCFuncCall* node = cc.call(target, FuncSignature10<int, int, int, int, int, int, int, int, int, int, int>(CallConv::kIdHost));
+
+    // Every one of the ten argument slots refers to the same variable.
+    for (uint32_t slot = 0; slot < 10; slot++)
+      node->setArg(slot, shared);
+    node->setRet(0, shared);
+
+    cc.ret(shared);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*TestFn)(void);
+
+    const int actual = ptr_as_func<TestFn>(_func)();
+    const int expected = calledFunc(3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+
+    expect.setFormat("ret=%d", expected);
+    result.setFormat("ret=%d", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallImmArgs]
+// ============================================================================
+
+// Compiles `int f(void)` that calls X86Test_CallManyArgs::calledFunc() with
+// all ten arguments given as immediates instead of variables.
+class X86Test_CallImmArgs : public X86Test {
+public:
+  X86Test_CallImmArgs() : X86Test("[Call] Imm Args") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallImmArgs());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    // Immediate values in argument order; run() passes the same list to the
+    // host callee.
+    static const int kArgValues[10] = { 0x03, 0x12, 0xA0, 0x0B, 0x2F, 0x02, 0x0C, 0x12, 0x18, 0x1E };
+
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    X86Gp target = cc.newIntPtr("fn");
+    X86Gp retVal = cc.newInt32("rv");
+
+    cc.mov(target, imm_ptr(X86Test_CallManyArgs::calledFunc));
+
+    CCFuncCall* node = cc.call(target, FuncSignature10<int, int, int, int, int, int, int, int, int, int, int>(CallConv::kIdHost));
+    for (uint32_t slot = 0; slot < 10; slot++)
+      node->setArg(slot, imm(kArgValues[slot]));
+    node->setRet(0, retVal);
+
+    cc.ret(retVal);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*TestFn)(void);
+
+    const int actual = ptr_as_func<TestFn>(_func)();
+    const int expected = X86Test_CallManyArgs::calledFunc(0x03, 0x12, 0xA0, 0x0B, 0x2F, 0x02, 0x0C, 0x12, 0x18, 0x1E);
+
+    expect.setFormat("ret=%d", expected);
+    result.setFormat("ret=%d", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallPtrArgs]
+// ============================================================================
+
+// Compiles `int f(void)` that calls calledFunc() with the immediates 1..10
+// passed as pointer-typed arguments; the callee sums them up (55).
+class X86Test_CallPtrArgs : public X86Test {
+public:
+  X86Test_CallPtrArgs() : X86Test("[Call] Ptr Args") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallPtrArgs());
+  }
+
+  // Host callee: converts every pointer to `int` (via intptr_t) and returns
+  // the sum of all ten values.
+  static int calledFunc(void* a, void* b, void* c, void* d, void* e, void* f, void* g, void* h, void* i, void* j) {
+    void* const args[10] = { a, b, c, d, e, f, g, h, i, j };
+
+    int total = static_cast<int>((intptr_t)args[0]);
+    for (unsigned int k = 1; k < 10; k++)
+      total = total + static_cast<int>((intptr_t)args[k]);
+
+    return total;
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    X86Gp target = cc.newIntPtr("fn");
+    X86Gp retVal = cc.newInt32("rv");
+
+    cc.mov(target, imm_ptr(calledFunc));
+
+    CCFuncCall* node = cc.call(target, FuncSignature10<int, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*>(CallConv::kIdHost));
+
+    // Argument slot N receives the immediate N + 1 (0x01 .. 0x0A).
+    for (uint32_t slot = 0; slot < 10; slot++)
+      node->setArg(slot, imm(static_cast<int>(slot) + 1));
+    node->setRet(0, retVal);
+
+    cc.ret(retVal);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*TestFn)(void);
+
+    // 1 + 2 + ... + 10.
+    const int expected = 55;
+    const int actual = ptr_as_func<TestFn>(_func)();
+
+    expect.setFormat("ret=%d", expected);
+    result.setFormat("ret=%d", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallFloatAsXmmRet]
+// ============================================================================
+
+// Compiles `float f(float, float)` that forwards both arguments to the host
+// calledFunc() and returns the float it receives back in an XMM variable.
+class X86Test_CallFloatAsXmmRet : public X86Test {
+public:
+  X86Test_CallFloatAsXmmRet() : X86Test("[Call] Float As Xmm Ret") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallFloatAsXmmRet());
+  }
+
+  // Host callee: plain float multiplication.
+  static float calledFunc(float a, float b) {
+    return a * b;
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<float, float, float>(CallConv::kIdHost));
+
+    X86Xmm lhs = cc.newXmmSs("a");
+    X86Xmm rhs = cc.newXmmSs("b");
+    X86Xmm product = cc.newXmmSs("ret");
+
+    cc.setArg(0, lhs);
+    cc.setArg(1, rhs);
+
+    // The callee address goes through a register.
+    X86Gp target = cc.newIntPtr("fn");
+    cc.mov(target, imm_ptr(calledFunc));
+
+    // product = calledFunc(lhs, rhs).
+    CCFuncCall* node = cc.call(target, FuncSignature2<float, float, float>(CallConv::kIdHost));
+    node->setArg(0, lhs);
+    node->setArg(1, rhs);
+    node->setRet(0, product);
+
+    cc.ret(product);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef float (*TestFn)(float, float);
+
+    const float actual = ptr_as_func<TestFn>(_func)(15.5f, 2.0f);
+    const float expected = calledFunc(15.5f, 2.0f);
+
+    expect.setFormat("ret=%g", expected);
+    result.setFormat("ret=%g", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallDoubleAsXmmRet]
+// ============================================================================
+
+// Compiles `double f(double, double)` that forwards both arguments to the
+// host calledFunc() and returns the double it receives back in an XMM
+// variable.
+class X86Test_CallDoubleAsXmmRet : public X86Test {
+public:
+  X86Test_CallDoubleAsXmmRet() : X86Test("[Call] Double As Xmm Ret") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallDoubleAsXmmRet());
+  }
+
+  // Host callee: plain double multiplication.
+  static double calledFunc(double a, double b) {
+    return a * b;
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature2<double, double, double>(CallConv::kIdHost));
+
+    X86Xmm lhs = cc.newXmmSd("a");
+    X86Xmm rhs = cc.newXmmSd("b");
+    X86Xmm product = cc.newXmmSd("ret");
+
+    cc.setArg(0, lhs);
+    cc.setArg(1, rhs);
+
+    // The callee address goes through a register.
+    X86Gp target = cc.newIntPtr("fn");
+    cc.mov(target, imm_ptr(calledFunc));
+
+    // product = calledFunc(lhs, rhs).
+    CCFuncCall* node = cc.call(target, FuncSignature2<double, double, double>(CallConv::kIdHost));
+    node->setArg(0, lhs);
+    node->setArg(1, rhs);
+    node->setRet(0, product);
+
+    cc.ret(product);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef double (*TestFn)(double, double);
+
+    const double actual = ptr_as_func<TestFn>(_func)(15.5, 2.0);
+    const double expected = calledFunc(15.5, 2.0);
+
+    expect.setFormat("ret=%g", expected);
+    result.setFormat("ret=%g", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallConditional]
+// ============================================================================
+
+// Compiles `int f(int x, int y, int op)` that dispatches on `op`:
+//   op == 0 -> returns calledFuncAdd(x, y)
+//   op == 1 -> returns calledFuncMul(x, y)
+//   other   -> returns 0
+// Each branch ends with its own `ret` and uses its own result variable.
+class X86Test_CallConditional : public X86Test {
+public:
+  X86Test_CallConditional() : X86Test("[Call] Conditional") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallConditional());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    X86Gp x = cc.newInt32("x");
+    X86Gp y = cc.newInt32("y");
+    X86Gp op = cc.newInt32("op");
+
+    CCFuncCall* call;
+    X86Gp result;
+
+    cc.addFunc(FuncSignature3<int, int, int, int>(CallConv::kIdHost));
+    cc.setArg(0, x);
+    cc.setArg(1, y);
+    cc.setArg(2, op);
+
+    Label opAdd = cc.newLabel();
+    Label opMul = cc.newLabel();
+
+    // Dispatch on `op`.
+    cc.cmp(op, 0);
+    cc.jz(opAdd);
+    cc.cmp(op, 1);
+    cc.jz(opMul);
+
+    // Fall-through: unknown operation, return zero.
+    result = cc.newInt32("result_0");
+    cc.mov(result, 0);
+    cc.ret(result);
+
+    // op == 0: result = calledFuncAdd(x, y).
+    cc.bind(opAdd);
+    result = cc.newInt32("result_1");
+
+    // The target is passed via imm_ptr(), like the other call tests do,
+    // instead of a C-style cast of the function pointer to uint64_t.
+    call = cc.call(imm_ptr(calledFuncAdd), FuncSignature2<int, int, int>(CallConv::kIdHost));
+    call->setArg(0, x);
+    call->setArg(1, y);
+    call->setRet(0, result);
+    cc.ret(result);
+
+    // op == 1: result = calledFuncMul(x, y).
+    cc.bind(opMul);
+    result = cc.newInt32("result_2");
+
+    call = cc.call(imm_ptr(calledFuncMul), FuncSignature2<int, int, int>(CallConv::kIdHost));
+    call->setArg(0, x);
+    call->setArg(1, y);
+    call->setRet(0, result);
+
+    cc.ret(result);
+    cc.endFunc();
+  }
+
+  // Runs the generated function once per operation and compares both results
+  // with the host callees.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int, int);
+    Func func = ptr_as_func<Func>(_func);
+
+    int arg1 = 4;
+    int arg2 = 8;
+
+    int resultAdd = func(arg1, arg2, 0);
+    int expectAdd = calledFuncAdd(arg1, arg2);
+
+    int resultMul = func(arg1, arg2, 1);
+    int expectMul = calledFuncMul(arg1, arg2);
+
+    result.setFormat("ret={add=%d, mul=%d}", resultAdd, resultMul);
+    expect.setFormat("ret={add=%d, mul=%d}", expectAdd, expectMul);
+
+    return (resultAdd == expectAdd) && (resultMul == expectMul);
+  }
+
+  // Host callees selected by `op`.
+  static int calledFuncAdd(int x, int y) { return x + y; }
+  static int calledFuncMul(int x, int y) { return x * y; }
+};
+
+// ============================================================================
+// [X86Test_CallMultiple]
+// ============================================================================
+
+// Compiles `int f(int* buf)` that, for each index 0..3, calls calledFunc()
+// twice: the first result is added to `acc0`, the second one is subtracted
+// from `acc1`. The function returns acc0 + acc1, which is 0 when every call
+// returned the right element.
+class X86Test_CallMultiple : public X86Test {
+public:
+  X86Test_CallMultiple() : X86Test("[Call] Multiple") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallMultiple());
+  }
+
+  // Host callee: returns pInt[index].
+  static int ASMJIT_FASTCALL calledFunc(int* pInt, int index) {
+    return pInt[index];
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    unsigned int i;
+
+    X86Gp buf = cc.newIntPtr("buf");
+    X86Gp acc0 = cc.newInt32("acc0");
+    X86Gp acc1 = cc.newInt32("acc1");
+
+    cc.addFunc(FuncSignature1<int, int*>(CallConv::kIdHost));
+    cc.setArg(0, buf);
+
+    cc.mov(acc0, 0);
+    cc.mov(acc1, 0);
+
+    // The loop runs while generating code, i.e. the eight calls are emitted
+    // back to back (unrolled); fresh variables are created per iteration.
+    for (i = 0; i < 4; i++) {
+      X86Gp ret = cc.newInt32("ret");
+      X86Gp ptr = cc.newIntPtr("ptr");
+      X86Gp idx = cc.newInt32("idx");
+      CCFuncCall* call;
+
+      cc.mov(ptr, buf);
+      cc.mov(idx, static_cast<int>(i));
+
+      // The target is passed via imm_ptr(), like the other call tests do,
+      // instead of a C-style cast of the function pointer to uint64_t.
+      call = cc.call(imm_ptr(calledFunc), FuncSignature2<int, int*, int>(CallConv::kIdHostFastCall));
+      call->setArg(0, ptr);
+      call->setArg(1, idx);
+      call->setRet(0, ret);
+
+      cc.add(acc0, ret);
+
+      // Arguments are reloaded before the second call.
+      cc.mov(ptr, buf);
+      cc.mov(idx, static_cast<int>(i));
+
+      call = cc.call(imm_ptr(calledFunc), FuncSignature2<int, int*, int>(CallConv::kIdHostFastCall));
+      call->setArg(0, ptr);
+      call->setArg(1, idx);
+      call->setRet(0, ret);
+
+      cc.sub(acc1, ret);
+    }
+
+    cc.add(acc0, acc1);
+    cc.ret(acc0);
+    cc.endFunc();
+  }
+
+  // Runs the generated function over a four element buffer; the additions and
+  // subtractions must cancel out.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int*);
+    Func func = ptr_as_func<Func>(_func);
+
+    int buffer[4] = { 127, 87, 23, 17 };
+
+    int resultRet = func(buffer);
+    int expectRet = 0;
+
+    result.setFormat("ret=%d", resultRet);
+    expect.setFormat("ret=%d", expectRet);
+
+    return resultRet == expectRet;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallRecursive]
+// ============================================================================
+
+// Compiles a recursive factorial: `int f(int n)` returns n when n <= 1 and
+// n * f(n - 1) otherwise. The recursive call targets the function's own
+// label.
+class X86Test_CallRecursive : public X86Test {
+public:
+  X86Test_CallRecursive() : X86Test("[Call] Recursive") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallRecursive());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    X86Gp n = cc.newInt32("val");
+    Label L_Done = cc.newLabel();
+
+    CCFunc* self = cc.addFunc(FuncSignature1<int, int>(CallConv::kIdHost));
+    cc.setArg(0, n);
+
+    // Base case: n <= 1 is returned unchanged.
+    cc.cmp(n, 1);
+    cc.jle(L_Done);
+
+    // sub = n - 1.
+    X86Gp sub = cc.newInt32("tmp");
+    cc.mov(sub, n);
+    cc.dec(sub);
+
+    // sub = f(sub).
+    CCFuncCall* node = cc.call(self->getLabel(), FuncSignature1<int, int>(CallConv::kIdHost));
+    node->setArg(0, sub);
+    node->setRet(0, sub);
+
+    // n *= sub. The anonymous variable is the extra output operand of the
+    // three-operand `mul` form (presumably the high half of the product).
+    cc.mul(cc.newInt32(), n, sub);
+
+    cc.bind(L_Done);
+    cc.ret(n);
+    cc.endFunc();
+  }
+
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*FactFn)(int);
+
+    const int expected = 1 * 2 * 3 * 4 * 5;
+    const int actual = ptr_as_func<FactFn>(_func)(5);
+
+    expect.setFormat("ret=%d", expected);
+    result.setFormat("ret=%d", actual);
+
+    return actual == expected;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallMisc1]
+// ============================================================================
+
+// Compiles `int f(int a, int b)` that pins `a` and `b` to eax/ebx, calls a
+// host function that does nothing, and afterwards returns a + b (computed by
+// `lea`). The arguments therefore have to survive the call.
+class X86Test_CallMisc1 : public X86Test {
+public:
+  X86Test_CallMisc1() : X86Test("[Call] Misc #1") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallMisc1());
+  }
+
+  // Host callee that ignores its arguments (left unnamed to avoid
+  // unused-parameter warnings).
+  static void dummy(int, int) {}
+
+  virtual void compile(X86Compiler& cc) {
+    // NOTE(review): `val` and `skip` are never referenced again. They are
+    // kept because creating them registers a variable and a label with the
+    // compiler - confirm whether that is intentional before removing them.
+    X86Gp val = cc.newInt32("val");
+    Label skip = cc.newLabel();
+
+    cc.addFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    X86Gp a = cc.newInt32("a");
+    X86Gp b = cc.newInt32("b");
+    X86Gp r = cc.newInt32("r");
+
+    cc.setArg(0, a);
+    cc.setArg(1, b);
+
+    // Request specific physical registers for both arguments.
+    cc.alloc(a, x86::eax);
+    cc.alloc(b, x86::ebx);
+
+    CCFuncCall* call = cc.call(imm_ptr(dummy), FuncSignature2<void, int, int>(CallConv::kIdHost));
+    call->setArg(0, a);
+    call->setArg(1, b);
+
+    // r = a + b, using the values as they are after the call.
+    cc.lea(r, x86::ptr(a, b));
+    cc.ret(r);
+
+    cc.endFunc();
+  }
+
+  // Calls the generated function with (44, 199) and expects their sum.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+    Func func = ptr_as_func<Func>(_func);
+
+    int resultRet = func(44, 199);
+    int expectRet = 243;
+
+    result.setFormat("ret=%d", resultRet);
+    expect.setFormat("ret=%d", expectRet);
+
+    return resultRet == expectRet;
+  }
+};
+
+// ============================================================================
+// [X86Test_CallMisc2]
+// ============================================================================
+
+// Compiles `double f(const double* p)` that loads *p into an XMM variable,
+// passes it to the host function op() and returns op()'s result.
+class X86Test_CallMisc2 : public X86Test {
+public:
+  X86Test_CallMisc2() : X86Test("[Call] Misc #2") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallMisc2());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    // The node returned by addFunc() is not needed here.
+    cc.addFunc(FuncSignature1<double, const double*>(CallConv::kIdHost));
+
+    X86Gp p = cc.newIntPtr("p");
+    X86Gp fn = cc.newIntPtr("fn");
+
+    X86Xmm arg = cc.newXmmSd("arg");
+    X86Xmm ret = cc.newXmmSd("ret");
+
+    cc.setArg(0, p);
+    cc.movsd(arg, x86::ptr(p));       // arg = *p
+    cc.mov(fn, imm_ptr(op));
+
+    // ret = op(arg)
+    CCFuncCall* call = cc.call(fn, FuncSignature1<double, double>(CallConv::kIdHost));
+    call->setArg(0, arg);
+    call->setRet(0, ret);
+
+    cc.ret(ret);
+    cc.endFunc();
+  }
+
+  // Calls the generated function with a pointer to 2.0 and compares the result
+  // with op(2.0) evaluated on the host.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef double (*Func)(const double*);
+    Func func = ptr_as_func<Func>(_func);
+
+    double arg = 2;
+
+    double resultRet = func(&arg);
+    double expectRet = op(arg);
+
+    result.setFormat("ret=%g", resultRet);
+    expect.setFormat("ret=%g", expectRet);
+
+    return resultRet == expectRet;
+  }
+
+  // Host callee: squares its argument.
+  static double op(double a) { return a * a; }
+};
+
+// ============================================================================
+// [X86Test_CallMisc3]
+// ============================================================================
+
+// Compiles `double f(const double* p)` that calls the host function op(*p)
+// and returns the negated result, computed as 0.0 - op(*p). The argument
+// variable is reused after the call to hold the final value.
+class X86Test_CallMisc3 : public X86Test {
+public:
+  X86Test_CallMisc3() : X86Test("[Call] Misc #3") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallMisc3());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    // The node returned by addFunc() is not needed here.
+    cc.addFunc(FuncSignature1<double, const double*>(CallConv::kIdHost));
+
+    X86Gp p = cc.newIntPtr("p");
+    X86Gp fn = cc.newIntPtr("fn");
+
+    X86Xmm arg = cc.newXmmSd("arg");
+    X86Xmm ret = cc.newXmmSd("ret");
+
+    cc.setArg(0, p);
+    cc.movsd(arg, x86::ptr(p));       // arg = *p
+    cc.mov(fn, imm_ptr(op));
+
+    // ret = op(arg)
+    CCFuncCall* call = cc.call(fn, FuncSignature1<double, double>(CallConv::kIdHost));
+    call->setArg(0, arg);
+    call->setRet(0, ret);
+
+    // arg = 0.0 - ret
+    cc.xorps(arg, arg);
+    cc.subsd(arg, ret);
+
+    cc.ret(arg);
+    cc.endFunc();
+  }
+
+  // Calls the generated function with a pointer to 2.0 and compares the result
+  // with -op(2.0) evaluated on the host.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef double (*Func)(const double*);
+    Func func = ptr_as_func<Func>(_func);
+
+    double arg = 2;
+
+    double resultRet = func(&arg);
+    double expectRet = -op(arg);
+
+    result.setFormat("ret=%g", resultRet);
+    expect.setFormat("ret=%g", expectRet);
+
+    return resultRet == expectRet;
+  }
+
+  // Host callee: squares its argument.
+  static double op(double a) { return a * a; }
+};
+
+// ============================================================================
+// [X86Test_CallMisc4]
+// ============================================================================
+
+// Compiles `double f(void)` that calls the argument-less host function
+// calledFunc() and returns its result. Both signatures are built at runtime
+// through FuncSignatureX rather than the FuncSignatureN templates.
+class X86Test_CallMisc4 : public X86Test {
+public:
+  X86Test_CallMisc4() : X86Test("[Call] Misc #4") {}
+
+  // Creates a new instance of this test and hands it to `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallMisc4());
+  }
+
+  virtual void compile(X86Compiler& cc) {
+    // Signature of the generated function: double (void).
+    FuncSignatureX funcPrototype;
+    funcPrototype.setCallConv(CallConv::kIdHost);
+    funcPrototype.setRet(TypeId::kF64);
+
+    // The node returned by addFunc() is not needed here.
+    cc.addFunc(funcPrototype);
+
+    // Signature of the callee: double (void).
+    FuncSignatureX callPrototype;
+    callPrototype.setCallConv(CallConv::kIdHost);
+    callPrototype.setRet(TypeId::kF64);
+    CCFuncCall* call = cc.call(imm_ptr(calledFunc), callPrototype);
+
+    X86Xmm ret = cc.newXmmSd("ret");
+    call->setRet(0, ret);
+    cc.ret(ret);
+
+    cc.endFunc();
+  }
+
+  // Calls the generated function and expects the constant returned by
+  // calledFunc().
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef double (*Func)(void);
+    Func func = ptr_as_func<Func>(_func);
+
+    double resultRet = func();
+    double expectRet = 3.14;
+
+    result.setFormat("ret=%g", resultRet);
+    expect.setFormat("ret=%g", expectRet);
+
+    return resultRet == expectRet;
+  }
+
+  // Host callee: returns a fixed constant.
+  static double calledFunc() { return 3.14; }
+};
+
+// ============================================================================
+// [X86Test_CallMisc5]
+// ============================================================================
+
+// The register allocator should clobber the register used by the `call`
+// itself. Every general purpose register except BP and SP is pinned to a
+// variable before the call, while the call target has been spilled.
+class X86Test_CallMisc5 : public X86Test {
+public:
+  X86Test_CallMisc5() : X86Test("[Call] Misc #5") {}
+
+  // Register one instance of this test with `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_CallMisc5());
+  }
+
+  // Generate `int fn(void)` that sets every pinned variable to 1, calls
+  // `calledFunc()` through the spilled pointer and returns the sum of all
+  // pinned variables.
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    X86Gp target = cc.newIntPtr("pFn");
+    X86Gp regs[16];
+
+    const uint32_t gpCount = cc.getGpCount();
+    ASMJIT_ASSERT(gpCount <= ASMJIT_ARRAY_SIZE(regs));
+
+    cc.mov(target, imm_ptr(calledFunc));
+    cc.spill(target);
+
+    // Slots for BP and SP stay default-constructed.
+    for (uint32_t id = 0; id < gpCount; id++) {
+      if (id != X86Gp::kIdBp && id != X86Gp::kIdSp) {
+        regs[id] = cc.newInt32("v%u", static_cast<unsigned int>(id));
+        cc.alloc(regs[id], id);
+        cc.mov(regs[id], 1);
+      }
+    }
+
+    cc.call(target, FuncSignature0<void>(CallConv::kIdHost));
+
+    // Accumulate every variable that was created into the first one.
+    for (uint32_t id = 1; id < gpCount; id++) {
+      if (regs[id].isValid())
+        cc.add(regs[0], regs[id]);
+    }
+
+    cc.ret(regs[0]);
+    cc.endFunc();
+  }
+
+  // Expect 6 when pointers are 4 bytes wide and 14 otherwise.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(void);
+
+    int resultRet = ptr_as_func<Func>(_func)();
+    int expectRet = sizeof(void*) == 4 ? 6 : 14;
+
+    result.setFormat("ret=%d", resultRet);
+    expect.setFormat("ret=%d", expectRet);
+
+    return resultRet == expectRet;
+  }
+
+  // Callee used by the generated code; intentionally does nothing.
+  static void calledFunc() {}
+};
+
+// ============================================================================
+// [X86Test_MiscConstPool]
+// ============================================================================
+
+// Loads two 32-bit constants created by `newInt32Const()` and adds them.
+class X86Test_MiscConstPool : public X86Test {
+public:
+  X86Test_MiscConstPool() : X86Test("[Misc] ConstPool #1") {}
+
+  // Register one instance of this test with `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_MiscConstPool());
+  }
+
+  // Generate `int fn(void)` returning `200 + 33`, both operands being read
+  // from local-scope constants.
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature0<int>(CallConv::kIdHost));
+
+    X86Gp sum = cc.newInt32("v0");
+    X86Gp addend = cc.newInt32("v1");
+
+    X86Mem constA = cc.newInt32Const(kConstScopeLocal, 200);
+    X86Mem constB = cc.newInt32Const(kConstScopeLocal, 33);
+
+    cc.mov(sum, constA);
+    cc.mov(addend, constB);
+    cc.add(sum, addend);
+
+    cc.ret(sum);
+    cc.endFunc();
+  }
+
+  // The generated function must return 233.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(void);
+
+    int resultRet = ptr_as_func<Func>(_func)();
+    int expectRet = 233;
+
+    result.setFormat("ret=%d", resultRet);
+    expect.setFormat("ret=%d", expectRet);
+
+    return resultRet == expectRet;
+  }
+};
+
+// ============================================================================
+// [X86Test_MiscMultiRet]
+// ============================================================================
+
+// A function with several `ret` points: the first argument selects which
+// arithmetic operation is applied to the other two.
+struct X86Test_MiscMultiRet : public X86Test {
+  X86Test_MiscMultiRet() : X86Test("[Misc] MultiRet") {}
+
+  // Register one instance of this test with `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_MiscMultiRet());
+  }
+
+  // Generate `int fn(int op, int a, int b)`:
+  //   op == 0 -> a + b
+  //   op == 1 -> a - b
+  //   op == 2 -> a * b
+  //   op == 3 -> a / b (0 when `b` is zero)
+  //   else    -> 0
+  virtual void compile(X86Compiler& cc) {
+    cc.addFunc(FuncSignature3<int, int, int, int>(CallConv::kIdHost));
+
+    X86Gp selector = cc.newInt32("op");
+    X86Gp lhs = cc.newInt32("a");
+    X86Gp rhs = cc.newInt32("b");
+
+    Label zeroCase = cc.newLabel();
+    Label addCase = cc.newLabel();
+    Label subCase = cc.newLabel();
+    Label mulCase = cc.newLabel();
+    Label divCase = cc.newLabel();
+
+    cc.setArg(0, selector);
+    cc.setArg(1, lhs);
+    cc.setArg(2, rhs);
+
+    // Dispatch on the selector.
+    cc.cmp(selector, 0);
+    cc.jz(addCase);
+
+    cc.cmp(selector, 1);
+    cc.jz(subCase);
+
+    cc.cmp(selector, 2);
+    cc.jz(mulCase);
+
+    cc.cmp(selector, 3);
+    cc.jz(divCase);
+
+    // Unknown selector falls through to here; also the divide-by-zero target.
+    cc.bind(zeroCase);
+    cc.xor_(lhs, lhs);
+    cc.ret(lhs);
+
+    cc.bind(addCase);
+    cc.add(lhs, rhs);
+    cc.ret(lhs);
+
+    cc.bind(subCase);
+    cc.sub(lhs, rhs);
+    cc.ret(lhs);
+
+    cc.bind(mulCase);
+    cc.imul(lhs, rhs);
+    cc.ret(lhs);
+
+    cc.bind(divCase);
+    cc.cmp(rhs, 0);
+    cc.jz(zeroCase);
+
+    // The high part of the dividend is cleared before dividing.
+    X86Gp highPart = cc.newInt32("zero");
+    cc.xor_(highPart, highPart);
+    cc.idiv(highPart, lhs, rhs);
+    cc.ret(lhs);
+
+    cc.endFunc();
+  }
+
+  // Run all four operations on (44, 3) and compare the formatted results.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int, int);
+
+    Func func = ptr_as_func<Func>(_func);
+
+    const int a = 44;
+    const int b = 3;
+
+    int added = func(0, a, b);
+    int subtracted = func(1, a, b);
+    int multiplied = func(2, a, b);
+    int divided = func(3, a, b);
+
+    result.setFormat("ret={%d %d %d %d}", added, subtracted, multiplied, divided);
+    expect.setFormat("ret={%d %d %d %d}", a + b, a - b, a * b, a / b);
+
+    return result.eq(expect);
+  }
+};
+
+// ============================================================================
+// [X86Test_MiscMultiFunc]
+// ============================================================================
+
+// Two functions emitted by the same compiler, the first one calling the
+// second one through its label.
+class X86Test_MiscMultiFunc : public X86Test {
+public:
+  X86Test_MiscMultiFunc() : X86Test("[Misc] MultiFunc") {}
+
+  // Register one instance of this test with `mgr`.
+  static void add(X86TestManager& mgr) {
+    mgr.add(new X86Test_MiscMultiFunc());
+  }
+
+  // Generate `int outer(int, int)` which forwards both arguments to
+  // `int inner(int, int)`, and `inner` which returns their sum.
+  virtual void compile(X86Compiler& cc) {
+    CCFunc* outer = cc.newFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+    CCFunc* inner = cc.newFunc(FuncSignature2<int, int, int>(CallConv::kIdHost));
+
+    // Outer function: calls `inner` and returns whatever it returned.
+    {
+      X86Gp lhs = cc.newInt32("a");
+      X86Gp rhs = cc.newInt32("b");
+
+      cc.addFunc(outer);
+      cc.setArg(0, lhs);
+      cc.setArg(1, rhs);
+
+      CCFuncCall* invoke = cc.call(inner->getLabel(), FuncSignature2<int, int, int>(CallConv::kIdHost));
+      invoke->setArg(0, lhs);
+      invoke->setArg(1, rhs);
+      invoke->setRet(0, lhs);
+
+      cc.ret(lhs);
+      cc.endFunc();
+    }
+
+    // Inner function: plain addition.
+    {
+      X86Gp lhs = cc.newInt32("a");
+      X86Gp rhs = cc.newInt32("b");
+
+      cc.addFunc(inner);
+      cc.setArg(0, lhs);
+      cc.setArg(1, rhs);
+
+      cc.add(lhs, rhs);
+      cc.ret(lhs);
+      cc.endFunc();
+    }
+  }
+
+  // Calling the generated code with (56, 22) must yield their sum.
+  virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+    typedef int (*Func)(int, int);
+
+    int resultRet = ptr_as_func<Func>(_func)(56, 22);
+    int expectRet = 56 + 22;
+
+    result.setFormat("ret=%d", resultRet);
+    expect.setFormat("ret=%d", expectRet);
+
+    return result.eq(expect);
+  }
+};
+
+// ============================================================================
+// [X86Test_MiscUnfollow]
+// ============================================================================
+
+// Global (I didn't find a better way to test this). It is shared between
+// `X86Test_MiscUnfollow::run()`, which calls `setjmp()` on it, and
+// `X86Test_MiscUnfollow::handler()`, which calls `longjmp()` on it.
+static jmp_buf globalJmpBuf;
+
+// Tests an `unfollow()`-ed indirect jump: the generated code jumps to a
+// handler passed as an argument and control gets back to `run()` through
+// `longjmp()`.
+class X86Test_MiscUnfollow : public X86Test {
+public:
+ X86Test_MiscUnfollow() : X86Test("[Misc] Unfollow") {}
+
+ // Register one instance of this test with `mgr`.
+ static void add(X86TestManager& mgr) {
+ mgr.add(new X86Test_MiscUnfollow());
+ }
+
+ // Generate a function taking `(int a, void* b)`: when `a` is zero it jumps
+ // to the address held in `b`, otherwise it returns.
+ virtual void compile(X86Compiler& cc) {
+ // NOTE: Fastcall calling convention is the most appropriate here, as all
+ // arguments will be passed by registers and there won't be any stack
+ // misalignment when we call the `handler()`. This was failing on OSX
+ // when targeting 32-bit.
+ cc.addFunc(FuncSignature2<void, int, void*>(CallConv::kIdHostFastCall));
+
+ X86Gp a = cc.newInt32("a");
+ X86Gp b = cc.newIntPtr("b");
+
+ Label tramp = cc.newLabel();
+
+ cc.setArg(0, a);
+ cc.setArg(1, b);
+
+ cc.cmp(a, 0);
+ cc.jz(tramp);
+
+ // NOTE(review): the signature above declares a `void` return while `a` is
+ // passed to `ret()` here and `run()` calls the code through a typedef that
+ // returns `int`; this path is not taken by `run()` - confirm it is intended.
+ cc.ret(a);
+
+ cc.bind(tramp);
+ cc.unfollow().jmp(b);
+
+ cc.endFunc();
+ }
+
+ // Call the generated code with `a == 0` so it jumps to `handler()`. The test
+ // passes when `setjmp()` returns non-zero, i.e. when `handler()` has called
+ // `longjmp()`.
+ virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) {
+ typedef int (ASMJIT_FASTCALL *Func)(int, void*);
+
+ Func func = ptr_as_func<Func>(_func);
+
+ int resultRet = 0;
+ int expectRet = 1;
+
+ // First pass: `setjmp()` returns 0 and the generated code is called.
+ // Second pass (after `longjmp()`): `setjmp()` returns 1 and the result is
+ // set to the expected value.
+ if (!setjmp(globalJmpBuf))
+ resultRet = func(0, (void*)handler);
+ else
+ resultRet = 1;
+
+ result.setFormat("ret={%d}", resultRet);
+ expect.setFormat("ret={%d}", expectRet);
+
+ return resultRet == expectRet;
+ }
+
+ // Jump target of the generated code; never returns, it unwinds to `run()`.
+ static void ASMJIT_FASTCALL handler() { longjmp(globalJmpBuf, 1); }
+};
+
+// ============================================================================
+// [CmdLine]
+// ============================================================================
+
+// Minimal read-only view of the program arguments.
+class CmdLine {
+public:
+  // Store `argc` and `argv` as given; the array is referenced, not copied.
+  CmdLine(int argc, char** argv)
+    : _argc(argc),
+      _argv(argv) {}
+
+  // Get whether `arg` matches one of the arguments exactly. The element at
+  // index 0 is skipped. The method is `const` as it never modifies the
+  // object, so it's usable through a `const CmdLine&` as well.
+  bool hasArg(const char* arg) const {
+    for (int i = 1; i < _argc; i++) {
+      if (::strcmp(_argv[i], arg) == 0)
+        return true;
+    }
+    return false;
+  }
+
+  // --------------------------------------------------------------------------
+  // [Members]
+  // --------------------------------------------------------------------------
+
+  int _argc;     // Number of entries in `_argv`.
+  char** _argv;  // Argument strings as passed to the constructor.
+};
+
+// ============================================================================
+// [Main]
+// ============================================================================
+
+// Registers the test `CLASS` with the `manager` local of `main()`.
+#define ADD_TEST(CLASS) CLASS::add(manager)
+
+// Registers all tests and runs them; `--verbose` enables verbose output.
+// Returns the value returned by `X86TestManager::run()`.
+int main(int argc, char* argv[]) {
+  X86TestManager manager;
+  CmdLine args(argc, argv);
+
+  if (args.hasArg("--verbose"))
+    manager._verbose = true;
+
+  // Align.
+  ADD_TEST(X86Test_AlignBase);
+  ADD_TEST(X86Test_AlignNone);
+
+  // Jump.
+  ADD_TEST(X86Test_JumpCross);
+  ADD_TEST(X86Test_JumpMany);
+  ADD_TEST(X86Test_JumpUnreachable1);
+  ADD_TEST(X86Test_JumpUnreachable2);
+
+  // Alloc.
+  ADD_TEST(X86Test_AllocBase);
+  ADD_TEST(X86Test_AllocManual);
+  ADD_TEST(X86Test_AllocUseMem);
+  ADD_TEST(X86Test_AllocMany1);
+  ADD_TEST(X86Test_AllocMany2);
+  ADD_TEST(X86Test_AllocImul1);
+  ADD_TEST(X86Test_AllocImul2);
+  ADD_TEST(X86Test_AllocIdiv1);
+  ADD_TEST(X86Test_AllocSetz);
+  ADD_TEST(X86Test_AllocShlRor);
+  ADD_TEST(X86Test_AllocGpLo);
+  ADD_TEST(X86Test_AllocRepMovsb);
+  ADD_TEST(X86Test_AllocIfElse1);
+  ADD_TEST(X86Test_AllocIfElse2);
+  ADD_TEST(X86Test_AllocIfElse3);
+  ADD_TEST(X86Test_AllocIfElse4);
+  ADD_TEST(X86Test_AllocInt8);
+  ADD_TEST(X86Test_AllocArgsIntPtr);
+  ADD_TEST(X86Test_AllocArgsFloat);
+  ADD_TEST(X86Test_AllocArgsDouble);
+  ADD_TEST(X86Test_AllocRetFloat);
+  ADD_TEST(X86Test_AllocRetDouble);
+  ADD_TEST(X86Test_AllocStack1);
+  ADD_TEST(X86Test_AllocStack2);
+  ADD_TEST(X86Test_AllocMemcpy);
+  ADD_TEST(X86Test_AllocAlphaBlend);
+
+  // Call.
+  ADD_TEST(X86Test_CallBase);
+  ADD_TEST(X86Test_CallFast);
+  ADD_TEST(X86Test_CallManyArgs);
+  ADD_TEST(X86Test_CallDuplicateArgs);
+  ADD_TEST(X86Test_CallImmArgs);
+  ADD_TEST(X86Test_CallPtrArgs);
+  ADD_TEST(X86Test_CallFloatAsXmmRet);
+  ADD_TEST(X86Test_CallDoubleAsXmmRet);
+  ADD_TEST(X86Test_CallConditional);
+  ADD_TEST(X86Test_CallMultiple);
+  ADD_TEST(X86Test_CallRecursive);
+  ADD_TEST(X86Test_CallMisc1);
+  ADD_TEST(X86Test_CallMisc2);
+  ADD_TEST(X86Test_CallMisc3);
+  ADD_TEST(X86Test_CallMisc4);
+  ADD_TEST(X86Test_CallMisc5);
+
+  // Misc.
+  ADD_TEST(X86Test_MiscConstPool);
+  ADD_TEST(X86Test_MiscMultiRet);
+  ADD_TEST(X86Test_MiscMultiFunc);
+  ADD_TEST(X86Test_MiscUnfollow);
+
+  return manager.run();
+}
diff --git a/test/broken.cpp b/test/broken.cpp
new file mode 100644
index 0000000..4dd91e8
--- /dev/null
+++ b/test/broken.cpp
@@ -0,0 +1,278 @@
+// [Broken]
+// Lightweight Unit Testing for C++.
+//
+// [License]
+// Public Domain (Unlicense)
+
+// [Dependencies]
+#include "./broken.h"
+
+// ============================================================================
+// [Broken - Global]
+// ============================================================================
+
+// Framework state. The only instance has static storage duration, so all
+// members start zero initialized.
+struct BrokenGlobal {
+  // --------------------------------------------------------------------------
+  // [Accessors]
+  // --------------------------------------------------------------------------
+
+  // Get whether `a` equals one of the application arguments; the element at
+  // index 0 is not considered.
+  bool hasArg(const char* a) const {
+    for (int index = 1; index < _argc; index++) {
+      if (::strcmp(_argv[index], a) == 0)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Get the output stream, which is `stdout` unless a file has been set.
+  FILE* getFile() const {
+    if (_file)
+      return _file;
+    return stdout;
+  }
+
+  // --------------------------------------------------------------------------
+  // [Members]
+  // --------------------------------------------------------------------------
+
+  // Application arguments.
+  int _argc;
+  const char** _argv;
+
+  // Output file.
+  FILE* _file;
+
+  // Current context.
+  const char* _currentFile;
+  int _currentLine;
+
+  // Unit tests.
+  BrokenAPI::Unit* _unitList;
+  BrokenAPI::Unit* _unitRunning;
+};
+static BrokenGlobal _brokenGlobal;
+
+// ============================================================================
+// [Broken - API]
+// ============================================================================
+
+// Get whether `b` is a prefix of `a`. An empty `b` is a prefix of any string.
+static bool BrokenAPI_startsWith(const char* a, const char* b) {
+  while (*b != '\0') {
+    if (*a != *b)
+      return false;
+
+    a++;
+    b++;
+  }
+
+  return true;
+}
+
+// Fold a character for filter matching: `-` becomes `_` and ASCII upper-case
+// letters become lower-case.
+static unsigned char BrokenAPI_foldFilterChar(unsigned char c) {
+  if (c == '-')
+    return '_';
+
+  if (c >= 'A' && c <= 'Z')
+    return static_cast<unsigned char>(c + ('a' - 'A'));
+
+  return c;
+}
+
+// Get whether the name `a` matches the filter `b`. The comparison ignores
+// ASCII case and treats `-` as `_`; a `*` in the filter matches the whole
+// remainder of the name.
+static bool BrokenAPI_matchesFilter(const char* a, const char* b) {
+  for (;;) {
+    unsigned char nameChar = static_cast<unsigned char>(*a++);
+    unsigned char filterChar = static_cast<unsigned char>(*b++);
+
+    // If filter is defined as wildcard the rest automatically matches.
+    if (filterChar == '*')
+      return true;
+
+    nameChar = BrokenAPI_foldFilterChar(nameChar);
+    filterChar = BrokenAPI_foldFilterChar(filterChar);
+
+    if (nameChar != filterChar)
+      return false;
+
+    // Both strings ended at the same position.
+    if (nameChar == '\0')
+      return true;
+  }
+}
+
+// Get whether `unit` is selected by the application arguments. Every
+// `--run-<filter>` argument except `--run-all` is a filter; a unit runs when
+// it matches at least one filter or when no filter has been given.
+static bool BrokenAPI_canRun(BrokenAPI::Unit* unit) {
+  const BrokenGlobal& global = _brokenGlobal;
+  bool filterSeen = false;
+
+  for (int index = 1; index < global._argc; index++) {
+    const char* option = global._argv[index];
+
+    if (!BrokenAPI_startsWith(option, "--run-") || ::strcmp(option, "--run-all") == 0)
+      continue;
+
+    filterSeen = true;
+
+    // Skip the 6 characters of "--run-" to get the filter itself.
+    if (BrokenAPI_matchesFilter(unit->name, option + 6))
+      return true;
+  }
+
+  // If no filter has been specified the default is to run.
+  return !filterSeen;
+}
+
+// Announce `unit` and call its entry point. The unit is published as the
+// running one for the duration of the call.
+static void BrokenAPI_runUnit(BrokenAPI::Unit* unit) {
+  BrokenGlobal& global = _brokenGlobal;
+
+  BrokenAPI::info("Running %s", unit->name);
+
+  global._unitRunning = unit;
+  unit->entry();
+  global._unitRunning = NULL;
+}
+
+// Run every registered unit accepted by `BrokenAPI_canRun()` and print a
+// summary, or a warning when no unit has been run.
+static void BrokenAPI_runAll() {
+  BrokenAPI::Unit* const first = _brokenGlobal._unitList;
+  size_t executed = 0;
+
+  for (BrokenAPI::Unit* unit = first; unit != NULL; unit = unit->next) {
+    if (!BrokenAPI_canRun(unit))
+      continue;
+
+    BrokenAPI_runUnit(unit);
+    executed++;
+  }
+
+  if (executed != 0) {
+    INFO("\nSuccess:");
+    INFO(" All tests passed!");
+  }
+  else {
+    // Distinguish an empty registry from a filter that matched nothing.
+    INFO("\nWarning:");
+    INFO(" No units %s!", first != NULL ? "matched the filter" : "defined");
+  }
+}
+
+// Print the names of all registered units, or a warning if there are none.
+static void BrokenAPI_listAll() {
+  BrokenAPI::Unit* unit = _brokenGlobal._unitList;
+
+  if (unit == NULL) {
+    INFO("Warning:");
+    INFO(" No units defined!");
+    return;
+  }
+
+  INFO("Units:");
+  for (; unit != NULL; unit = unit->next) {
+    INFO(" %s", unit->name);
+  }
+}
+
+// Insert `unit` into the global list of units.
+void BrokenAPI::add(Unit* unit) {
+  // C++ static initialization doesn't guarantee anything. We sort all units by
+  // name so the execution will always happen in deterministic order.
+  Unit** link = &_brokenGlobal._unitList;
+
+  // Stop at the first unit whose name doesn't compare lower than the new one.
+  while (*link != NULL && ::strcmp((*link)->name, unit->name) < 0)
+    link = &(*link)->next;
+
+  unit->next = *link;
+  *link = unit;
+}
+
+// Redirect the framework output to `file`; `BrokenGlobal::getFile()` falls
+// back to `stdout` while it's NULL.
+void BrokenAPI::setOutputFile(FILE* file) {
+  _brokenGlobal._file = file;
+}
+
+// Remember `file` and `line` as the current context, which is reported by
+// `fail()`. Always returns 1 so the call can be used as the left operand of
+// `&&` in the `INFO` and `EXPECT` macros.
+int BrokenAPI::setContext(const char* file, int line) {
+  _brokenGlobal._currentFile = file;
+  _brokenGlobal._currentLine = line;
+
+  return 1;
+}
+
+// Entry point of the framework. Stores the arguments, handles `--help` and
+// `--list`, and otherwise runs the units between the optional `onBeforeRun`
+// and `onAfterRun` callbacks. Returns 0 in all cases; a failing `EXPECT`
+// terminates the process from within `fail()`.
+int BrokenAPI::run(int argc, const char* argv[],
+  Entry onBeforeRun,
+  Entry onAfterRun) {
+
+  BrokenGlobal& state = _brokenGlobal;
+
+  state._argc = argc;
+  state._argv = argv;
+
+  const bool wantsHelp = state.hasArg("--help");
+  if (wantsHelp) {
+    INFO("Options:");
+    INFO(" --help - print this usage");
+    INFO(" --list - list all tests");
+    INFO(" --run-... - run a test(s), trailing wildcards supported");
+    INFO(" --run-all - run all tests");
+    return 0;
+  }
+
+  const bool wantsList = state.hasArg("--list");
+  if (wantsList) {
+    BrokenAPI_listAll();
+    return 0;
+  }
+
+  if (onBeforeRun != NULL)
+    onBeforeRun();
+
+  // We don't care about filters here, it's implemented by `runAll`.
+  BrokenAPI_runAll();
+
+  if (onAfterRun != NULL)
+    onAfterRun();
+
+  return 0;
+}
+
+// Print a printf-like message to the output file and flush it. A prefix is
+// written before a non-empty message while a unit is running, and a newline
+// is appended unless the format already ends with one. Always returns 1.
+int BrokenAPI::info(const char* fmt, ...) {
+  FILE* out = _brokenGlobal.getFile();
+  const size_t fmtLen = ::strlen(fmt);
+
+  if (fmtLen > 0) {
+    va_list args;
+    va_start(args, fmt);
+    ::fputs(_brokenGlobal._unitRunning ? " " : "", out);
+    ::vfprintf(out, fmt, args);
+    va_end(args);
+  }
+
+  const bool endsWithNewline = fmtLen > 0 && fmt[fmtLen - 1] == '\n';
+  if (!endsWithNewline)
+    ::fputs("\n", out);
+
+  ::fflush(out);
+  return 1;
+}
+
+// Report a failed expectation and terminate the process with exit code 1.
+// `fmt` may be NULL, in which case no message is printed. The location
+// printed is the one recorded by the last `setContext()` call.
+int BrokenAPI::fail(const char* fmt, va_list ap) {
+  const BrokenGlobal& state = _brokenGlobal;
+  FILE* out = state.getFile();
+
+  const char* message = fmt ? fmt : "";
+  const size_t messageLen = ::strlen(message);
+
+  ::fputs(" Failed!", out);
+
+  if (messageLen > 0) {
+    ::fputs(" ", out);
+    ::vfprintf(out, message, ap);
+
+    // Terminate the message line if the format didn't do it.
+    if (message[messageLen - 1] != '\n')
+      ::fputs("\n", out);
+  }
+
+  ::fprintf(out, " File: %s (Line: %d)\n", state._currentFile, state._currentLine);
+  ::fflush(out);
+
+  ::exit(1);
+  return 1;
+}
diff --git a/test/broken.h b/test/broken.h
new file mode 100644
index 0000000..ad6a058
--- /dev/null
+++ b/test/broken.h
@@ -0,0 +1,145 @@
+// [Broken]
+// Lightweight Unit Testing for C++.
+//
+// [License]
+// Public Domain (Unlicense)
+
+// [Guard]
+#ifndef BROKEN_INTERNAL_H
+#define BROKEN_INTERNAL_H
+
+// [Dependencies]
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// ============================================================================
+// [Broken - Detection]
+// ============================================================================
+
+// Define `BROKEN_NOINLINE` as the compiler-specific way of preventing a
+// function from being inlined. It expands to nothing if the compiler is not
+// recognized.
+#if defined(__GNUC__) && defined(__GNUC_MINOR__)
+# if (__GNUC__ * 1000 + __GNUC_MINOR__) >= 3004
+#  define BROKEN_NOINLINE __attribute__((__noinline__))
+# endif
+#elif defined(__clang__)
+# if __has_attribute(__noinline__)
+#  define BROKEN_NOINLINE __attribute__((__noinline__))
+# endif
+#elif defined(_MSC_VER)
+// The macro to define is `BROKEN_NOINLINE`. Defining `__declspec(noinline)`
+// instead would turn `__declspec` itself into a macro that expands to nothing.
+# define BROKEN_NOINLINE __declspec(noinline)
+#endif
+
+#if !defined(BROKEN_NOINLINE)
+# define BROKEN_NOINLINE
+#endif
+
+// Hide everything when using Doxygen. Ideally this would be protected by a
+// macro, but there is no single macro that is widely used across projects.
+
+//! \internal
+//! \{
+
+// ============================================================================
+// [Broken - API]
+// ============================================================================
+
+//! Static-only interface of the `Broken` unit-testing framework: unit
+//! registration, running, logging and failure reporting.
+struct BrokenAPI {
+ //! Entry point of a unit test defined by `UNIT` macro.
+ typedef void (*Entry)(void);
+
+ //! Test defined by `UNIT` macro.
+ struct Unit {
+ //! Name of the unit (the stringified `UNIT` argument).
+ const char* name;
+ //! Function that implements the unit.
+ Entry entry;
+ //! Initialized to zero by `AutoUnit`.
+ size_t finished;
+ //! Next unit in the singly linked list of registered units.
+ Unit* next;
+ };
+
+ //! Automatic unit registration by using static initialization.
+ struct AutoUnit : Unit {
+ //! Initialize the unit and register it by calling `BrokenAPI::add()`.
+ inline AutoUnit(const char* _name, Entry _entry) {
+ name = _name;
+ entry = _entry;
+ finished = false;
+ next = NULL;
+
+ BrokenAPI::add(this);
+ }
+ };
+
+ //! Register a new unit test (called automatically by `AutoUnit` and `UNIT`).
+ static void add(Unit* unit);
+
+ //! Set output file to a `file`.
+ static void setOutputFile(FILE* file);
+
+ //! Set the current context to `file` and `line`.
+ //!
+ //! This is called by `EXPECT` macro to set the correct `file` and `line`,
+ //! because `EXPECT` macro internally calls `expect()` function, which would
+ //! otherwise report the non-interesting location inside `broken.h`.
+ static int setContext(const char* file, int line);
+
+ //! Run the `Broken` framework with the application arguments `argc` and
+ //! `argv`.
+ //!
+ //! `onBeforeRun` and `onAfterRun` are optional callbacks of type `Entry`.
+ //! Returns an `int`.
+ static int run(int argc, const char* argv[],
+ Entry onBeforeRun = (Entry)NULL,
+ Entry onAfterRun = (Entry)NULL);
+
+ //! Used internally by `EXPECT` macro.
+ //!
+ //! Returns 1 if `exp` evaluates to true, otherwise forwards `fmt` and the
+ //! variable arguments to `fail()`.
+ template<typename T>
+ BROKEN_NOINLINE static int expect(const T& exp, const char* fmt = NULL, ...) {
+ if (exp)
+ return 1;
+
+ va_list ap;
+ va_start(ap, fmt);
+ fail(fmt, ap);
+ va_end(ap);
+ return 0;
+ }
+
+ //! Log message, adds automatically new line if not present.
+ static int info(const char* fmt, ...);
+ //! Called on `EXPECT()` failure.
+ static int fail(const char* fmt, va_list ap);
+};
+
+// ============================================================================
+// [Broken - Macros]
+// ============================================================================
+
+//! Define a unit test.
+//!
+//! `_Name_` can only contain ASCII letters, numbers and underscore. It has
+//! the same rules as identifiers in C and C++.
+//!
+//! The macro forward-declares the entry function, defines a static
+//! `AutoUnit` that registers it under the stringified `_Name_`, and ends with
+//! the header of the entry function so a body can follow directly.
+#define UNIT(_Name_) \
+ static void unit_##_Name_##_entry(void); \
+ \
+ static ::BrokenAPI::AutoUnit unit_##_Name_##_autoinit( \
+ #_Name_, unit_##_Name_##_entry); \
+ \
+ static void unit_##_Name_##_entry(void)
+
+//! #define INFO(_Format_ [, ...])
+//!
+//! Informative message printed to the output file (`stdout` by default).
+//! The current context is set to the location of the macro use first.
+#define INFO ::BrokenAPI::setContext(__FILE__, __LINE__) && ::BrokenAPI::info
+
+//! #define EXPECT(_Exp_ [, _Format_ [, ...]])
+//!
+//! Expect `_Exp_` to be true or to evaluate to true, fail otherwise.
+//! The current context is set to the location of the macro use first.
+#define EXPECT ::BrokenAPI::setContext(__FILE__, __LINE__) && ::BrokenAPI::expect
+
+// ============================================================================
+// [Broken - Cleanup]
+// ============================================================================
+
+// `BROKEN_NOINLINE` is only used by the declarations in this header, so it is
+// undefined here.
+#undef BROKEN_NOINLINE
+
+//! \}
+
+// [Guard]
+#endif // BROKEN_INTERNAL_H