Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/torch/luajit-rocks.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ronan Collobert <locronan@fb.com>, 2015-11-24 00:15:07 +0300
committer: Ronan Collobert <locronan@fb.com>, 2015-11-24 00:15:07 +0300
commit: 320a2b2b39e8b01a632d2bf5411eee0c0b3aef4f (patch)
tree: 6a2d5b8ca6b6a62d59636bd6f10d64fa313a08b1
parent: 2f41c04ac3fe7863308b05f3aeaad9bed43fe0a6 (diff)
Squashed 'luajit-2.1/' changes from 361827c..3a0b2a9
3a0b2a9 x64/LJ_GC64: Fix stack growth in vararg function setup. 4f87367 Document minimum compiler requirements for 2.1 branch. 0aa337a Merge branch 'master' into v2.1 367cba2 DynASM/x86: Add rdpmc instruction. 7991a66 Extend Valgrind suppressions for non-inlined C code. 0029110 Update changelog. 42b9c98 iOS/ARM64: Add build instructions. 3ad7734 Merge branch 'master' into v2.1 1393b2f Update build instructions. f416cf6 OSX: Switch to Clang as the default compiler. ffb124e Merge branch 'master' into v2.1 3639ff4 iOS: Disable os.execute() when building for iOS >= 8.0. 6a54411 x86/x64: Disassemble AVX/AVX2 instructions. 4c08158 Whitespace. d62dc01 DynASM: Bump version to 1.4. 8a13c9c DynASM/x86: Add AVX and AVX2 opcodes. 7e22082 DynASM/x86: Add AES-NI opcodes. e54ca42 Merge branch 'master' into v2.1 7579b16 DynASM/x86: Restrict shld/shrd to operands with same width. 9dc785e DynASM/x86: Fix some SSE instruction templates. efaf24f Fix pairs() recording. b20642c Merge branch 'master' into v2.1 e5f4cb1 FFI: Fix ipairs() recording. 776c693 Merge branch 'master' into v2.1 4808e62 Drop marks from replayed instructions when sinking. 22a9ed8 Merge branch 'master' into v2.1 2e85af8 Flush symbol tables in jit.dump on trace flush. 86c21bd Add collectgarbage("isrunning"). 0b09c97 Merge branch 'master' into v2.1 ef087aa Fix unsinking check. 52ea1a3 FFI: Add ssize_t declaration. 49427df Merge branch 'master' into v2.1 076d625 Fix Cygwin support. Note: this is not a supported target. 55c3b29 Parse Unicode string escape \u{XX...}. a3a6866 Re-enable trace stitching. ad29314 ARM64: Fix ELF bytecode saving. fb77f7d RELEASE LuaJIT-2.1.0-beta1 0a3cd94 ARM64: Add support for saving bytecode as object files. ba617df Update changelog. 718af62 Add note about bytecode compatibility wrt. LJ_GC64/LJ_FR2. 9d18567 Remove LuaJIT 1.x change history. ca78889 Merge branch 'master' into v2.1 8715ae6 Properly handle OOM in trace_save(). 
b86fc2b Merge branch 'master' into v2.1 424940f Fix NYICF error message. eb050f9 Parse binary number literals (0bxxx). 0dcd2d1 Merge branch 'master' into v2.1 a5b1c4d FFI: Don't propagate qualifiers into subtypes of complex. ba6344d ARM64: Fix __call metamethod handling for tail calls. 5874c21 Merge branch 'master' into v2.1 575bfe5 Limit number of arguments given to io.lines() and fp:lines(). 28d8728 Fix Xbox One port. fe56522 Disable table allocation bump optimization (for now). 26b95a9 Add Xbox One port. 458a40b Fix table allocation bump optimization (sigh). 5081e38 Merge branch 'master' into v2.1 7f454ae Fix narrowing of TOBIT. 08e100e x64/LJ_GC64: Fix ipairs(). c5d262f Fix table allocation bump optimization (again). fb19df9 Merge branch 'master' into v2.1 7234789 x64: Allow building without external unwinder. 4da1bb6 x64/LJ_GC64: Fix upvalue write barrier. 5de95ed x64/LJ_GC64: Fix setmetatable(). 60fb3fe Fix table allocation bump optimization. 4a146ca LJ_GC64: Fix frame unwinding. bca5da8 Fix for last commit. b82fc3d Bump table allocations retroactively if they grow later on. d8cfc37 Merge branch 'master' into v2.1 69e5342 RELEASE LuaJIT-2.0.4 cd4c59f Update changelog. a836b7e Cleanup. 4cec6a5 Merge branch 'master' into v2.1 9622d6b x86/x64: Fix argument check for bit shifts. a92e730 x64: Add LJ_GC64 mode interpreter. 5caf53d Fix broken Windows build. c2924c3 Disable trace stitching (for now) due to a design mistake. 74caac9 Merge branch 'master' into v2.1 11106aa Fix stack check in narrowing optimization. 1e67a4c ARM64: Fix math.floor/math.ceil for string args. 1c968d5 DynASM/PPC: Add sub/shift/rotate/clear instruction aliases. b2a5cc8 DynASM/PPC: Add support for parameterized shifts/masks. dec4acc PPC: Fix cross-endian builds. 8002871 PPC: Fix write barrier in BC_TSETR. 4e52d1f Merge branch 'master' into v2.1 a9fd686 Doc cleanup. git-subtree-dir: luajit-2.1 git-subtree-split: 3a0b2a907b0168be8ef7aade95bd2ec3991fa78c
-rw-r--r-- Makefile 2
-rw-r--r-- README 2
-rw-r--r-- doc/changes.html 337
-rw-r--r-- doc/ext_ffi_api.html 4
-rw-r--r-- doc/ext_ffi_semantics.html 2
-rw-r--r-- doc/extensions.html 14
-rw-r--r-- doc/install.html 52
-rw-r--r-- doc/luajit.html 2
-rw-r--r-- doc/running.html 2
-rw-r--r-- doc/status.html 13
-rw-r--r-- dynasm/dasm_arm.lua 6
-rw-r--r-- dynasm/dasm_arm64.lua 6
-rw-r--r-- dynasm/dasm_mips.lua 6
-rw-r--r-- dynasm/dasm_ppc.h 11
-rw-r--r-- dynasm/dasm_ppc.lua 108
-rw-r--r-- dynasm/dasm_proto.h 4
-rw-r--r-- dynasm/dasm_x86.h 7
-rw-r--r-- dynasm/dasm_x86.lua 428
-rw-r--r-- dynasm/dynasm.lua 6
-rw-r--r-- etc/luajit.pc 2
-rw-r--r-- src/Makefile 58
-rw-r--r-- src/jit/bcsave.lua 16
-rw-r--r-- src/jit/dis_x86.lua 225
-rw-r--r-- src/jit/dump.lua 1
-rw-r--r-- src/lib_base.c 6
-rw-r--r-- src/lib_io.c 19
-rw-r--r-- src/lib_os.c 2
-rw-r--r-- src/lib_package.c 10
-rw-r--r-- src/lj.supp 15
-rw-r--r-- src/lj_alloc.c 2
-rw-r--r-- src/lj_api.c 3
-rw-r--r-- src/lj_arch.h 22
-rw-r--r-- src/lj_ccallback.c 7
-rw-r--r-- src/lj_cdata.c 7
-rw-r--r-- src/lj_clib.c 16
-rw-r--r-- src/lj_cparse.c 4
-rw-r--r-- src/lj_ctype.c 2
-rw-r--r-- src/lj_err.c 14
-rw-r--r-- src/lj_ffrecord.c 16
-rw-r--r-- src/lj_frame.h 20
-rw-r--r-- src/lj_gc.c 2
-rw-r--r-- src/lj_gdbjit.c 4
-rw-r--r-- src/lj_ir.c 44
-rw-r--r-- src/lj_iropt.h 1
-rw-r--r-- src/lj_jit.h 18
-rw-r--r-- src/lj_lex.c 27
-rw-r--r-- src/lj_opt_narrow.c 21
-rw-r--r-- src/lj_profile.c 2
-rw-r--r-- src/lj_record.c 102
-rw-r--r-- src/lj_snap.c 32
-rw-r--r-- src/lj_strscan.c 57
-rw-r--r-- src/lj_tab.c 6
-rw-r--r-- src/lj_tab.h 1
-rw-r--r-- src/lj_target_x86.h 4
-rw-r--r-- src/lj_trace.c 35
-rw-r--r-- src/lj_traceerr.h 1
-rw-r--r-- src/lua.h 1
-rw-r--r-- src/luaconf.h 2
-rw-r--r-- src/luajit.h 4
-rw-r--r-- src/ps4build.bat 4
-rw-r--r-- src/vm_arm.dasc 13
-rw-r--r-- src/vm_arm64.dasc 5
-rw-r--r-- src/vm_mips.dasc 10
-rw-r--r-- src/vm_ppc.dasc 19
-rw-r--r-- src/vm_x64.dasc 4902
-rw-r--r-- src/vm_x86.dasc 47
-rw-r--r-- src/xb1build.bat 101
67 files changed, 6327 insertions, 617 deletions
diff --git a/Makefile b/Makefile
index dce52e5..8ce773e 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@
MAJVER= 2
MINVER= 1
RELVER= 0
-PREREL= -alpha
+PREREL= -beta1
VERSION= $(MAJVER).$(MINVER).$(RELVER)$(PREREL)
ABIVER= 5.1
diff --git a/README b/README
index 6dcd9b9..ca70dd8 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README for LuaJIT 2.1.0-alpha
+README for LuaJIT 2.1.0-beta1
-----------------------------
LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language.
diff --git a/doc/changes.html b/doc/changes.html
index 0d3e5ae..826cd24 100644
--- a/doc/changes.html
+++ b/doc/changes.html
@@ -65,7 +65,7 @@ div.major { max-width: 600px; padding: 1em; margin: 1em 0 1em 0; }
<div id="main">
<p>
This is a list of changes between the released versions of LuaJIT.<br>
-The current <span style="color: #0000c0;">stable version</span> is <strong>LuaJIT&nbsp;2.0.3</strong>.<br>
+The current <span style="color: #0000c0;">stable version</span> is <strong>LuaJIT&nbsp;2.0.4</strong>.<br>
</p>
<p>
Please check the
@@ -74,6 +74,112 @@ to see whether newer versions are available.
</p>
<div class="major" style="background: #d0d0ff;">
+<h2 id="LuaJIT-2.1.0-beta1">LuaJIT 2.1.0-beta1 &mdash; 2015-08-25</h2>
+<p>
+This is a brief summary of the major changes in LuaJIT 2.1 compared to 2.0.
+Please take a look at the commit history for more details.
+</p>
+<ul>
+<li>Changes to the VM core:
+<ul>
+<li>Add low-overhead profiler (<tt>-jp</tt>).</li>
+<li>Add <tt>LJ_GC64</tt> mode: 64 bit GC object references (really: 47 bit). Interpreter-only for now.</li>
+<li>Add <tt>LJ_FR2</tt> mode: Two-slot frame info. Required by <tt>LJ_GC64</tt> mode.</li>
+<li>Add <tt>table.new()</tt> and <tt>table.clear()</tt>.</li>
+<li>Parse Unicode escape <tt>'\u{XX...}'</tt> in string literals.</li>
+<li>Parse binary number literals (<tt>0bxxx</tt>).</li>
+</ul></li>
+<li>Improvements to the JIT compiler:
+<ul>
+<li>Add trace stitching.</li>
+<li>Compile various builtins: <tt>string.char()</tt>, <tt>string.reverse()</tt>, <tt>string.lower()</tt>, <tt>string.upper()</tt>, <tt>string.rep()</tt>, <tt>string.format()</tt>, <tt>table.concat()</tt>, <tt>bit.tohex()</tt>, <tt>getfenv(0)</tt>, <tt>debug.getmetatable()</tt>.</li>
+<li>Compile <tt>string.find()</tt> for fixed string searches (no patterns).</li>
+<li>Compile <tt>BC_TSETM</tt>, e.g. <tt>{1,2,3,f()}</tt>.</li>
+<li>Compile string concatenations (<tt>BC_CAT</tt>).</li>
+<li>Compile <tt>__concat</tt> metamethod.</li>
+<li>Various minor optimizations.</li>
+</ul></li>
+<li>Internal Changes:
+<ul>
+<li>Add support for embedding LuaJIT bytecode for builtins.</li>
+<li>Replace various builtins with embedded bytecode.</li>
+<li>Refactor string buffers and string formatting.</li>
+<li>Remove obsolete non-truncating number to integer conversions.</li>
+</ul></li>
+<li>Ports:
+<ul>
+<li>Add Xbox One port (<tt>LJ_GC64</tt> mode).</li>
+<li>ARM64: Add port of the interpreter (<tt>LJ_GC64</tt> mode).</li>
+<li>x64: Add separate port of the interpreter to <tt>LJ_GC64</tt> mode.</li>
+<li>x86/x64: Drop internal x87 math functions. Use libm functions.</li>
+<li>x86: Remove x87 support from interpreter. SSE2 is mandatory now.</li>
+<li>x86/x64: Add support for AES-NI, AVX and AVX2 to DynASM.</li>
+<li>PPC/e500: Drop support for this architecture.</li>
+</ul></li>
+<li>FFI library:
+<ul>
+<li>FFI: Add 64 bit bitwise operations.</li>
+<li>FFI: Compile VLA/VLS and large cdata allocations with default initialization.</li>
+<li>FFI: Compile conversions from functions to function pointers.</li>
+<li>FFI: Compile lightuserdata to <tt>void *</tt> conversion.</li>
+<li>FFI: Compile <tt>ffi.gc(cdata, nil)</tt>, too.</li>
+<li>FFI: Add <tt>ffi.typeinfo()</tt>.</li>
+<li>FFI: Add <tt>ssize_t</tt> declaration.</li>
+</ul></li>
+</ul>
+</div>
+
+<div class="major" style="background: #ffffd0;">
+<h2 id="LuaJIT-2.0.4">LuaJIT 2.0.4 &mdash; 2015-05-14</h2>
+<ul>
+<li>Fix stack check in narrowing optimization.</li>
+<li>Fix Lua/C API typecheck error for special indexes.</li>
+<li>Fix string to number conversion.</li>
+<li>Fix lexer error for chunks without tokens.</li>
+<li>Don't compile <tt>IR_RETF</tt> after <tt>CALLT</tt> to ff with-side effects.</li>
+<li>Fix <tt>BC_UCLO</tt>/<tt>BC_JMP</tt> join optimization in Lua parser.</li>
+<li>Fix corner case in string to number conversion.</li>
+<li>Gracefully handle <tt>lua_error()</tt> for a suspended coroutine.</li>
+<li>Avoid error messages when building with Clang.</li>
+<li>Fix snapshot #0 handling for traces with a stack check on entry.</li>
+<li>Fix fused constant loads under high register pressure.</li>
+<li>Invalidate backpropagation cache after DCE.</li>
+<li>Fix ABC elimination.</li>
+<li>Fix debug info for main chunk of stripped bytecode.</li>
+<li>Fix FOLD rule for <tt>string.sub(s, ...) == k</tt>.</li>
+<li>Fix FOLD rule for <tt>STRREF</tt> of <tt>SNEW</tt>.</li>
+<li>Fix frame traversal while searching for error function.</li>
+<li>Prevent GC estimate miscalculation due to buffer growth.</li>
+<li>Prevent adding side traces for stack checks.</li>
+<li>Fix top slot calculation for snapshots with continuations.</li>
+<li>Fix check for reuse of SCEV results in <tt>FORL</tt>.</li>
+<li>Add PS Vita port.</li>
+<li>Fix compatibility issues with Illumos.</li>
+<li>Fix DragonFly build (unsupported).</li>
+<li>OpenBSD/x86: Better executable memory allocation for W^X mode.</li>
+<li>x86: Fix argument checks for <tt>ipairs()</tt> iterator.</li>
+<li>x86: <tt>lj_math_random_step()</tt> clobbers XMM regs on OSX Clang.</li>
+<li>x86: Fix code generation for unused result of <tt>math.random()</tt>.</li>
+<li>x64: Allow building with <tt>LUAJIT_USE_SYSMALLOC</tt> and <tt>LUAJIT_USE_VALGRIND</tt>.</li>
+<li>x86/x64: Fix argument check for bit shifts.</li>
+<li>x86/x64: Fix code generation for fused test/arith ops.</li>
+<li>ARM: Fix write barrier check in <tt>BC_USETS</tt>.</li>
+<li>PPC: Fix red zone overflow in machine code generation.</li>
+<li>PPC: Don't use <tt>mcrxr</tt> on PPE.</li>
+<li>Various archs: Fix excess stack growth in interpreter.</li>
+<li>FFI: Fix FOLD rule for <tt>TOBIT</tt> + <tt>CONV num.u32</tt>.</li>
+<li>FFI: Prevent DSE across <tt>ffi.string()</tt>.</li>
+<li>FFI: No meta fallback when indexing pointer to incomplete struct.</li>
+<li>FFI: Fix initialization of unions of subtypes.</li>
+<li>FFI: Fix cdata vs. non-cdata arithmetic and comparisons.</li>
+<li>FFI: Fix <tt>__index</tt>/<tt>__newindex</tt> metamethod resolution for ctypes.</li>
+<li>FFI: Fix compilation of reference field access.</li>
+<li>FFI: Fix frame traversal for backtraces with FFI callbacks.</li>
+<li>FFI: Fix recording of indexing a struct pointer ctype object itself.</li>
+<li>FFI: Allow non-scalar cdata to be compared for equality by address.</li>
+<li>FFI: Fix pseudo type conversions for type punning.</li>
+</ul>
+
<h2 id="LuaJIT-2.0.3">LuaJIT 2.0.3 &mdash; 2014-03-12</h2>
<ul>
<li>Add PS4 port.</li>
@@ -687,235 +793,6 @@ This matches the behavior of Lua 5.1, but not the specification.</li>
no point in listing differences over earlier versions.</li>
</ul>
</div>
-
-<div class="major" style="background: #ffff80;">
-<h2 id="LuaJIT-1.1.8">LuaJIT 1.1.8 &mdash; 2012-04-16</h2>
-<ul>
-<li>Merged with Lua 5.1.5. Also integrated fixes for all
-<a href="http://www.lua.org/bugs.html#5.1.5"><span class="ext">&raquo;</span>&nbsp;<span class="ext">&raquo;</span>&nbsp;currently known bugs in Lua 5.1.5</a>.</li>
-</ul>
-
-<h2 id="LuaJIT-1.1.7">LuaJIT 1.1.7 &mdash; 2011-05-05</h2>
-<ul>
-<li>Added fixes for the
-<a href="http://www.lua.org/bugs.html#5.1.4"><span class="ext">&raquo;</span>&nbsp;currently known bugs in Lua 5.1.4</a>.</li>
-</ul>
-
-<h2 id="LuaJIT-1.1.6">LuaJIT 1.1.6 &mdash; 2010-03-28</h2>
-<ul>
-<li>Added fixes for the
-<a href="http://www.lua.org/bugs.html#5.1.4"><span class="ext">&raquo;</span>&nbsp;currently known bugs in Lua 5.1.4</a>.</li>
-<li>Removed wrong GC check in <tt>jit_createstate()</tt>.
-Thanks to Tim Mensch.</li>
-<li>Fixed bad assertions while compiling <tt>table.insert()</tt> and
-<tt>table.remove()</tt>.</li>
-</ul>
-
-<h2 id="LuaJIT-1.1.5">LuaJIT 1.1.5 &mdash; 2008-10-25</h2>
-<ul>
-<li>Merged with Lua 5.1.4. Fixes all
-<a href="http://www.lua.org/bugs.html#5.1.3"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1.3</a>.</li>
-</ul>
-
-<h2 id="LuaJIT-1.1.4">LuaJIT 1.1.4 &mdash; 2008-02-05</h2>
-<ul>
-<li>Merged with Lua 5.1.3. Fixes all
-<a href="http://www.lua.org/bugs.html#5.1.2"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1.2</a>.</li>
-<li>Fixed possible (but unlikely) stack corruption while compiling
-<tt>k^x</tt> expressions.</li>
-<li>Fixed DynASM template for cmpss instruction.</li>
-</ul>
-
-<h2 id="LuaJIT-1.1.3">LuaJIT 1.1.3 &mdash; 2007-05-24</h2>
-<ul>
-<li>Merged with Lua 5.1.2. Fixes all
-<a href="http://www.lua.org/bugs.html#5.1.1"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1.1</a>.</li>
-<li>Merged pending Lua 5.1.x fixes: "return -nil" bug, spurious count hook call.</li>
-<li>Remove a (sometimes) wrong assertion in <tt>luaJIT_findpc()</tt>.</li>
-<li>DynASM now allows labels for displacements and <tt>.aword</tt>.</li>
-<li>Fix some compiler warnings for DynASM glue (internal API change).</li>
-<li>Correct naming for SSSE3 (temporarily known as SSE4) in DynASM and x86 disassembler.</li>
-<li>The loadable debug modules now handle redirection to stdout
-(e.g. <tt>-j&nbsp;trace=-</tt>).</li>
-</ul>
-
-<h2 id="LuaJIT-1.1.2">LuaJIT 1.1.2 &mdash; 2006-06-24</h2>
-<ul>
-<li>Fix MSVC inline assembly: use only local variables with
-<tt>lua_number2int()</tt>.</li>
-<li>Fix "attempt to call a thread value" bug on Mac OS X:
-make values of consts used as lightuserdata keys unique
-to avoid joining by the compiler/linker.</li>
-</ul>
-
-<h2 id="LuaJIT-1.1.1">LuaJIT 1.1.1 &mdash; 2006-06-20</h2>
-<ul>
-<li>Merged with Lua 5.1.1. Fixes all
-<a href="http://www.lua.org/bugs.html#5.1"><span class="ext">&raquo;</span>&nbsp;known bugs in Lua 5.1</a>.</li>
-<li>Enforce (dynamic) linker error for EXE/DLL version mismatches.</li>
-<li>Minor changes to DynASM: faster pre-processing, smaller encoding
-for some immediates.</li>
-</ul>
-<p>
-This release is in sync with Coco 1.1.1 (see the
-<a href="http://coco.luajit.org/changes.html"><span class="ext">&raquo;</span>&nbsp;Coco Change History</a>).
-</p>
-
-<h2 id="LuaJIT-1.1.0">LuaJIT 1.1.0 &mdash; 2006-03-13</h2>
-<ul>
-<li>Merged with Lua 5.1 (final).</li>
-
-<li>New JIT call frame setup:
-<ul>
-<li>The C stack is kept 16 byte aligned (faster).
-Mandatory for Mac OS X on Intel, too.</li>
-<li>Faster calling conventions for internal C helper functions.</li>
-<li>Better instruction scheduling for function prologue, OP_CALL and
-OP_RETURN.</li>
-</ul></li>
-
-<li>Miscellaneous optimizations:
-<ul>
-<li>Faster loads of FP constants. Remove narrow-to-wide store-to-load
-forwarding stalls.</li>
-<li>Use (scalar) SSE2 ops (if the CPU supports it) to speed up slot moves
-and FP to integer conversions.</li>
-<li>Optimized the two-argument form of <tt>OP_CONCAT</tt> (<tt>a..b</tt>).</li>
-<li>Inlined <tt>OP_MOD</tt> (<tt>a%b</tt>).
-With better accuracy than the C variant, too.</li>
-<li>Inlined <tt>OP_POW</tt> (<tt>a^b</tt>). Unroll <tt>x^k</tt> or
-use <tt>k^x = 2^(log2(k)*x)</tt> or call <tt>pow()</tt>.</li>
-</ul></li>
-
-<li>Changes in the optimizer:
-<ul>
-<li>Improved hinting for table keys derived from table values
-(<tt>t1[t2[x]]</tt>).</li>
-<li>Lookup hinting now works with arbitrary object types and
-supports index chains, too.</li>
-<li>Generate type hints for arithmetic and comparison operators,
-OP_LEN, OP_CONCAT and OP_FORPREP.</li>
-<li>Remove several hint definitions in favour of a generic COMBINE hint.</li>
-<li>Complete rewrite of <tt>jit.opt_inline</tt> module
-(ex <tt>jit.opt_lib</tt>).</li>
-</ul></li>
-
-<li>Use adaptive deoptimization:
-<ul>
-<li>If runtime verification of a contract fails, the affected
-instruction is recompiled and patched on-the-fly.
-Regular programs will trigger deoptimization only occasionally.</li>
-<li>This avoids generating code for uncommon fallback cases
-most of the time. Generated code is up to 30% smaller compared to
-LuaJIT&nbsp;1.0.3.</li>
-<li>Deoptimization is used for many opcodes and contracts:
-<ul>
-<li>OP_CALL, OP_TAILCALL: type mismatch for callable.</li>
-<li>Inlined calls: closure mismatch, parameter number and type mismatches.</li>
-<li>OP_GETTABLE, OP_SETTABLE: table or key type and range mismatches.</li>
-<li>All arithmetic and comparison operators, OP_LEN, OP_CONCAT,
-OP_FORPREP: operand type and range mismatches.</li>
-</ul></li>
-<li>Complete redesign of the debug and traceback info
-(bytecode &harr; mcode) to support deoptimization.
-Much more flexible and needs only 50% of the space.</li>
-<li>The modules <tt>jit.trace</tt>, <tt>jit.dumphints</tt> and
-<tt>jit.dump</tt> handle deoptimization.</li>
-</ul></li>
-
-<li>Inlined many popular library functions
-(for commonly used arguments only):
-<ul>
-<li>Most <tt>math.*</tt> functions (the 18 most used ones)
-[2x-10x faster].</li>
-<li><tt>string.len</tt>, <tt>string.sub</tt> and <tt>string.char</tt>
-[2x-10x faster].</li>
-<li><tt>table.insert</tt>, <tt>table.remove</tt> and <tt>table.getn</tt>
-[3x-5x faster].</li>
-<li><tt>coroutine.yield</tt> and <tt>coroutine.resume</tt>
-[3x-5x faster].</li>
-<li><tt>pairs</tt>, <tt>ipairs</tt> and the corresponding iterators
-[8x-15x faster].</li>
-</ul></li>
-
-<li>Changes in the core and loadable modules and the stand-alone executable:
-<ul>
-<li>Added <tt>jit.version</tt>, <tt>jit.version_num</tt>
-and <tt>jit.arch</tt>.</li>
-<li>Reorganized some internal API functions (<tt>jit.util.*mcode*</tt>).</li>
-<li>The <tt>-j dump</tt> output now shows JSUB names, too.</li>
-<li>New x86 disassembler module written in pure Lua. No dependency
-on ndisasm anymore. Flexible API, very compact (500 lines)
-and complete (x87, MMX, SSE, SSE2, SSE3, SSSE3, privileged instructions).</li>
-<li><tt>luajit -v</tt> prints the LuaJIT version and copyright
-on a separate line.</li>
-</ul></li>
-
-<li>Added SSE, SSE2, SSE3 and SSSE3 support to DynASM.</li>
-<li>Miscellaneous doc changes. Added a section about
-<a href="install.html#embedding">embedding LuaJIT</a>.</li>
-</ul>
-<p>
-This release is in sync with Coco 1.1.0 (see the
-<a href="http://coco.luajit.org/changes.html"><span class="ext">&raquo;</span>&nbsp;Coco Change History</a>).
-</p>
-</div>
-
-<div class="major" style="background: #ffffd0;">
-<h2 id="LuaJIT-1.0.3">LuaJIT 1.0.3 &mdash; 2005-09-08</h2>
-<ul>
-<li>Even more docs.</li>
-<li>Unified closure checks in <tt>jit.*</tt>.</li>
-<li>Fixed some range checks in <tt>jit.util.*</tt>.</li>
-<li>Fixed __newindex call originating from <tt>jit_settable_str()</tt>.</li>
-<li>Merged with Lua 5.1 alpha (including early bug fixes).</li>
-</ul>
-<p>
-This is the first public release of LuaJIT.
-</p>
-
-<h2 id="LuaJIT-1.0.2">LuaJIT 1.0.2 &mdash; 2005-09-02</h2>
-<ul>
-<li>Add support for flushing the Valgrind translation cache <br>
-(<tt>MYCFLAGS= -DUSE_VALGRIND</tt>).</li>
-<li>Add support for freeing executable mcode memory to the <tt>mmap()</tt>-based
-variant for POSIX systems.</li>
-<li>Reorganized the C&nbsp;function signature handling in
-<tt>jit.opt_lib</tt>.</li>
-<li>Changed to index-based hints for inlining C&nbsp;functions.
-Still no support in the backend for inlining.</li>
-<li>Hardcode <tt>HEAP_CREATE_ENABLE_EXECUTE</tt> value if undefined.</li>
-<li>Misc. changes to the <tt>jit.*</tt> modules.</li>
-<li>Misc. changes to the Makefiles.</li>
-<li>Lots of new docs.</li>
-<li>Complete doc reorg.</li>
-</ul>
-<p>
-Not released because Lua 5.1 alpha came out today.
-</p>
-
-<h2 id="LuaJIT-1.0.1">LuaJIT 1.0.1 &mdash; 2005-08-31</h2>
-<ul>
-<li>Missing GC step in <tt>OP_CONCAT</tt>.</li>
-<li>Fix result handling for C &ndash;> JIT calls.</li>
-<li>Detect CPU feature bits.</li>
-<li>Encode conditional moves (<tt>fucomip</tt>) only when supported.</li>
-<li>Add fallback instructions for FP compares.</li>
-<li>Add support for <tt>LUA_COMPAT_VARARG</tt>. Still disabled by default.</li>
-<li>MSVC needs a specific place for the <tt>CALLBACK</tt> attribute
-(David Burgess).</li>
-<li>Misc. doc updates.</li>
-</ul>
-<p>
-Interim non-public release.
-Special thanks to Adam D. Moss for reporting most of the bugs.
-</p>
-
-<h2 id="LuaJIT-1.0.0">LuaJIT 1.0.0 &mdash; 2005-08-29</h2>
-<p>
-This is the initial non-public release of LuaJIT.
-</p>
-</div>
<br class="flush">
</div>
<div id="foot">
diff --git a/doc/ext_ffi_api.html b/doc/ext_ffi_api.html
index 8cf48dc..b095c05 100644
--- a/doc/ext_ffi_api.html
+++ b/doc/ext_ffi_api.html
@@ -546,8 +546,8 @@ corresponding ctype.
The parser for Lua source code treats numeric literals with the
suffixes <tt>LL</tt> or <tt>ULL</tt> as signed or unsigned 64&nbsp;bit
integers. Case doesn't matter, but uppercase is recommended for
-readability. It handles both decimal (<tt>42LL</tt>) and hexadecimal
-(<tt>0x2aLL</tt>) literals.
+readability. It handles decimal (<tt>42LL</tt>), hexadecimal
+(<tt>0x2aLL</tt>) and binary (<tt>0b101010LL</tt>) literals.
</p>
<p>
The imaginary part of complex numbers can be specified by suffixing
diff --git a/doc/ext_ffi_semantics.html b/doc/ext_ffi_semantics.html
index 889d44d..f65fe8f 100644
--- a/doc/ext_ffi_semantics.html
+++ b/doc/ext_ffi_semantics.html
@@ -185,6 +185,8 @@ a <tt>typedef</tt>, except re-declarations will be ignored):
<tt>uint16_t</tt>, <tt>uint32_t</tt>, <tt>uint64_t</tt>,
<tt>intptr_t</tt>, <tt>uintptr_t</tt>.</li>
+<li>From <tt>&lt;unistd.h&gt;</tt> (POSIX): <tt>ssize_t</tt>.</li>
+
</ul>
<p>
You're encouraged to use these types in preference to
diff --git a/doc/extensions.html b/doc/extensions.html
index d2f8d7b..e034e1d 100644
--- a/doc/extensions.html
+++ b/doc/extensions.html
@@ -183,7 +183,7 @@ in <tt>"-inf"</tt>.
<h3 id="tonumber"><tt>tonumber()</tt> etc. use builtin string to number conversion</h3>
<p>
All string-to-number conversions consistently convert integer and
-floating-point inputs in decimal and hexadecimal on all platforms.
+floating-point inputs in decimal, hexadecimal and binary on all platforms.
<tt>strtod()</tt> is <em>not</em> used anymore, which avoids numerous
problems with poor C library implementations. The builtin conversion
function provides full precision according to the IEEE-754 standard, it
@@ -207,6 +207,11 @@ for dot releases (x.y.0 &rarr; x.y.1), but may change with major or
minor releases (2.0 &rarr; 2.1) or between any beta release. Foreign
bytecode (e.g. from Lua 5.1) is incompatible and cannot be loaded.
</p>
+<p>
+Note: <tt>LJ_GC64</tt> mode requires a different frame layout, which implies
+a different, incompatible bytecode format for ports that use this mode (e.g.
+ARM64). This may be rectified in the future.
+</p>
<h3 id="table_new"><tt>table.new(narray, nhash)</tt> allocates a pre-sized table</h3>
<p>
@@ -339,6 +344,13 @@ Lua&nbsp;5.1, which prevents implementing features that would otherwise
break the Lua/C API and ABI (e.g. <tt>_ENV</tt>).
</p>
+<h2 id="lua53">Extensions from Lua 5.3</h2>
+<p>
+LuaJIT supports some extensions from Lua&nbsp;5.3:
+<ul>
+<li>Unicode escape <tt>'\u{XX...}'</tt> embeds the UTF-8 encoding in string literals.</li>
+</ul>
+
<h2 id="exceptions">C++ Exception Interoperability</h2>
<p>
LuaJIT has built-in support for interoperating with C++&nbsp;exceptions.
diff --git a/doc/install.html b/doc/install.html
index 9fe0c08..a4cc721 100644
--- a/doc/install.html
+++ b/doc/install.html
@@ -114,30 +114,30 @@ operating systems, CPUs and compilers:
</tr>
<tr class="odd separate">
<td class="compatcpu">x86 (32 bit)</td>
-<td class="compatos">GCC 4.x<br>GCC 3.4</td>
-<td class="compatos">GCC 4.x<br>GCC 3.4</td>
-<td class="compatos">GCC 4.x<br>GCC 3.4</td>
+<td class="compatos">GCC 4.2+</td>
+<td class="compatos">GCC 4.2+</td>
+<td class="compatos">XCode 5.0+<br>Clang</td>
<td class="compatos">MSVC, MSVC/EE<br>WinSDK<br>MinGW, Cygwin</td>
</tr>
<tr class="even">
<td class="compatcpu">x64 (64 bit)</td>
-<td class="compatos">GCC 4.x</td>
+<td class="compatos">GCC 4.2+</td>
<td class="compatos">ORBIS (<a href="#ps4">PS4</a>)</td>
-<td class="compatos">GCC 4.x</td>
-<td class="compatos">MSVC + SDK v7.0<br>WinSDK v7.0</td>
+<td class="compatos">XCode 5.0+<br>Clang</td>
+<td class="compatos">MSVC + SDK v7.0<br>WinSDK v7.0<br>Durango (<a href="#xboxone">Xbox One</a>)</td>
</tr>
<tr class="odd">
<td class="compatcpu"><a href="#cross2">ARMv5+<br>ARM9E+</a></td>
<td class="compatos">GCC 4.2+</td>
<td class="compatos">GCC 4.2+<br>PSP2 (<a href="#psvita">PS VITA</a>)</td>
-<td class="compatos">GCC 4.2+</td>
+<td class="compatos">XCode 5.0+<br>Clang</td>
<td class="compatos compatno">&nbsp;</td>
</tr>
<tr class="even">
<td class="compatcpu"><a href="#cross2">ARM64</a></td>
<td class="compatos">GCC 4.8+</td>
<td class="compatos compatno">&nbsp;</td>
-<td class="compatos">Clang 3.5+</td>
+<td class="compatos">XCode 6.0+<br>Clang 3.5+</td>
<td class="compatos compatno">&nbsp;</td>
</tr>
<tr class="odd">
@@ -190,8 +190,8 @@ open a terminal window and change to this directory. Now unpack the archive
and change to the newly created directory:
</p>
<pre class="code">
-tar zxf LuaJIT-2.0.3.tar.gz
-cd LuaJIT-2.0.3</pre>
+tar zxf LuaJIT-2.0.4.tar.gz
+cd LuaJIT-2.0.4</pre>
<h3>Building LuaJIT</h3>
<p>
The supplied Makefiles try to auto-detect the settings needed for your
@@ -442,8 +442,7 @@ NDKF="--sysroot $NDK/platforms/android-$NDKABI/arch-x86"
make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF"
</pre>
<p>
-You can cross-compile for <b id="ios">iOS 3.0+</b> (iPhone/iPad) using the <a href="http://developer.apple.com/devcenter/ios/index.action"><span class="ext">&raquo;</span>&nbsp;iOS SDK</a>.
-The environment variables need to match the iOS SDK version:
+You can cross-compile for <b id="ios">iOS 3.0+</b> (iPhone/iPad) using the <a href="http://developer.apple.com/devcenter/ios/index.action"><span class="ext">&raquo;</span>&nbsp;iOS SDK</a>:
</p>
<p style="font-size: 8pt;">
Note: <b>the JIT compiler is disabled for iOS</b>, because regular iOS Apps
@@ -453,13 +452,18 @@ much slower than the JIT compiler. Please complain to Apple, not me.
Or use Android. :-p
</p>
<pre class="code">
-IXCODE=`xcode-select -print-path`
-ISDK=$IXCODE/Platforms/iPhoneOS.platform/Developer
-ISDKVER=iPhoneOS6.0.sdk
-ISDKP=$ISDK/usr/bin/
-ISDKF="-arch armv7 -isysroot $ISDK/SDKs/$ISDKVER"
-make HOST_CC="gcc -m32 -arch i386" CROSS=$ISDKP TARGET_FLAGS="$ISDKF" \
- TARGET_SYS=iOS
+# iOS/ARM (32 bit)
+ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
+ICC=$(xcrun --sdk iphoneos --find clang)
+ISDKF="-arch armv7 -isysroot $ISDKP"
+make HOST_CC="clang -m32 -arch i386" CROSS="$(dirname $ICC)/" \
+ TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
+
+# iOS/ARM64
+ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
+ICC=$(xcrun --sdk iphoneos --find clang)
+ISDKF="-arch arm64 -isysroot $ISDKP"
+make CROSS="$(dirname $ICC)/" TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
</pre>
<h3 id="consoles">Cross-compiling for consoles</h3>
@@ -516,6 +520,16 @@ the following commands:
cd src
xedkbuild
</pre>
+<p>
+To cross-compile for <b id="xboxone">Xbox One</b> from a Windows host,
+open a "Visual Studio .NET Command Prompt" (64&nbsp;bit host compiler),
+<tt>cd</tt> to the directory where you've unpacked the sources and run
+the following commands:
+</p>
+<pre class="code">
+cd src
+xb1build
+</pre>
<h2 id="embed">Embedding LuaJIT</h2>
<p>
diff --git a/doc/luajit.html b/doc/luajit.html
index 721f636..8a653e2 100644
--- a/doc/luajit.html
+++ b/doc/luajit.html
@@ -166,7 +166,7 @@ LuaJIT is Copyright &copy; 2005-2015 Mike Pall, released under the
<tr><td><span style="font-size:90%;">Embedded</span></td><td>Android</td><td>iOS</td></tr>
</table>
<table class="feature os os3">
-<tr><td>PS3</td><td>PS4</td><td>PS Vita</td><td>Xbox 360</td></tr>
+<tr><td>PS3</td><td>PS4</td><td>PS Vita</td><td>Xbox 360</td><td>Xbox One</td></tr>
</table>
<table class="feature compiler">
<tr><td>GCC</td><td>CLANG<br>LLVM</td><td>MSVC</td></tr>
diff --git a/doc/running.html b/doc/running.html
index 2b764dc..5ee67c9 100644
--- a/doc/running.html
+++ b/doc/running.html
@@ -189,7 +189,7 @@ itself. For a description of their options and output format, please
read the comment block at the start of their source.
They can be found in the <tt>lib</tt> directory of the source
distribution or installed under the <tt>jit</tt> directory. By default
-this is <tt>/usr/local/share/luajit-2.0.3/jit</tt> on POSIX
+this is <tt>/usr/local/share/luajit-2.0.4/jit</tt> on POSIX
systems.
</p>
diff --git a/doc/status.html b/doc/status.html
index cf137ee..91ed9cb 100644
--- a/doc/status.html
+++ b/doc/status.html
@@ -1,7 +1,7 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
-<title>Status &amp; Roadmap</title>
+<title>Status</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Author" content="Mike Pall">
<meta name="Copyright" content="Copyright (C) 2005-2015, Mike Pall">
@@ -17,7 +17,7 @@ ul li { padding-bottom: 0.3em; }
<a href="http://luajit.org"><span>Lua<span id="logo">JIT</span></span></a>
</div>
<div id="head">
-<h1>Status &amp; Roadmap</h1>
+<h1>Status</h1>
</div>
<div id="nav">
<ul><li>
@@ -104,15 +104,6 @@ handled correctly. The error may fall through an on-trace
garbage collector.
</li>
</ul>
-
-<h2>Roadmap</h2>
-<p>
-Please refer to the
-<a href="http://www.freelists.org/post/luajit/LuaJIT-Roadmap-20122013"><span class="ext">&raquo;</span>&nbsp;LuaJIT Roadmap 2012/2013</a> and an
-<a href="http://www.freelists.org/post/luajit/LuaJIT-Roadmap-20122013-UPDATE"><span class="ext">&raquo;</span>&nbsp;update on release planning</a> for details.
-</p>
-<p>
-</p>
<br class="flush">
</div>
<div id="foot">
diff --git a/dynasm/dasm_arm.lua b/dynasm/dasm_arm.lua
index 90a259c..6a1d1d5 100644
--- a/dynasm/dasm_arm.lua
+++ b/dynasm/dasm_arm.lua
@@ -9,9 +9,9 @@
local _info = {
arch = "arm",
description = "DynASM ARM module",
- version = "1.3.0",
- vernum = 10300,
- release = "2011-05-05",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
diff --git a/dynasm/dasm_arm64.lua b/dynasm/dasm_arm64.lua
index 9766e47..c1e3a81 100644
--- a/dynasm/dasm_arm64.lua
+++ b/dynasm/dasm_arm64.lua
@@ -9,9 +9,9 @@
local _info = {
arch = "arm",
description = "DynASM ARM64 module",
- version = "1.3.0",
- vernum = 10300,
- release = "2014-12-03",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
diff --git a/dynasm/dasm_mips.lua b/dynasm/dasm_mips.lua
index ae0dbd7..ef38343 100644
--- a/dynasm/dasm_mips.lua
+++ b/dynasm/dasm_mips.lua
@@ -9,9 +9,9 @@
local _info = {
arch = "mips",
description = "DynASM MIPS module",
- version = "1.3.0",
- vernum = 10300,
- release = "2012-01-23",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
diff --git a/dynasm/dasm_ppc.h b/dynasm/dasm_ppc.h
index 2ded258..332c64d 100644
--- a/dynasm/dasm_ppc.h
+++ b/dynasm/dasm_ppc.h
@@ -21,7 +21,7 @@ enum {
/* The following actions need a buffer position. */
DASM_ALIGN, DASM_REL_LG, DASM_LABEL_LG,
/* The following actions also have an argument. */
- DASM_REL_PC, DASM_LABEL_PC, DASM_IMM,
+ DASM_REL_PC, DASM_LABEL_PC, DASM_IMM, DASM_IMMSH,
DASM__MAX
};
@@ -244,6 +244,10 @@ void dasm_put(Dst_DECL, int start, ...)
#endif
b[pos++] = n;
break;
+ case DASM_IMMSH:
+ CK((n >> 6) == 0, RANGE_I);
+ b[pos++] = n;
+ break;
}
}
}
@@ -299,7 +303,7 @@ int dasm_link(Dst_DECL, size_t *szp)
case DASM_ALIGN: ofs -= (b[pos++] + ofs) & (ins & 255); break;
case DASM_REL_LG: case DASM_REL_PC: pos++; break;
case DASM_LABEL_LG: case DASM_LABEL_PC: b[pos++] += ofs; break;
- case DASM_IMM: pos++; break;
+ case DASM_IMM: case DASM_IMMSH: pos++; break;
}
}
stop: (void)0;
@@ -366,6 +370,9 @@ int dasm_encode(Dst_DECL, void *buffer)
case DASM_IMM:
cp[-1] |= (n & ((1<<((ins>>5)&31))-1)) << (ins&31);
break;
+ case DASM_IMMSH:
+ cp[-1] |= (ins & 1) ? ((n&31)<<11)|((n&32)>>4) : ((n&31)<<6)|(n&32);
+ break;
default: *cp++ = ins; break;
}
}
diff --git a/dynasm/dasm_ppc.lua b/dynasm/dasm_ppc.lua
index 3744707..1e9bcca 100644
--- a/dynasm/dasm_ppc.lua
+++ b/dynasm/dasm_ppc.lua
@@ -11,9 +11,9 @@
local _info = {
arch = "ppc",
description = "DynASM PPC module",
- version = "1.3.0",
- vernum = 10300,
- release = "2015-01-14",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
@@ -41,7 +41,7 @@ local wline, werror, wfatal, wwarn
local action_names = {
"STOP", "SECTION", "ESC", "REL_EXT",
"ALIGN", "REL_LG", "LABEL_LG",
- "REL_PC", "LABEL_PC", "IMM",
+ "REL_PC", "LABEL_PC", "IMM", "IMMSH"
}
-- Maximum number of section buffer positions for dasm_put().
@@ -230,8 +230,18 @@ local map_cond = {
------------------------------------------------------------------------------
+local map_op, op_template
+
+local function op_alias(opname, f) -- Build an alias handler that forwards to the template stored in map_op[opname].
+ return function(params, nparams)
+ if not params then return "-> "..opname:sub(1, -3) end -- No params: return "-> " plus opname without its last two characters.
+ f(params, nparams) -- Let the alias rewrite the operand list in place (its return value is ignored).
+ op_template(params, map_op[opname], nparams) -- Then process the rewritten operands with the target op's template.
+ end
+end
+
-- Template strings for PPC instructions.
-local map_op = {
+map_op = {
tdi_3 = "08000000ARI",
twi_3 = "0c000000ARI",
mulli_3 = "1c000000RRI",
@@ -299,6 +309,33 @@ local map_op = {
std_2 = "f8000000RD",
stdu_2 = "f8000001RD",
+ subi_3 = op_alias("addi_3", function(p) p[3] = "-("..p[3]..")" end),
+ subis_3 = op_alias("addis_3", function(p) p[3] = "-("..p[3]..")" end),
+ subic_3 = op_alias("addic_3", function(p) p[3] = "-("..p[3]..")" end),
+ ["subic._3"] = op_alias("addic._3", function(p) p[3] = "-("..p[3]..")" end),
+
+ rotlwi_3 = op_alias("rlwinm_5", function(p)
+ p[4] = "0"; p[5] = "31"
+ end),
+ rotrwi_3 = op_alias("rlwinm_5", function(p)
+ p[3] = "32-("..p[3]..")"; p[4] = "0"; p[5] = "31"
+ end),
+ rotlw_3 = op_alias("rlwnm_5", function(p)
+ p[4] = "0"; p[5] = "31"
+ end),
+ slwi_3 = op_alias("rlwinm_5", function(p)
+ p[5] = "31-("..p[3]..")"; p[4] = "0"
+ end),
+ srwi_3 = op_alias("rlwinm_5", function(p)
+ p[4] = p[3]; p[3] = "32-("..p[3]..")"; p[5] = "31"
+ end),
+ clrlwi_3 = op_alias("rlwinm_5", function(p)
+ p[4] = p[3]; p[3] = "0"; p[5] = "31"
+ end),
+ clrrwi_3 = op_alias("rlwinm_5", function(p)
+ p[5] = "31-("..p[3]..")"; p[3] = "0"; p[4] = "0"
+ end),
+
-- Primary opcode 4:
mulhhwu_3 = "10000010RRR.",
machhwu_3 = "10000018RRR.",
@@ -790,6 +827,28 @@ local map_op = {
rldcl_4 = "78000010RR~RM.",
rldcr_4 = "78000012RR~RM.",
+ rotldi_3 = op_alias("rldicl_4", function(p)
+ p[4] = "0"
+ end),
+ rotrdi_3 = op_alias("rldicl_4", function(p)
+ p[3] = "64-("..p[3]..")"; p[4] = "0"
+ end),
+ rotld_3 = op_alias("rldcl_4", function(p)
+ p[4] = "0"
+ end),
+ sldi_3 = op_alias("rldicr_4", function(p)
+ p[4] = "63-("..p[3]..")"
+ end),
+ srdi_3 = op_alias("rldicl_4", function(p)
+ p[4] = p[3]; p[3] = "64-("..p[3]..")"
+ end),
+ clrldi_3 = op_alias("rldicl_4", function(p)
+ p[4] = p[3]; p[3] = "0"
+ end),
+ clrrdi_3 = op_alias("rldicr_4", function(p)
+ p[4] = "63-("..p[3]..")"; p[3] = "0"
+ end),
+
-- Primary opcode 56:
lq_2 = "e0000000R:D", -- NYI: displacement must be divisible by 8.
@@ -1358,7 +1417,7 @@ local map_op = {
do
local t = {}
for k,v in pairs(map_op) do
- if sub(v, -1) == "." then
+ if type(v) == "string" and sub(v, -1) == "." then
local v2 = sub(v, 1, 7)..char(byte(v, 8)+1)..sub(v, 9, -2)
t[sub(k, 1, -3).."."..sub(k, -2)] = v2
end
@@ -1454,8 +1513,30 @@ local function parse_cond(expr)
werror("bad condition bit name `"..expr.."'")
end
+local parse_ctx = {}
+
+local loadenv = setfenv and function(s) -- Variant chosen when setfenv is available: compile s, then attach parse_ctx.
+ local code = loadstring(s, "")
+ if code then setfenv(code, parse_ctx) end -- Only set the environment if compilation succeeded.
+ return code
+end or function(s) -- Variant chosen when setfenv is not available.
+ return load(s, "", nil, parse_ctx) -- parse_ctx is passed as the 4th argument of load().
+end
+
+-- Parse a number or simple constant arithmetic (some basic ops are aliases); returns nil if the result is not a number.
+local function parse_number(n)
+ local x = tonumber(n)
+ if x then return x end
+ local code = loadenv("return "..n) -- Compile the operand text as an expression via loadenv (environment: parse_ctx).
+ if code then
+ local ok, y = pcall(code)
+ if ok and type(y) == "number" then return y end -- Non-numeric results (string, boolean, table) must not reach the bit ops in the callers.
+ end
+ return nil
+end
+
local function parse_imm(imm, bits, shift, scale, signed)
- local n = tonumber(imm)
+ local n = parse_number(imm)
if n then
local m = sar(n, scale)
if shl(m, scale) == n then
@@ -1479,11 +1560,11 @@ local function parse_imm(imm, bits, shift, scale, signed)
end
local function parse_shiftmask(imm, isshift)
- local n = tonumber(imm)
+ local n = parse_number(imm)
if n then
if shr(n, 6) == 0 then
- local lsb = band(imm, 31)
- local msb = imm - lsb
+ local lsb = band(n, 31)
+ local msb = n - lsb
return isshift and (shl(lsb, 11)+shr(msb, 4)) or (shl(lsb, 6)+msb)
end
werror("out of range immediate `"..imm.."'")
@@ -1491,7 +1572,8 @@ local function parse_shiftmask(imm, isshift)
match(imm, "^([%w_]+):(r[1-3]?[0-9])$") then
werror("expected immediate operand, got register")
else
- werror("NYI: parameterized 64 bit shift/mask")
+ waction("IMMSH", isshift and 1 or 0, imm)
+ return 0;
end
end
@@ -1566,7 +1648,7 @@ end
------------------------------------------------------------------------------
-- Handle opcodes defined with template strings.
-map_op[".template__"] = function(params, template, nparams)
+op_template = function(params, template, nparams)
if not params then return sub(template, 9) end
local op = tonumber(sub(template, 1, 8), 16)
local n, rs = 1, 26
@@ -1668,6 +1750,8 @@ map_op[".template__"] = function(params, template, nparams)
wputpos(pos, op)
end
+map_op[".template__"] = op_template
+
------------------------------------------------------------------------------
-- Pseudo-opcode to mark the position where the action list is to be emitted.
diff --git a/dynasm/dasm_proto.h b/dynasm/dasm_proto.h
index a8bc6fd..93ca065 100644
--- a/dynasm/dasm_proto.h
+++ b/dynasm/dasm_proto.h
@@ -10,8 +10,8 @@
#include <stddef.h>
#include <stdarg.h>
-#define DASM_IDENT "DynASM 1.3.0"
-#define DASM_VERSION 10300 /* 1.3.0 */
+#define DASM_IDENT "DynASM 1.4.0"
+#define DASM_VERSION 10400 /* 1.4.0 */
#ifndef Dst_DECL
#define Dst_DECL dasm_State **Dst
diff --git a/dynasm/dasm_x86.h b/dynasm/dasm_x86.h
index 652e8c9..175febe 100644
--- a/dynasm/dasm_x86.h
+++ b/dynasm/dasm_x86.h
@@ -391,7 +391,12 @@ int dasm_encode(Dst_DECL, void *buffer)
case DASM_IMM_D: wd: dasmd(n); break;
case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL;
case DASM_IMM_W: dasmw(n); break;
- case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; }
+ case DASM_VREG: {
+ int t = *p++;
+ if (t >= 5) n <<= 4; else if (t >= 2) n <<= 3;
+ cp[-1] ^= n;
+ break;
+ }
case DASM_REL_LG: p++; if (n >= 0) goto rel_pc;
b++; n = (int)(ptrdiff_t)D->globals[-n];
case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */
diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua
index 7ca061d..1fa80b5 100644
--- a/dynasm/dasm_x86.lua
+++ b/dynasm/dasm_x86.lua
@@ -11,9 +11,9 @@ local x64 = x64
local _info = {
arch = x64 and "x64" or "x86",
description = "DynASM x86/x64 module",
- version = "1.3.0",
- vernum = 10300,
- release = "2011-05-05",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl
local _s = string
local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
-local concat, sort = table.concat, table.sort
+local concat, sort, remove = table.concat, table.sort, table.remove
local bit = bit or require("bit")
-local band, shl, shr = bit.band, bit.lshift, bit.rshift
+local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
-- Inherited tables and callbacks.
local g_opt, g_arch
@@ -299,7 +299,7 @@ local function mkrmap(sz, cl, names)
local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
if needrex then map_reg_needrex[iname] = true end
local name
- if sz == "o" then name = format("xmm%d", i)
+ if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
elseif sz == "f" then name = format("st%d", i)
else name = format("r%d%s", i, sz == addrsize and "" or sz) end
map_archdef[name] = iname
@@ -334,21 +334,24 @@ mkrmap("f", "Rf")
-- SSE registers (oword sized, but qword and dword accessible).
mkrmap("o", "xmm")
+-- AVX registers (yword sized, but oword, qword and dword accessible).
+mkrmap("y", "ymm")
+
-- Operand size prefixes to codes.
local map_opsize = {
- byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t",
- aword = addrsize,
+ byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
+ tword = "t", aword = addrsize,
}
-- Operand size code to number.
local map_opsizenum = {
- b = 1, w = 2, d = 4, q = 8, o = 16, t = 10,
+ b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
}
-- Operand size code to name.
local map_opsizename = {
- b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword",
- f = "fpword",
+ b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
+ t = "tword", f = "fpword",
}
-- Valid index register scale factors.
@@ -460,7 +463,29 @@ local function wputszarg(sz, n)
end
-- Put multi-byte opcode with operand-size dependent modifications.
-local function wputop(sz, op, rex)
+local function wputop(sz, op, rex, vex)
+ if vex then
+ local tail
+ if vex.m == 1 and band(rex, 11) == 0 then
+ wputb(0xc5)
+ tail = shl(bxor(band(rex, 4), 4), 5)
+ else
+ wputb(0xc4)
+ wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
+ tail = shl(band(rex, 8), 4)
+ end
+ local reg, vreg = 0, nil
+ if vex.v then
+ reg = vex.v.reg
+ if not reg then werror("bad vex operand") end
+ if reg < 0 then reg = 0; vreg = vex.v.vreg end
+ end
+ if sz == "y" or vex.l then tail = tail + 4 end
+ wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
+ if vreg then waction("VREG", vreg); wputxb(4) end
+ rex = 0
+ if op >= 256 then werror("bad vex opcode") end
+ end
local r
if rex ~= 0 and not x64 then werror("bad operand size") end
if sz == "w" then wputb(102) end
@@ -881,9 +906,15 @@ end
-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand.
-- The spare 3 bits are either filled with the last hex digit or
-- the result from a previous "r"/"R". The opcode is restored.
+-- "u" Use VEX encoding, vvvv unused.
+-- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is
+-- removed from the list used by future characters).
+-- "L" Force VEX.L
--
-- All of the following characters force a flush of the opcode:
-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand.
+-- "s" stores a 4 bit immediate from the last register operand,
+-- followed by 4 zero bits.
-- "S" stores a signed 8 bit immediate from the last operand.
-- "U" stores an unsigned 8 bit immediate from the last operand.
-- "W" stores an unsigned 16 bit immediate from the last operand.
@@ -1081,10 +1112,11 @@ local map_op = {
btr_2 = "mrqdw:0FB3Rm|miqdw:0FBA6mU",
bts_2 = "mrqdw:0FABRm|miqdw:0FBA5mU",
- shld_3 = "mriqdw:0FA4RmU|mrCqdw:0FA5Rm",
- shrd_3 = "mriqdw:0FACRmU|mrCqdw:0FADRm",
+ shld_3 = "mriqdw:0FA4RmU|mrC/qq:0FA5Rm|mrC/dd:|mrC/ww:",
+ shrd_3 = "mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:",
rdtsc_0 = "0F31", -- P1+
+ rdpmc_0 = "0F33", -- P6+
cpuid_0 = "0FA2", -- P1+
-- floating point ops
@@ -1190,7 +1222,7 @@ local map_op = {
cvtsi2sd_2 = "rm/od:F20F2ArM|rm/oq:F20F2ArXM",
cvtsi2ss_2 = "rm/od:F30F2ArM|rm/oq:F30F2ArXM",
cvtss2sd_2 = "rro:F30F5ArM|rx/od:",
- cvtss2si_2 = "rr/do:F20F2CrM|rr/qo:|rxd:|rx/qd:",
+ cvtss2si_2 = "rr/do:F30F2DrM|rr/qo:|rxd:|rx/qd:",
cvttpd2dq_2 = "rmo:660FE6rM",
cvttps2dq_2 = "rmo:F30F5BrM",
cvttsd2si_2 = "rr/do:F20F2CrM|rr/qo:|rx/dq:|rxq:",
@@ -1225,46 +1257,14 @@ local map_op = {
movups_2 = "rmo:0F10rM|mro:0F11Rm",
orpd_2 = "rmo:660F56rM",
orps_2 = "rmo:0F56rM",
- packssdw_2 = "rmo:660F6BrM",
- packsswb_2 = "rmo:660F63rM",
- packuswb_2 = "rmo:660F67rM",
- paddb_2 = "rmo:660FFCrM",
- paddd_2 = "rmo:660FFErM",
- paddq_2 = "rmo:660FD4rM",
- paddsb_2 = "rmo:660FECrM",
- paddsw_2 = "rmo:660FEDrM",
- paddusb_2 = "rmo:660FDCrM",
- paddusw_2 = "rmo:660FDDrM",
- paddw_2 = "rmo:660FFDrM",
- pand_2 = "rmo:660FDBrM",
- pandn_2 = "rmo:660FDFrM",
pause_0 = "F390",
- pavgb_2 = "rmo:660FE0rM",
- pavgw_2 = "rmo:660FE3rM",
- pcmpeqb_2 = "rmo:660F74rM",
- pcmpeqd_2 = "rmo:660F76rM",
- pcmpeqw_2 = "rmo:660F75rM",
- pcmpgtb_2 = "rmo:660F64rM",
- pcmpgtd_2 = "rmo:660F66rM",
- pcmpgtw_2 = "rmo:660F65rM",
- pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nrMU", -- Mem op: SSE4.1 only.
+ pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:",
- pmaddwd_2 = "rmo:660FF5rM",
- pmaxsw_2 = "rmo:660FEErM",
- pmaxub_2 = "rmo:660FDErM",
- pminsw_2 = "rmo:660FEArM",
- pminub_2 = "rmo:660FDArM",
pmovmskb_2 = "rr/do:660FD7rM",
- pmulhuw_2 = "rmo:660FE4rM",
- pmulhw_2 = "rmo:660FE5rM",
- pmullw_2 = "rmo:660FD5rM",
- pmuludq_2 = "rmo:660FF4rM",
- por_2 = "rmo:660FEBrM",
prefetchnta_1 = "xb:n0F180m",
prefetcht0_1 = "xb:n0F181m",
prefetcht1_1 = "xb:n0F182m",
prefetcht2_1 = "xb:n0F183m",
- psadbw_2 = "rmo:660FF6rM",
pshufd_3 = "rmio:660F70rMU",
pshufhw_3 = "rmio:F30F70rMU",
pshuflw_3 = "rmio:F20F70rMU",
@@ -1278,23 +1278,6 @@ local map_op = {
psrldq_2 = "rio:660F733mU",
psrlq_2 = "rmo:660FD3rM|rio:660F732mU",
psrlw_2 = "rmo:660FD1rM|rio:660F712mU",
- psubb_2 = "rmo:660FF8rM",
- psubd_2 = "rmo:660FFArM",
- psubq_2 = "rmo:660FFBrM",
- psubsb_2 = "rmo:660FE8rM",
- psubsw_2 = "rmo:660FE9rM",
- psubusb_2 = "rmo:660FD8rM",
- psubusw_2 = "rmo:660FD9rM",
- psubw_2 = "rmo:660FF9rM",
- punpckhbw_2 = "rmo:660F68rM",
- punpckhdq_2 = "rmo:660F6ArM",
- punpckhqdq_2 = "rmo:660F6DrM",
- punpckhwd_2 = "rmo:660F69rM",
- punpcklbw_2 = "rmo:660F60rM",
- punpckldq_2 = "rmo:660F62rM",
- punpcklqdq_2 = "rmo:660F6CrM",
- punpcklwd_2 = "rmo:660F61rM",
- pxor_2 = "rmo:660FEFrM",
rcpps_2 = "rmo:0F53rM",
rcpss_2 = "rro:F30F53rM|rx/od:",
rsqrtps_2 = "rmo:0F52rM",
@@ -1352,7 +1335,7 @@ local map_op = {
dpps_3 = "rmio:660F3A40rMU",
extractps_3 = "mri/do:660F3A17RmU|rri/qo:660F3A17RXmU",
insertps_3 = "rrio:660F3A41rMU|rxi/od:",
- movntdqa_2 = "rmo:660F382ArM",
+ movntdqa_2 = "rxo:660F382ArM",
mpsadbw_3 = "rmio:660F3A42rMU",
packusdw_2 = "rmo:660F382BrM",
pblendvb_3 = "rmRo:660F3810rM",
@@ -1412,6 +1395,232 @@ local map_op = {
movntsd_2 = "xr/qo:nF20F2BRm",
movntss_2 = "xr/do:F30F2BRm",
-- popcnt is also in SSE4.2
+
+ -- AES-NI
+ aesdec_2 = "rmo:660F38DErM",
+ aesdeclast_2 = "rmo:660F38DFrM",
+ aesenc_2 = "rmo:660F38DCrM",
+ aesenclast_2 = "rmo:660F38DDrM",
+ aesimc_2 = "rmo:660F38DBrM",
+ aeskeygenassist_3 = "rmio:660F3ADFrMU",
+ pclmulqdq_3 = "rmio:660F3A44rMU",
+
+ -- AVX FP ops
+ vaddsubpd_3 = "rrmoy:660FVD0rM",
+ vaddsubps_3 = "rrmoy:F20FVD0rM",
+ vandpd_3 = "rrmoy:660FV54rM",
+ vandps_3 = "rrmoy:0FV54rM",
+ vandnpd_3 = "rrmoy:660FV55rM",
+ vandnps_3 = "rrmoy:0FV55rM",
+ vblendpd_4 = "rrmioy:660F3AV0DrMU",
+ vblendps_4 = "rrmioy:660F3AV0CrMU",
+ vblendvpd_4 = "rrmroy:660F3AV4BrMs",
+ vblendvps_4 = "rrmroy:660F3AV4ArMs",
+ vbroadcastf128_2 = "rx/yo:660F38u1ArM",
+ vcmppd_4 = "rrmioy:660FVC2rMU",
+ vcmpps_4 = "rrmioy:0FVC2rMU",
+ vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:",
+ vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:",
+ vcomisd_2 = "rro:660Fu2FrM|rx/oq:",
+ vcomiss_2 = "rro:0Fu2FrM|rx/od:",
+ vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:",
+ vcvtdq2ps_2 = "rmoy:0Fu5BrM",
+ vcvtpd2dq_2 = "rmoy:F20FuE6rM",
+ vcvtpd2ps_2 = "rmoy:660Fu5ArM",
+ vcvtps2dq_2 = "rmoy:660Fu5BrM",
+ vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:",
+ vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
+ vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:",
+ vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
+ vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
+ vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:",
+ vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
+ vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
+ vcvttps2dq_2 = "rmoy:F30Fu5BrM",
+ vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
+ vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
+ vdppd_4 = "rrmio:660F3AV41rMU",
+ vdpps_4 = "rrmioy:660F3AV40rMU",
+ vextractf128_3 = "mri/oy:660F3AuL19RmU",
+ vextractps_3 = "mri/do:660F3Au17RmU",
+ vhaddpd_3 = "rrmoy:660FV7CrM",
+ vhaddps_3 = "rrmoy:F20FV7CrM",
+ vhsubpd_3 = "rrmoy:660FV7DrM",
+ vhsubps_3 = "rrmoy:F20FV7DrM",
+ vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
+ vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:",
+ vldmxcsr_1 = "xd:0FuAE2m",
+ vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
+ vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
+ vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm",
+ vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm",
+ vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
+ vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
+ vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:",
+ vmovhlps_3 = "rrro:0FV12rM",
+ vmovhpd_2 = "xr/qo:660Fu17Rm",
+ vmovhpd_3 = "rrx/ooq:660FV16rM",
+ vmovhps_2 = "xr/qo:0Fu17Rm",
+ vmovhps_3 = "rrx/ooq:0FV16rM",
+ vmovlhps_3 = "rrro:0FV16rM",
+ vmovlpd_2 = "xr/qo:660Fu13Rm",
+ vmovlpd_3 = "rrx/ooq:660FV12rM",
+ vmovlps_2 = "xr/qo:0Fu13Rm",
+ vmovlps_3 = "rrx/ooq:0FV12rM",
+ vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM",
+ vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM",
+ vmovntpd_2 = "xroy:660Fu2BRm",
+ vmovntps_2 = "xroy:0Fu2BRm",
+ vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
+ vmovsd_3 = "rrro:F20FV10rM",
+ vmovshdup_2 = "rmoy:F30Fu16rM",
+ vmovsldup_2 = "rmoy:F30Fu12rM",
+ vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
+ vmovss_3 = "rrro:F30FV10rM",
+ vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm",
+ vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm",
+ vorpd_3 = "rrmoy:660FV56rM",
+ vorps_3 = "rrmoy:0FV56rM",
+ vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
+ vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
+ vperm2f128_4 = "rrmiy:660F3AV06rMU",
+ vptestpd_2 = "rmoy:660F38u0FrM",
+ vptestps_2 = "rmoy:660F38u0ErM",
+ vrcpps_2 = "rmoy:0Fu53rM",
+ vrcpss_3 = "rrro:F30FV53rM|rrx/ood:",
+ vrsqrtps_2 = "rmoy:0Fu52rM",
+ vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:",
+ vroundpd_3 = "rmioy:660F3Au09rMU", -- "u": vvvv unused; the 2nd operand stays available as the r/m source for "M".
+ vroundps_3 = "rmioy:660F3Au08rMU", -- Same form as vroundpd, opcode 08.
+ vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:",
+ vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:",
+ vshufpd_4 = "rrmioy:660FVC6rMU",
+ vshufps_4 = "rrmioy:0FVC6rMU",
+ vsqrtps_2 = "rmoy:0Fu51rM",
+ vsqrtss_2 = "rro:F30Fu51rM|rx/od:",
+ vsqrtpd_2 = "rmoy:660Fu51rM",
+ vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:",
+ vstmxcsr_1 = "xd:0FuAE3m",
+ vucomisd_2 = "rro:660Fu2ErM|rx/oq:",
+ vucomiss_2 = "rro:0Fu2ErM|rx/od:",
+ vunpckhpd_3 = "rrmoy:660FV15rM",
+ vunpckhps_3 = "rrmoy:0FV15rM",
+ vunpcklpd_3 = "rrmoy:660FV14rM",
+ vunpcklps_3 = "rrmoy:0FV14rM",
+ vxorpd_3 = "rrmoy:660FV57rM",
+ vxorps_3 = "rrmoy:0FV57rM",
+ vzeroall_0 = "0FuL77",
+ vzeroupper_0 = "0Fu77",
+
+ -- AVX2 FP ops
+ vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
+ vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
+ -- *vgather* (!vsib)
+ vpermpd_3 = "rmiy:660F3AuX01rMU",
+ vpermps_3 = "rrmy:660F38V16rM",
+
+ -- AVX, AVX2 integer ops
+ -- In general, xmm requires AVX, ymm requires AVX2.
+ vlddqu_2 = "rxoy:F20FuF0rM",
+ vmaskmovdqu_2 = "rro:660FuF7rM",
+ vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm",
+ vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
+ vmovntdq_2 = "xroy:660FuE7Rm",
+ vmovntdqa_2 = "rxoy:660F38u2ArM",
+ vmpsadbw_4 = "rrmioy:660F3AV42rMU",
+ vpabsb_2 = "rmoy:660F38u1CrM",
+ vpabsd_2 = "rmoy:660F38u1ErM",
+ vpabsw_2 = "rmoy:660F38u1DrM",
+ vpackusdw_3 = "rrmoy:660F38V2BrM",
+ vpalignr_4 = "rrmioy:660F3AV0FrMU",
+ vpblendvb_4 = "rrmroy:660F3AV4CrMs",
+ vpblendw_4 = "rrmioy:660F3AV0ErMU",
+ vpclmulqdq_4 = "rrmio:660F3AV44rMU",
+ vpcmpeqq_3 = "rrmoy:660F38V29rM",
+ vpcmpestri_3 = "rmio:660F3Au61rMU",
+ vpcmpestrm_3 = "rmio:660F3Au60rMU",
+ vpcmpgtq_3 = "rrmoy:660F38V37rM",
+ vpcmpistri_3 = "rmio:660F3Au63rMU",
+ vpcmpistrm_3 = "rmio:660F3Au62rMU",
+ vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
+ vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
+ vpextrd_3 = "mri/do:660F3Au16RmU",
+ vpextrq_3 = "mri/qo:660F3Au16RmU",
+ vphaddw_3 = "rrmoy:660F38V01rM",
+ vphaddd_3 = "rrmoy:660F38V02rM",
+ vphaddsw_3 = "rrmoy:660F38V03rM",
+ vphminposuw_2 = "rmo:660F38u41rM",
+ vphsubw_3 = "rrmoy:660F38V05rM",
+ vphsubd_3 = "rrmoy:660F38V06rM",
+ vphsubsw_3 = "rrmoy:660F38V07rM",
+ vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:",
+ vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:",
+ vpinsrd_4 = "rrmi/ood:660F3AV22rMU",
+ vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU",
+ vpmaddubsw_3 = "rrmoy:660F38V04rM",
+ vpmaxsb_3 = "rrmoy:660F38V3CrM",
+ vpmaxsd_3 = "rrmoy:660F38V3DrM",
+ vpmaxuw_3 = "rrmoy:660F38V3ErM",
+ vpmaxud_3 = "rrmoy:660F38V3FrM",
+ vpminsb_3 = "rrmoy:660F38V38rM",
+ vpminsd_3 = "rrmoy:660F38V39rM",
+ vpminuw_3 = "rrmoy:660F38V3ArM",
+ vpminud_3 = "rrmoy:660F38V3BrM",
+ vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM",
+ vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:",
+ vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:",
+ vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:",
+ vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:",
+ vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:",
+ vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:",
+ vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:",
+ vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:",
+ vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:",
+ vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:",
+ vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:",
+ vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:",
+ vpmuldq_3 = "rrmoy:660F38V28rM",
+ vpmulhrsw_3 = "rrmoy:660F38V0BrM",
+ vpmulld_3 = "rrmoy:660F38V40rM",
+ vpshufb_3 = "rrmoy:660F38V00rM",
+ vpshufd_3 = "rmioy:660Fu70rMU",
+ vpshufhw_3 = "rmioy:F30Fu70rMU",
+ vpshuflw_3 = "rmioy:F20Fu70rMU",
+ vpsignb_3 = "rrmoy:660F38V08rM",
+ vpsignw_3 = "rrmoy:660F38V09rM",
+ vpsignd_3 = "rrmoy:660F38V0ArM",
+ vpslldq_3 = "rrioy:660Fv737mU",
+ vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU",
+ vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU",
+ vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU",
+ vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU",
+ vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU",
+ vpsrldq_3 = "rrioy:660Fv733mU",
+ vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU",
+ vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU",
+ vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU",
+ vptest_2 = "rmoy:660F38u17rM",
+
+ -- AVX2 integer ops
+ vbroadcasti128_2 = "rx/yo:660F38u5ArM",
+ vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
+ vextracti128_3 = "mri/oy:660F3AuL39RmU",
+ vpblendd_4 = "rrmioy:660F3AV02rMU",
+ vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
+ vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
+ vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
+ vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
+ vpermd_3 = "rrmy:660F38V36rM",
+ vpermq_3 = "rmiy:660F3AuX00rMU",
+ -- *vpgather* (!vsib)
+ vperm2i128_4 = "rrmiy:660F3AV46rMU",
+ vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
+ vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
+ vpsllvd_3 = "rrmoy:660F38V47rM",
+ vpsllvq_3 = "rrmoy:660F38VX47rM",
+ vpsravd_3 = "rrmoy:660F38V46rM",
+ vpsrlvd_3 = "rrmoy:660F38V45rM",
+ vpsrlvq_3 = "rrmoy:660F38VX45rM",
}
------------------------------------------------------------------------------
@@ -1462,28 +1671,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
end
--- SSE FP arithmetic ops.
+-- SSE / AVX FP arithmetic ops.
for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
sub = 12, min = 13, div = 14, max = 15 } do
map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
+ if n ~= 1 then
+ map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n)
+ map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n)
+ map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n)
+ map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n)
+ end
+end
+
+-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
+for name,n in pairs{
+ paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
+ paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
+ packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
+ paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
+ pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
+ pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
+ pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
+ pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
+ pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
+ pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
+ psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
+ psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
+ punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
+ punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
+ punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF
+} do
+ map_op[name.."_2"] = format("rmo:660F%02XrM", n)
+ map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n)
end
------------------------------------------------------------------------------
+local map_vexarg = { u = false, v = 1, V = 2 }
+
-- Process pattern string.
local function dopattern(pat, args, sz, op, needrex)
- local digit, addin
+ local digit, addin, vex
local opcode = 0
local szov = sz
local narg = 1
local rex = 0
-- Limit number of section buffer positions used by a single dasm_put().
- -- A single opcode needs a maximum of 5 positions.
- if secpos+5 > maxsecpos then wflush() end
+ -- A single opcode needs a maximum of 6 positions.
+ if secpos+6 > maxsecpos then wflush() end
-- Process each character.
for c in gmatch(pat.."|", ".") do
@@ -1497,6 +1736,8 @@ local function dopattern(pat, args, sz, op, needrex)
szov = nil
elseif c == "X" then -- Force REX.W.
rex = 8
+ elseif c == "L" then -- Force VEX.L.
+ vex.l = true
elseif c == "r" then -- Merge 1st operand regno. into opcode.
addin = args[1]; opcode = opcode + (addin.reg % 8)
if narg < 2 then narg = 2 end
@@ -1520,21 +1761,41 @@ local function dopattern(pat, args, sz, op, needrex)
if t.xreg and t.xreg > 7 then rex = rex + 2 end
if s > 7 then rex = rex + 4 end
if needrex then rex = rex + 16 end
- wputop(szov, opcode, rex); opcode = nil
+ wputop(szov, opcode, rex, vex); opcode = nil
local imark = sub(pat, -1) -- Force a mark (ugly).
-- Put ModRM/SIB with regno/last digit as spare.
wputmrmsib(t, imark, s, addin and addin.vreg)
addin = nil
+ elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
+ local b = band(opcode, 255); opcode = shr(opcode, 8)
+ local m = 1
+ if b == 0x38 then m = 2
+ elseif b == 0x3a then m = 3 end
+ if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
+ if b ~= 0x0f then
+ werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
+ "' in pattern `"..pat.."' for `"..op.."'")
+ end
+ local v = map_vexarg[c]
+ if v then v = remove(args, v) end
+ b = band(opcode, 255)
+ local p = 0
+ if b == 0x66 then p = 1
+ elseif b == 0xf3 then p = 2
+ elseif b == 0xf2 then p = 3 end
+ if p ~= 0 then opcode = shr(opcode, 8) end
+ if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
+ vex = { m = m, p = p, v = v }
else
if opcode then -- Flush opcode.
if szov == "q" and rex == 0 then rex = rex + 8 end
if needrex then rex = rex + 16 end
if addin and addin.reg == -1 then
- wputop(szov, opcode - 7, rex)
+ wputop(szov, opcode - 7, rex, vex)
waction("VREG", addin.vreg); wputxb(0)
else
if addin and addin.reg > 7 then rex = rex + 1 end
- wputop(szov, opcode, rex)
+ wputop(szov, opcode, rex, vex)
end
opcode = nil
end
@@ -1571,6 +1832,14 @@ local function dopattern(pat, args, sz, op, needrex)
else
wputlabel("REL_", imm, 2)
end
+ elseif c == "s" then
+ local reg = a.reg
+ if reg < 0 then
+ wputb(0)
+ waction("VREG", a.vreg); wputxb(5)
+ else
+ wputb(shl(reg, 4))
+ end
else
werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
end
@@ -1647,11 +1916,14 @@ map_op[".template__"] = function(params, template, nparams)
if pat == "" then pat = lastpat else lastpat = pat end
if matchtm(tm, args) then
local prefix = sub(szm, 1, 1)
- if prefix == "/" then -- Match both operand sizes.
- if args[1].opsize == sub(szm, 2, 2) and
- args[2].opsize == sub(szm, 3, 3) then
- dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
- return
+ if prefix == "/" then -- Exactly match leading operand sizes.
+ for i = #szm,1,-1 do
+ if i == 1 then
+ dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
+ return
+ elseif args[i-1].opsize ~= sub(szm, i, i) then
+ break
+ end
end
else -- Match common operand size.
local szp = sz
diff --git a/dynasm/dynasm.lua b/dynasm/dynasm.lua
index fffda75..145fb0c 100644
--- a/dynasm/dynasm.lua
+++ b/dynasm/dynasm.lua
@@ -10,9 +10,9 @@
local _info = {
name = "DynASM",
description = "A dynamic assembler for code generation engines",
- version = "1.3.0",
- vernum = 10300,
- release = "2011-05-05",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
url = "http://luajit.org/dynasm.html",
license = "MIT",
diff --git a/etc/luajit.pc b/etc/luajit.pc
index 5ec2c35..c99057f 100644
--- a/etc/luajit.pc
+++ b/etc/luajit.pc
@@ -2,7 +2,7 @@
majver=2
minver=1
relver=0
-version=${majver}.${minver}.${relver}-alpha
+version=${majver}.${minver}.${relver}-beta1
abiver=5.1
prefix=/usr/local
diff --git a/src/Makefile b/src/Makefile
index d7539fd..9845f6a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -24,11 +24,13 @@ NODOTABIVER= 51
# removing the '#' in front of them. Make sure you force a full recompile
# with "make clean", followed by "make" if you change any options.
#
+DEFAULT_CC = gcc
+#
# LuaJIT builds as a native 32 or 64 bit binary by default.
-CC= gcc
+CC= $(DEFAULT_CC)
#
# Use this if you want to force a 32 bit build on a 64 bit multilib OS.
-#CC= gcc -m32
+#CC= $(DEFAULT_CC) -m32
#
# Since the assembler part does NOT maintain a frame pointer, it's pointless
# to slow down the C part by not omitting it. Debugging, tracebacks and
@@ -148,6 +150,29 @@ XCFLAGS=
##############################################################################
##############################################################################
+# Host system detection.
+##############################################################################
+
+ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM))
+ HOST_SYS= Windows
+ HOST_RM= del
+else
+ HOST_SYS:= $(shell uname -s)
+ ifneq (,$(findstring MINGW,$(HOST_SYS)))
+ HOST_SYS= Windows
+ HOST_MSYS= mingw
+ endif
+ ifneq (,$(findstring CYGWIN,$(HOST_SYS)))
+ HOST_SYS= Windows
+ HOST_MSYS= cygwin
+ endif
+ # Use Clang for OSX host.
+ ifeq (Darwin,$(HOST_SYS))
+ DEFAULT_CC= clang
+ endif
+endif
+
+##############################################################################
# Flags and options for host and target.
##############################################################################
@@ -221,6 +246,11 @@ ifneq (,$(findstring LJ_TARGET_ARM64 ,$(TARGET_TESTARCH)))
TARGET_LJARCH= arm64
else
ifneq (,$(findstring LJ_TARGET_PPC ,$(TARGET_TESTARCH)))
+ ifneq (,$(findstring LJ_LE 1,$(TARGET_TESTARCH)))
+ TARGET_ARCH= -DLJ_ARCH_ENDIAN=LUAJIT_LE
+ else
+ TARGET_ARCH= -DLJ_ARCH_ENDIAN=LUAJIT_BE
+ endif
TARGET_LJARCH= ppc
else
ifneq (,$(findstring LJ_TARGET_MIPS ,$(TARGET_TESTARCH)))
@@ -243,9 +273,6 @@ ifneq (,$(findstring LJ_TARGET_PS3 1,$(TARGET_TESTARCH)))
TARGET_XCFLAGS+= -DLUAJIT_USE_SYSMALLOC
TARGET_XLIBS+= -lpthread
endif
-ifneq (,$(findstring LJ_NO_UNWIND 1,$(TARGET_TESTARCH)))
- TARGET_ARCH+= -DLUAJIT_NO_UNWIND
-endif
TARGET_XCFLAGS+= $(CCOPT_$(TARGET_LJARCH))
TARGET_ARCH+= $(patsubst %,-DLUAJIT_TARGET=LUAJIT_ARCH_%,$(TARGET_LJARCH))
@@ -266,24 +293,9 @@ ifneq (,$(LMULTILIB))
endif
##############################################################################
-# System detection.
+# Target system detection.
##############################################################################
-ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM))
- HOST_SYS= Windows
- HOST_RM= del
-else
- HOST_SYS:= $(shell uname -s)
- ifneq (,$(findstring MINGW,$(HOST_SYS)))
- HOST_SYS= Windows
- HOST_MSYS= mingw
- endif
- ifneq (,$(findstring CYGWIN,$(HOST_SYS)))
- HOST_SYS= Windows
- HOST_MSYS= cygwin
- endif
-endif
-
TARGET_SYS?= $(HOST_SYS)
ifeq (Windows,$(TARGET_SYS))
TARGET_STRIP+= --strip-unneeded
@@ -400,6 +412,10 @@ ifeq (,$(findstring LJ_ABI_SOFTFP 1,$(TARGET_TESTARCH)))
else
TARGET_ARCH+= -DLJ_ABI_SOFTFP=1
endif
+ifneq (,$(findstring LJ_NO_UNWIND 1,$(TARGET_TESTARCH)))
+ DASM_AFLAGS+= -D NO_UNWIND
+ TARGET_ARCH+= -DLUAJIT_NO_UNWIND
+endif
DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subst LJ_ARCH_VERSION ,LJ_ARCH_VERSION_,$(TARGET_TESTARCH))))
ifeq (Windows,$(TARGET_SYS))
DASM_AFLAGS+= -D WIN
diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua
index ef5a1aa..38fc61f 100644
--- a/src/jit/bcsave.lua
+++ b/src/jit/bcsave.lua
@@ -63,7 +63,7 @@ local map_type = {
}
local map_arch = {
- x86 = true, x64 = true, arm = true, ppc = true,
+ x86 = true, x64 = true, arm = true, arm64 = true, ppc = true,
mips = true, mipsel = true,
}
@@ -200,7 +200,7 @@ typedef struct {
]]
local symname = LJBC_PREFIX..ctx.modname
local is64, isbe = false, false
- if ctx.arch == "x64" then
+ if ctx.arch == "x64" or ctx.arch == "arm64" then
is64 = true
elseif ctx.arch == "ppc" or ctx.arch == "mips" then
isbe = true
@@ -237,7 +237,7 @@ typedef struct {
hdr.eendian = isbe and 2 or 1
hdr.eversion = 1
hdr.type = f16(1)
- hdr.machine = f16(({ x86=3, x64=62, arm=40, ppc=20, mips=8, mipsel=8 })[ctx.arch])
+ hdr.machine = f16(({ x86=3, x64=62, arm=40, arm64=183, ppc=20, mips=8, mipsel=8 })[ctx.arch])
if ctx.arch == "mips" or ctx.arch == "mipsel" then
hdr.flags = 0x50001006
end
@@ -477,13 +477,13 @@ typedef struct {
} mach_obj_64;
typedef struct {
mach_fat_header fat;
- mach_fat_arch fat_arch[4];
+ mach_fat_arch fat_arch[2];
struct {
mach_header hdr;
mach_segment_command seg;
mach_section sec;
mach_symtab_command sym;
- } arch[4];
+ } arch[2];
mach_nlist sym_entry;
uint8_t space[4096];
} mach_fat_obj;
@@ -494,6 +494,8 @@ typedef struct {
is64, align, mobj = true, 8, "mach_obj_64"
elseif ctx.arch == "arm" then
isfat, mobj = true, "mach_fat_obj"
+ elseif ctx.arch == "arm64" then
+ is64, align, isfat, mobj = true, 8, true, "mach_fat_obj"
else
check(ctx.arch == "x86", "unsupported architecture for OSX")
end
@@ -503,8 +505,8 @@ typedef struct {
-- Create Mach-O object and fill in header.
local o = ffi.new(mobj)
local mach_size = aligned(ffi.offsetof(o, "space")+#symname+2, align)
- local cputype = ({ x86={7}, x64={0x01000007}, arm={7,12,12,12} })[ctx.arch]
- local cpusubtype = ({ x86={3}, x64={3}, arm={3,6,9,11} })[ctx.arch]
+ local cputype = ({ x86={7}, x64={0x01000007}, arm={7,12}, arm64={0x01000007,0x0100000c} })[ctx.arch]
+ local cpusubtype = ({ x86={3}, x64={3}, arm={3,9}, arm64={3,0} })[ctx.arch]
if isfat then
o.fat.magic = be32(0xcafebabe)
o.fat.nfat_arch = be32(#cpusubtype)
diff --git a/src/jit/dis_x86.lua b/src/jit/dis_x86.lua
index 6bc3806..49bbcad 100644
--- a/src/jit/dis_x86.lua
+++ b/src/jit/dis_x86.lua
@@ -15,13 +15,12 @@
-- Intel and AMD manuals. The supported instruction set is quite extensive
-- and reflects what a current generation Intel or AMD CPU implements in
-- 32 bit and 64 bit mode. Yes, this includes MMX, SSE, SSE2, SSE3, SSSE3,
--- SSE4.1, SSE4.2, SSE4a and even privileged and hypervisor (VMX/SVM)
--- instructions.
+-- SSE4.1, SSE4.2, SSE4a, AVX, AVX2 and even privileged and hypervisor
+-- (VMX/SVM) instructions.
--
-- Notes:
-- * The (useless) a16 prefix, 3DNow and pre-586 opcodes are unsupported.
-- * No attempt at optimization has been made -- it's fast enough for my needs.
--- * The public API may change when more architectures are added.
------------------------------------------------------------------------------
local type = type
@@ -78,7 +77,7 @@ local map_opc1_32 = {
"movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi",
"movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI",
--Cx
-"shift!Bmu","shift!Vmu","retBw","ret","$lesVrm","$ldsVrm","movBmi","movVmi",
+"shift!Bmu","shift!Vmu","retBw","ret","vex*3$lesVrm","vex*2$ldsVrm","movBmi","movVmi",
"enterBwu","leave","retfBw","retf","int3","intBu","into","iretVS",
--Dx
"shift!Bm1","shift!Vm1","shift!Bmc","shift!Vmc","aamBu","aadBu","salc","xlatb",
@@ -103,7 +102,7 @@ local map_opc1_64 = setmetatable({
[0x44]="rex*r", [0x45]="rex*rb", [0x46]="rex*rx", [0x47]="rex*rxb",
[0x48]="rex*w", [0x49]="rex*wb", [0x4a]="rex*wx", [0x4b]="rex*wxb",
[0x4c]="rex*wr", [0x4d]="rex*wrb", [0x4e]="rex*wrx", [0x4f]="rex*wrxb",
- [0x82]=false, [0x9a]=false, [0xc4]=false, [0xc5]=false, [0xce]=false,
+ [0x82]=false, [0x9a]=false, [0xc4]="vex*3", [0xc5]="vex*2", [0xce]=false,
[0xd4]=false, [0xd5]=false, [0xd6]=false, [0xea]=false,
}, { __index = map_opc1_32 })
@@ -114,12 +113,12 @@ local map_opc2 = {
[0]="sldt!Dmp","sgdt!Ump","larVrm","lslVrm",nil,"syscall","clts","sysret",
"invd","wbinvd",nil,"ud1",nil,"$prefetch!Bm","femms","3dnowMrmu",
--1x
-"movupsXrm|movssXrm|movupdXrm|movsdXrm",
-"movupsXmr|movssXmr|movupdXmr|movsdXmr",
+"movupsXrm|movssXrvm|movupdXrm|movsdXrvm",
+"movupsXmr|movssXmvr|movupdXmr|movsdXmvr",
"movhlpsXrm$movlpsXrm|movsldupXrm|movlpdXrm|movddupXrm",
"movlpsXmr||movlpdXmr",
-"unpcklpsXrm||unpcklpdXrm",
-"unpckhpsXrm||unpckhpdXrm",
+"unpcklpsXrvm||unpcklpdXrvm",
+"unpckhpsXrvm||unpckhpdXrvm",
"movlhpsXrm$movhpsXrm|movshdupXrm|movhpdXrm",
"movhpsXmr||movhpdXmr",
"$prefetcht!Bm","hintnopVm","hintnopVm","hintnopVm",
@@ -128,7 +127,7 @@ local map_opc2 = {
"movUmx$","movUmy$","movUxm$","movUym$","movUmz$",nil,"movUzm$",nil,
"movapsXrm||movapdXrm",
"movapsXmr||movapdXmr",
-"cvtpi2psXrMm|cvtsi2ssXrVmt|cvtpi2pdXrMm|cvtsi2sdXrVmt",
+"cvtpi2psXrMm|cvtsi2ssXrvVmt|cvtpi2pdXrMm|cvtsi2sdXrvVmt",
"movntpsXmr|movntssXmr|movntpdXmr|movntsdXmr",
"cvttps2piMrXm|cvttss2siVrXm|cvttpd2piMrXm|cvttsd2siVrXm",
"cvtps2piMrXm|cvtss2siVrXm|cvtpd2piMrXm|cvtsd2siVrXm",
@@ -144,27 +143,27 @@ local map_opc2 = {
"cmovlVrm","cmovgeVrm","cmovleVrm","cmovgVrm",
--5x
"movmskpsVrXm$||movmskpdVrXm$","sqrtpsXrm|sqrtssXrm|sqrtpdXrm|sqrtsdXrm",
-"rsqrtpsXrm|rsqrtssXrm","rcppsXrm|rcpssXrm",
-"andpsXrm||andpdXrm","andnpsXrm||andnpdXrm",
-"orpsXrm||orpdXrm","xorpsXrm||xorpdXrm",
-"addpsXrm|addssXrm|addpdXrm|addsdXrm","mulpsXrm|mulssXrm|mulpdXrm|mulsdXrm",
-"cvtps2pdXrm|cvtss2sdXrm|cvtpd2psXrm|cvtsd2ssXrm",
+"rsqrtpsXrm|rsqrtssXrvm","rcppsXrm|rcpssXrvm",
+"andpsXrvm||andpdXrvm","andnpsXrvm||andnpdXrvm",
+"orpsXrvm||orpdXrvm","xorpsXrvm||xorpdXrvm",
+"addpsXrvm|addssXrvm|addpdXrvm|addsdXrvm","mulpsXrvm|mulssXrvm|mulpdXrvm|mulsdXrvm",
+"cvtps2pdXrm|cvtss2sdXrvm|cvtpd2psXrm|cvtsd2ssXrvm",
"cvtdq2psXrm|cvttps2dqXrm|cvtps2dqXrm",
-"subpsXrm|subssXrm|subpdXrm|subsdXrm","minpsXrm|minssXrm|minpdXrm|minsdXrm",
-"divpsXrm|divssXrm|divpdXrm|divsdXrm","maxpsXrm|maxssXrm|maxpdXrm|maxsdXrm",
+"subpsXrvm|subssXrvm|subpdXrvm|subsdXrvm","minpsXrvm|minssXrvm|minpdXrvm|minsdXrvm",
+"divpsXrvm|divssXrvm|divpdXrvm|divsdXrvm","maxpsXrvm|maxssXrvm|maxpdXrvm|maxsdXrvm",
--6x
-"punpcklbwPrm","punpcklwdPrm","punpckldqPrm","packsswbPrm",
-"pcmpgtbPrm","pcmpgtwPrm","pcmpgtdPrm","packuswbPrm",
-"punpckhbwPrm","punpckhwdPrm","punpckhdqPrm","packssdwPrm",
-"||punpcklqdqXrm","||punpckhqdqXrm",
+"punpcklbwPrvm","punpcklwdPrvm","punpckldqPrvm","packsswbPrvm",
+"pcmpgtbPrvm","pcmpgtwPrvm","pcmpgtdPrvm","packuswbPrvm",
+"punpckhbwPrvm","punpckhwdPrvm","punpckhdqPrvm","packssdwPrvm",
+"||punpcklqdqXrvm","||punpckhqdqXrvm",
"movPrVSm","movqMrm|movdquXrm|movdqaXrm",
--7x
"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pmu",
"pshiftd!Pmu","pshiftq!Mmu||pshiftdq!Xmu",
-"pcmpeqbPrm","pcmpeqwPrm","pcmpeqdPrm","emms|",
+"pcmpeqbPrvm","pcmpeqwPrvm","pcmpeqdPrvm","emms*|",
"vmreadUmr||extrqXmuu$|insertqXrmuu$","vmwriteUrm||extrqXrm$|insertqXrm$",
nil,nil,
-"||haddpdXrm|haddpsXrm","||hsubpdXrm|hsubpsXrm",
+"||haddpdXrvm|haddpsXrvm","||hsubpdXrvm|hsubpsXrvm",
"movVSmMr|movqXrm|movVSmXr","movqMmr|movdquXmr|movdqaXmr",
--8x
"joVj","jnoVj","jbVj","jnbVj","jzVj","jnzVj","jbeVj","jaVj",
@@ -182,27 +181,27 @@ nil,nil,
"bsfVrm","bsrVrm|lzcntVrm|bsrWrm","movsxVrBmt","movsxVrWmt",
--Cx
"xaddBmr","xaddVmr",
-"cmppsXrmu|cmpssXrmu|cmppdXrmu|cmpsdXrmu","$movntiVmr|",
-"pinsrwPrWmu","pextrwDrPmu",
-"shufpsXrmu||shufpdXrmu","$cmpxchg!Qmp",
+"cmppsXrvmu|cmpssXrvmu|cmppdXrvmu|cmpsdXrvmu","$movntiVmr|",
+"pinsrwPrvWmu","pextrwDrPmu",
+"shufpsXrvmu||shufpdXrvmu","$cmpxchg!Qmp",
"bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR",
--Dx
-"||addsubpdXrm|addsubpsXrm","psrlwPrm","psrldPrm","psrlqPrm",
-"paddqPrm","pmullwPrm",
+"||addsubpdXrvm|addsubpsXrvm","psrlwPrvm","psrldPrvm","psrlqPrvm",
+"paddqPrvm","pmullwPrvm",
"|movq2dqXrMm|movqXmr|movdq2qMrXm$","pmovmskbVrMm||pmovmskbVrXm",
-"psubusbPrm","psubuswPrm","pminubPrm","pandPrm",
-"paddusbPrm","padduswPrm","pmaxubPrm","pandnPrm",
+"psubusbPrvm","psubuswPrvm","pminubPrvm","pandPrvm",
+"paddusbPrvm","padduswPrvm","pmaxubPrvm","pandnPrvm",
--Ex
-"pavgbPrm","psrawPrm","psradPrm","pavgwPrm",
-"pmulhuwPrm","pmulhwPrm",
+"pavgbPrvm","psrawPrvm","psradPrvm","pavgwPrvm",
+"pmulhuwPrvm","pmulhwPrvm",
"|cvtdq2pdXrm|cvttpd2dqXrm|cvtpd2dqXrm","$movntqMmr||$movntdqXmr",
-"psubsbPrm","psubswPrm","pminswPrm","porPrm",
-"paddsbPrm","paddswPrm","pmaxswPrm","pxorPrm",
+"psubsbPrvm","psubswPrvm","pminswPrvm","porPrvm",
+"paddsbPrvm","paddswPrvm","pmaxswPrvm","pxorPrvm",
--Fx
-"|||lddquXrm","psllwPrm","pslldPrm","psllqPrm",
-"pmuludqPrm","pmaddwdPrm","psadbwPrm","maskmovqMrm||maskmovdquXrm$",
-"psubbPrm","psubwPrm","psubdPrm","psubqPrm",
-"paddbPrm","paddwPrm","padddPrm","ud",
+"|||lddquXrm","psllwPrvm","pslldPrvm","psllqPrvm",
+"pmuludqPrvm","pmaddwdPrvm","psadbwPrvm","maskmovqMrm||maskmovdquXrm$",
+"psubbPrvm","psubwPrvm","psubdPrvm","psubqPrvm",
+"paddbPrvm","paddwPrvm","padddPrvm","ud",
}
assert(map_opc2[255] == "ud")
@@ -210,46 +209,62 @@ assert(map_opc2[255] == "ud")
local map_opc3 = {
["38"] = { -- [66] 0f 38 xx
--0x
-[0]="pshufbPrm","phaddwPrm","phadddPrm","phaddswPrm",
-"pmaddubswPrm","phsubwPrm","phsubdPrm","phsubswPrm",
-"psignbPrm","psignwPrm","psigndPrm","pmulhrswPrm",
-nil,nil,nil,nil,
+[0]="pshufbPrvm","phaddwPrvm","phadddPrvm","phaddswPrvm",
+"pmaddubswPrvm","phsubwPrvm","phsubdPrvm","phsubswPrvm",
+"psignbPrvm","psignwPrvm","psigndPrvm","pmulhrswPrvm",
+"||permilpsXrvm","||permilpdXrvm",nil,nil,
--1x
"||pblendvbXrma",nil,nil,nil,
-"||blendvpsXrma","||blendvpdXrma",nil,"||ptestXrm",
-nil,nil,nil,nil,
+"||blendvpsXrma","||blendvpdXrma","||permpsXrvm","||ptestXrm",
+"||broadcastssXrm","||broadcastsdXrm","||broadcastf128XrlXm",nil,
"pabsbPrm","pabswPrm","pabsdPrm",nil,
--2x
"||pmovsxbwXrm","||pmovsxbdXrm","||pmovsxbqXrm","||pmovsxwdXrm",
"||pmovsxwqXrm","||pmovsxdqXrm",nil,nil,
-"||pmuldqXrm","||pcmpeqqXrm","||$movntdqaXrm","||packusdwXrm",
-nil,nil,nil,nil,
+"||pmuldqXrvm","||pcmpeqqXrvm","||$movntdqaXrm","||packusdwXrvm",
+"||maskmovpsXrvm","||maskmovpdXrvm","||maskmovpsXmvr","||maskmovpdXmvr",
--3x
"||pmovzxbwXrm","||pmovzxbdXrm","||pmovzxbqXrm","||pmovzxwdXrm",
-"||pmovzxwqXrm","||pmovzxdqXrm",nil,"||pcmpgtqXrm",
-"||pminsbXrm","||pminsdXrm","||pminuwXrm","||pminudXrm",
-"||pmaxsbXrm","||pmaxsdXrm","||pmaxuwXrm","||pmaxudXrm",
+"||pmovzxwqXrm","||pmovzxdqXrm","||permdXrvm","||pcmpgtqXrvm",
+"||pminsbXrvm","||pminsdXrvm","||pminuwXrvm","||pminudXrvm",
+"||pmaxsbXrvm","||pmaxsdXrvm","||pmaxuwXrvm","||pmaxudXrvm",
--4x
-"||pmulddXrm","||phminposuwXrm",
+"||pmulddXrvm","||phminposuwXrm",nil,nil,
+nil,"||psrlvVSXrvm","||psravdXrvm","||psllvVSXrvm",
+--5x
+[0x58] = "||pbroadcastdXrlXm",[0x59] = "||pbroadcastqXrlXm",
+[0x5a] = "||broadcasti128XrlXm",
+--7x
+[0x78] = "||pbroadcastbXrlXm",[0x79] = "||pbroadcastwXrlXm",
+--8x
+[0x8c] = "||pmaskmovXrvVSm",
+[0x8e] = "||pmaskmovVSmXvr",
--Fx
[0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt",
},
["3a"] = { -- [66] 0f 3a xx
--0x
-[0x00]=nil,nil,nil,nil,nil,nil,nil,nil,
-"||roundpsXrmu","||roundpdXrmu","||roundssXrmu","||roundsdXrmu",
-"||blendpsXrmu","||blendpdXrmu","||pblendwXrmu","palignrPrmu",
+[0x00]="||permqXrmu","||permpdXrmu","||pblenddXrvmu",nil,
+"||permilpsXrmu","||permilpdXrmu","||perm2f128Xrvmu",nil,
+"||roundpsXrmu","||roundpdXrmu","||roundssXrvmu","||roundsdXrvmu",
+"||blendpsXrvmu","||blendpdXrvmu","||pblendwXrvmu","palignrPrvmu",
--1x
nil,nil,nil,nil,
"||pextrbVmXru","||pextrwVmXru","||pextrVmSXru","||extractpsVmXru",
-nil,nil,nil,nil,nil,nil,nil,nil,
+"||insertf128XrvlXmu","||extractf128XlXmYru",nil,nil,
+nil,nil,nil,nil,
--2x
-"||pinsrbXrVmu","||insertpsXrmu","||pinsrXrVmuS",nil,
+"||pinsrbXrvVmu","||insertpsXrvmu","||pinsrXrvVmuS",nil,
+--3x
+[0x38] = "||inserti128Xrvmu",[0x39] = "||extracti128XlXmYru",
--4x
-[0x40] = "||dppsXrmu",
-[0x41] = "||dppdXrmu",
-[0x42] = "||mpsadbwXrmu",
+[0x40] = "||dppsXrvmu",
+[0x41] = "||dppdXrvmu",
+[0x42] = "||mpsadbwXrvmu",
+[0x46] = "||perm2i128Xrvmu",
+[0x4a] = "||blendvpsXrvmb",[0x4b] = "||blendvpdXrvmb",
+[0x4c] = "||pblendvbXrvmb",
--6x
[0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu",
[0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu",
@@ -356,17 +371,19 @@ local map_regs = {
"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" }, -- No x64 ext!
X = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" },
+ Y = { "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
+ "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15" },
}
local map_segregs = { "es", "cs", "ss", "ds", "fs", "gs", "segr6", "segr7" }
-- Maps for size names.
local map_sz2n = {
- B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16,
+ B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16, Y = 32,
}
local map_sz2prefix = {
B = "byte", W = "word", D = "dword",
Q = "qword",
- M = "qword", X = "xword",
+ M = "qword", X = "xword", Y = "yword",
F = "dword", G = "qword", -- No need for sizes/register names for these two.
}
@@ -389,10 +406,13 @@ local function putop(ctx, text, operands)
if ctx.rep then text = ctx.rep.." "..text; ctx.rep = false end
if ctx.rex then
local t = (ctx.rexw and "w" or "")..(ctx.rexr and "r" or "")..
- (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "")
- if t ~= "" then text = "rex."..t.." "..text end
+ (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "")..
+ (ctx.vexl and "l" or "")
+ if ctx.vexv and ctx.vexv ~= 0 then t = t.."v"..ctx.vexv end
+ if t ~= "" then text = ctx.rex.."."..t.." "..text
+ elseif ctx.rex == "vex" then text = "v"..text end
ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
- ctx.rex = false
+ ctx.rex = false; ctx.vexl = false; ctx.vexv = false
end
if ctx.seg then
local text2, n = gsub(text, "%[", "["..ctx.seg..":")
@@ -407,6 +427,7 @@ local function putop(ctx, text, operands)
end
ctx.out(format("%08x %s%s\n", ctx.addr+ctx.start, hex, text))
ctx.mrm = false
+ ctx.vexv = false
ctx.start = pos
ctx.imm = nil
end
@@ -415,7 +436,7 @@ end
local function clearprefixes(ctx) -- Reset the pending prefix flags stored on ctx.
ctx.o16 = false; ctx.seg = false; ctx.lock = false; ctx.rep = false
ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
- ctx.rex = false; ctx.a32 = false
+ ctx.rex = false; ctx.a32 = false; ctx.vexl = false -- vexl (set by the vex handler) is reset here as well.
end
-- Fallback for incomplete opcodes at the end.
@@ -452,9 +473,9 @@ end
-- Process pattern string and generate the operands.
local function putpat(ctx, name, pat)
local operands, regs, sz, mode, sp, rm, sc, rx, sdisp
- local code, pos, stop = ctx.code, ctx.pos, ctx.stop
+ local code, pos, stop, vexl = ctx.code, ctx.pos, ctx.stop, ctx.vexl
- -- Chars used: 1DFGIMPQRSTUVWXacdfgijmoprstuwxyz
+ -- Chars used: 1DFGIMPQRSTUVWXYabcdfgijlmoprstuvwxyz
for p in gmatch(pat, ".") do
local x = nil
if p == "V" or p == "U" then
@@ -469,11 +490,13 @@ local function putpat(ctx, name, pat)
elseif p == "B" then
sz = "B"
regs = ctx.rex and map_regs.B64 or map_regs.B
- elseif match(p, "[WDQMXFG]") then
+ elseif match(p, "[WDQMXYFG]") then
sz = p
+ if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end
regs = map_regs[sz]
elseif p == "P" then
sz = ctx.o16 and "X" or "M"; ctx.o16 = false
+ if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end
regs = map_regs[sz]
elseif p == "S" then
name = name..lower(sz)
@@ -486,6 +509,10 @@ local function putpat(ctx, name, pat)
local imm = getimm(ctx, pos, 1); if not imm then return end
x = format("0x%02x", imm)
pos = pos+1
+ elseif p == "b" then
+ local imm = getimm(ctx, pos, 1); if not imm then return end
+ x = regs[imm/16+1]
+ pos = pos+1
elseif p == "w" then
local imm = getimm(ctx, pos, 2); if not imm then return end
x = format("0x%x", imm)
@@ -618,8 +645,13 @@ local function putpat(ctx, name, pat)
else
x = "CR"..sp
end
+ elseif p == "v" then
+ if ctx.vexv then
+ x = regs[ctx.vexv+1]; ctx.vexv = false
+ end
elseif p == "y" then x = "DR"..sp
elseif p == "z" then x = "TR"..sp
+ elseif p == "l" then vexl = false
elseif p == "t" then
else
error("bad pattern `"..pat.."'")
@@ -694,7 +726,7 @@ map_act = {
B = putpat, W = putpat, D = putpat, Q = putpat,
V = putpat, U = putpat, T = putpat,
M = putpat, X = putpat, P = putpat,
- F = putpat, G = putpat,
+ F = putpat, G = putpat, Y = putpat,
-- Collect prefixes.
[":"] = function(ctx, name, pat)
@@ -755,15 +787,68 @@ map_act = {
-- REX prefix.
rex = function(ctx, name, pat)
- if ctx.rex then return unknown(ctx) end -- Only 1 REX prefix allowed.
+ if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed.
for p in gmatch(pat, ".") do ctx["rex"..p] = true end -- One flag per pattern char, e.g. "wr" sets ctx.rexw and ctx.rexr.
- ctx.rex = true
+ ctx.rex = "rex" -- String tag instead of true: putop() prints ctx.rex as the prefix name.
+ end,
+
+ -- VEX prefix.
+ vex = function(ctx, name, pat) -- VEX prefix handler; pat is compared against "3" (cf. the "vex*3"/"vex*2" map entries).
+ if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed.
+ ctx.rex = "vex" -- Shares the ctx.rex slot with the rex handler, tagged by kind.
+ local pos = ctx.pos
+ if ctx.mrm then -- NOTE(review): looks like a byte was already fetched as ModRM (32 bit les/lds path); step back so it is re-read below -- confirm.
+ ctx.mrm = nil
+ pos = pos-1
+ end
+ local b = byte(ctx.code, pos, pos) -- First byte after the VEX opcode byte.
+ if not b then return incomplete(ctx) end
+ pos = pos+1
+ if b < 128 then ctx.rexr = true end -- Top bit is tested inverted: clear means rexr.
+ local m = 1 -- Map selector; stays 1 (map_opc2) unless the "3" form overrides it.
+ if pat == "3" then
+ m = b%32; b = (b-m)/32 -- Low 5 bits select the opcode map.
+ local nb = b%2; b = (b-nb)/2
+ if nb == 0 then ctx.rexb = true end -- Inverted bit: clear means rexb.
+ local nx = b%2; b = (b-nx)/2
+ if nx == 0 then ctx.rexx = true end -- Inverted bit: clear means rexx.
+ b = byte(ctx.code, pos, pos) -- Second payload byte, only present in the "3" form.
+ if not b then return incomplete(ctx) end
+ pos = pos+1
+ if b >= 128 then ctx.rexw = true end -- Top bit set means rexw.
+ end
+ ctx.pos = pos
+ local map
+ if m == 1 then map = map_opc2
+ elseif m == 2 then map = map_opc3["38"]
+ elseif m == 3 then map = map_opc3["3a"]
+ else return unknown(ctx) end -- Any other map selector is rejected.
+ local p = b%4; b = (b-p)/4 -- Low 2 bits: implied prefix (0 none, 1 o16, 2 rep, 3 repne).
+ if p == 1 then ctx.o16 = "o16"
+ elseif p == 2 then ctx.rep = "rep"
+ elseif p == 3 then ctx.rep = "repne" end
+ local l = b%2; b = (b-l)/2 -- Next bit: vector length flag.
+ if l ~= 0 then ctx.vexl = true end
+ ctx.vexv = (-1-b)%16 -- Remaining bits complemented mod 16; putpat's "v" uses this as a register index.
+ return dispatchmap(ctx, map) -- Continue decoding the opcode in the selected map.
+ end,
-- Special case for nop with REX prefix.
nop = function(ctx, name, pat)
return dispatch(ctx, ctx.rex and pat or "nop")
end,
+
+ -- Special case for 0F 77.
+ emms = function(ctx, name, pat) -- 0F 77 handler: plain emms, or zeroall/zeroupper when reached via the vex handler.
+ if ctx.rex ~= "vex" then
+ return putop(ctx, "emms")
+ elseif ctx.vexl then
+ ctx.vexl = false -- Consume the flag so putop() does not add "l" to the prefix text.
+ return putop(ctx, "zeroall") -- putop() prepends "v" for a vex-tagged op when no prefix letters remain.
+ else
+ return putop(ctx, "zeroupper")
+ end
+ end,
}
------------------------------------------------------------------------------
diff --git a/src/jit/dump.lua b/src/jit/dump.lua
index 5f85849..b1cdcfe 100644
--- a/src/jit/dump.lua
+++ b/src/jit/dump.lua
@@ -571,6 +571,7 @@ local function dump_trace(what, tr, func, pc, otr, oex)
end
if dumpmode.H then out:write("</pre>\n\n") else out:write("\n") end
else
+ if what == "flush" then symtab, nexitsym = {}, 0 end
out:write("---- TRACE ", what, "\n\n")
end
out:flush()
diff --git a/src/lib_base.c b/src/lib_base.c
index 35ccdbc..ca268b1 100644
--- a/src/lib_base.c
+++ b/src/lib_base.c
@@ -428,20 +428,20 @@ LJLIB_CF(dofile)
LJLIB_CF(gcinfo)
{
- setintV(L->top++, (G(L)->gc.total >> 10));
+ setintV(L->top++, (int32_t)(G(L)->gc.total >> 10)); /* Push gc.total / 1024, explicitly narrowed to int32_t. */
return 1; /* One result. */
}
LJLIB_CF(collectgarbage)
{
int opt = lj_lib_checkopt(L, 1, LUA_GCCOLLECT, /* ORDER LUA_GC* */
- "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul");
+ "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul\1\377\11isrunning");
int32_t data = lj_lib_optint(L, 2, 0);
if (opt == LUA_GCCOUNT) {
setnumV(L->top, (lua_Number)G(L)->gc.total/1024.0);
} else {
int res = lua_gc(L, opt, data);
- if (opt == LUA_GCSTEP)
+ if (opt == LUA_GCSTEP || opt == LUA_GCISRUNNING)
setboolV(L->top, res);
else
setintV(L->top, res);
diff --git a/src/lib_io.c b/src/lib_io.c
index 468d327..2aa8347 100644
--- a/src/lib_io.c
+++ b/src/lib_io.c
@@ -99,7 +99,7 @@ static int io_file_close(lua_State *L, IOFileUD *iof)
int stat = -1;
#if LJ_TARGET_POSIX
stat = pclose(iof->fp);
-#elif LJ_TARGET_WINDOWS
+#elif LJ_TARGET_WINDOWS && !LJ_TARGET_XBOXONE
stat = _pclose(iof->fp);
#else
lua_assert(0);
@@ -273,6 +273,15 @@ static int io_file_iter(lua_State *L)
return n;
}
+static int io_file_lines(lua_State *L) /* Shared tail of io.lines() and file:lines(): build the line iterator closure. */
+{
+ int n = (int)(L->top - L->base); /* Number of stack slots in this call; all of them become upvalues. */
+ if (n > LJ_MAX_UPVAL) /* NOTE(review): presumably the closure upvalue limit -- confirm. */
+ lj_err_caller(L, LJ_ERR_UNPACK);
+ lua_pushcclosure(L, io_file_iter, n); /* Captures the n values into the io_file_iter closure. */
+ return 1; /* One result: the iterator closure. */
+}
+
/* -- I/O file methods ---------------------------------------------------- */
#define LJLIB_MODULE_io_method
@@ -356,8 +365,7 @@ LJLIB_CF(io_method_setvbuf)
LJLIB_CF(io_method_lines)
{
io_tofile(L);
- lua_pushcclosure(L, io_file_iter, (int)(L->top - L->base));
- return 1;
+ return io_file_lines(L);
}
LJLIB_CF(io_method___gc)
@@ -400,7 +408,7 @@ LJLIB_CF(io_open)
LJLIB_CF(io_popen)
{
-#if LJ_TARGET_POSIX || LJ_TARGET_WINDOWS
+#if LJ_TARGET_POSIX || (LJ_TARGET_WINDOWS && !LJ_TARGET_XBOXONE)
const char *fname = strdata(lj_lib_checkstr(L, 1));
GCstr *s = lj_lib_optstr(L, 2);
const char *mode = s ? strdata(s) : "r";
@@ -487,8 +495,7 @@ LJLIB_CF(io_lines)
} else { /* io.lines() iterates over stdin. */
setudataV(L, L->base, IOSTDF_UD(L, GCROOT_IO_INPUT));
}
- lua_pushcclosure(L, io_file_iter, (int)(L->top - L->base));
- return 1;
+ return io_file_lines(L);
}
LJLIB_CF(io_type)
diff --git a/src/lib_os.c b/src/lib_os.c
index 7b5873a..37d7d5b 100644
--- a/src/lib_os.c
+++ b/src/lib_os.c
@@ -39,7 +39,7 @@
LJLIB_CF(os_execute)
{
-#if LJ_TARGET_CONSOLE
+#if LJ_NO_SYSTEM
#if LJ_52
errno = ENOSYS;
return luaL_fileresult(L, 0, NULL);
diff --git a/src/lib_package.c b/src/lib_package.c
index 6b6eb8f..32ba4d3 100644
--- a/src/lib_package.c
+++ b/src/lib_package.c
@@ -96,9 +96,17 @@ static void setprogdir(lua_State *L)
static void pusherror(lua_State *L)
{
DWORD error = GetLastError();
+#if LJ_TARGET_XBOXONE
+ wchar_t wbuffer[128];
+ char buffer[128*2];
+ if (FormatMessageW(FORMAT_MESSAGE_IGNORE_INSERTS | FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, error, 0, wbuffer, sizeof(wbuffer)/sizeof(wchar_t), NULL) &&
+ WideCharToMultiByte(CP_ACP, 0, wbuffer, 128, buffer, 128*2, NULL, NULL))
+#else
char buffer[128];
if (FormatMessageA(FORMAT_MESSAGE_IGNORE_INSERTS | FORMAT_MESSAGE_FROM_SYSTEM,
NULL, error, 0, buffer, sizeof(buffer), NULL))
+#endif
lua_pushstring(L, buffer);
else
lua_pushfstring(L, "system error %d\n", error);
@@ -111,7 +119,7 @@ static void ll_unloadlib(void *lib)
static void *ll_load(lua_State *L, const char *path, int gl)
{
- HINSTANCE lib = LoadLibraryA(path);
+ HINSTANCE lib = LoadLibraryExA(path, NULL, 0);
if (lib == NULL) pusherror(L);
UNUSED(gl);
return lib;
diff --git a/src/lj.supp b/src/lj.supp
index 411f261..acb9e78 100644
--- a/src/lj.supp
+++ b/src/lj.supp
@@ -24,3 +24,18 @@
Memcheck:Cond
fun:lj_str_new
}
+{
+ Optimized string compare
+ Memcheck:Addr4
+ fun:lj_str_fastcmp
+}
+{
+ Optimized string compare
+ Memcheck:Addr1
+ fun:lj_str_fastcmp
+}
+{
+ Optimized string compare
+ Memcheck:Cond
+ fun:lj_str_fastcmp
+}
diff --git a/src/lj_alloc.c b/src/lj_alloc.c
index 0aad826..ddd50ca 100644
--- a/src/lj_alloc.c
+++ b/src/lj_alloc.c
@@ -196,7 +196,7 @@ static LJ_AINLINE void *CALL_MMAP(size_t size)
return ptr;
}
-#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__)
+#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__) || defined(__CYGWIN__)
/* OSX and FreeBSD mmap() use a naive first-fit linear search.
** That's perfect for us. Except that -pagezero_size must be set for OSX,
diff --git a/src/lj_api.c b/src/lj_api.c
index 1f09284..042b0d9 100644
--- a/src/lj_api.c
+++ b/src/lj_api.c
@@ -1188,6 +1188,9 @@ LUA_API int lua_gc(lua_State *L, int what, int data)
res = (int)(g->gc.stepmul);
g->gc.stepmul = (MSize)data;
break;
+ case LUA_GCISRUNNING:
+ res = (g->gc.threshold != LJ_MAX_MEM);
+ break;
default:
res = -1; /* Invalid option. */
}
diff --git a/src/lj_arch.h b/src/lj_arch.h
index 61c7e19..c66a11c 100644
--- a/src/lj_arch.h
+++ b/src/lj_arch.h
@@ -120,6 +120,12 @@
#define LJ_TARGET_CONSOLE 1
#endif
+#ifdef _DURANGO
+#define LJ_TARGET_XBOXONE 1
+#define LJ_TARGET_CONSOLE 1
+#define LJ_TARGET_GC64 1
+#endif
+
#define LJ_NUMMODE_SINGLE 0 /* Single-number mode only. */
#define LJ_NUMMODE_SINGLE_DUAL 1 /* Default to single-number mode. */
#define LJ_NUMMODE_DUAL 2 /* Dual-number mode only. */
@@ -149,7 +155,11 @@
#define LJ_ARCH_NAME "x64"
#define LJ_ARCH_BITS 64
#define LJ_ARCH_ENDIAN LUAJIT_LE
-#define LJ_ABI_WIN LJ_TARGET_WINDOWS
+#if LJ_TARGET_WINDOWS || __CYGWIN__
+#define LJ_ABI_WIN 1
+#else
+#define LJ_ABI_WIN 0
+#endif
#define LJ_TARGET_X64 1
#define LJ_TARGET_X86ORX64 1
#define LJ_TARGET_EHRETREG 0
@@ -158,6 +168,9 @@
#define LJ_TARGET_MASKROT 1
#define LJ_TARGET_UNALIGNED 1
#define LJ_ARCH_NUMMODE LJ_NUMMODE_SINGLE_DUAL
+#ifdef LUAJIT_ENABLE_GC64
+#define LJ_TARGET_GC64 1
+#endif
#elif LUAJIT_TARGET == LUAJIT_ARCH_ARM
@@ -210,11 +223,13 @@
#elif LUAJIT_TARGET == LUAJIT_ARCH_PPC
+#ifndef LJ_ARCH_ENDIAN
#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
#define LJ_ARCH_ENDIAN LUAJIT_LE
#else
#define LJ_ARCH_ENDIAN LUAJIT_BE
#endif
+#endif
#if _LP64
#define LJ_ARCH_BITS 64
@@ -483,8 +498,11 @@
#if defined(__symbian__) || LJ_TARGET_WINDOWS
#define LUAJIT_NO_EXP2
#endif
+#if LJ_TARGET_CONSOLE || (LJ_TARGET_IOS && __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_8_0)
+#define LJ_NO_SYSTEM 1
+#endif
-#if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3
+#if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4
#define LJ_NO_UNWIND 1
#endif
diff --git a/src/lj_ccallback.c b/src/lj_ccallback.c
index 66a0944..065c329 100644
--- a/src/lj_ccallback.c
+++ b/src/lj_ccallback.c
@@ -35,7 +35,7 @@
#elif LJ_TARGET_X86ORX64
#define CALLBACK_MCODE_HEAD (LJ_64 ? 8 : 0)
-#define CALLBACK_MCODE_GROUP (-2+1+2+5+(LJ_64 ? 6 : 5))
+#define CALLBACK_MCODE_GROUP (-2+1+2+(LJ_GC64 ? 10 : 5)+(LJ_64 ? 6 : 5))
#define CALLBACK_SLOT2OFS(slot) \
(CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/32) + 4*(slot))
@@ -120,8 +120,13 @@ static void callback_mcode_init(global_State *g, uint8_t *page)
/* push ebp/rbp; mov ah, slot>>8; mov ebp, &g. */
*p++ = XI_PUSH + RID_EBP;
*p++ = XI_MOVrib | (RID_EAX+4); *p++ = (uint8_t)(slot >> 8);
+#if LJ_GC64
+ *p++ = 0x48; *p++ = XI_MOVri | RID_EBP;
+ *(uint64_t *)p = (uint64_t)(g); p += 8;
+#else
*p++ = XI_MOVri | RID_EBP;
*(int32_t *)p = i32ptr(g); p += 4;
+#endif
#if LJ_64
/* jmp [rip-pageofs] where lj_vm_ffi_callback is stored. */
*p++ = XI_GROUP5; *p++ = XM_OFS0 + (XOg_JMP<<3) + RID_EBP;
diff --git a/src/lj_cdata.c b/src/lj_cdata.c
index fccf7f1..5cd2c11 100644
--- a/src/lj_cdata.c
+++ b/src/lj_cdata.c
@@ -121,7 +121,12 @@ collect_attrib:
idx = (ptrdiff_t)intV(key);
goto integer_key;
} else if (tvisnum(key)) { /* Numeric key. */
- idx = LJ_64 ? (ptrdiff_t)numV(key) : (ptrdiff_t)lj_num2int(numV(key));
+#ifdef _MSC_VER
+ /* Workaround for MSVC bug. */
+ volatile
+#endif
+ lua_Number n = numV(key);
+ idx = LJ_64 ? (ptrdiff_t)n : (ptrdiff_t)lj_num2int(n);
integer_key:
if (ctype_ispointer(ct->info)) {
CTSize sz = lj_ctype_size(cts, ctype_cid(ct->info)); /* Element size. */
diff --git a/src/lj_clib.c b/src/lj_clib.c
index 6bdad67..1e927eb 100644
--- a/src/lj_clib.c
+++ b/src/lj_clib.c
@@ -172,9 +172,17 @@ LJ_NORET LJ_NOINLINE static void clib_error(lua_State *L, const char *fmt,
const char *name)
{
DWORD err = GetLastError();
+#if LJ_TARGET_XBOXONE
+ wchar_t wbuf[128];
+ char buf[128*2];
+ if (!FormatMessageW(FORMAT_MESSAGE_IGNORE_INSERTS|FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, err, 0, wbuf, sizeof(wbuf)/sizeof(wchar_t), NULL) ||
+ !WideCharToMultiByte(CP_ACP, 0, wbuf, 128, buf, 128*2, NULL, NULL))
+#else
char buf[128];
if (!FormatMessageA(FORMAT_MESSAGE_IGNORE_INSERTS|FORMAT_MESSAGE_FROM_SYSTEM,
NULL, err, 0, buf, sizeof(buf), NULL))
+#endif
buf[0] = '\0';
lj_err_callermsg(L, lj_strfmt_pushf(L, fmt, name, buf));
}
@@ -200,7 +208,7 @@ static const char *clib_extname(lua_State *L, const char *name)
static void *clib_loadlib(lua_State *L, const char *name, int global)
{
DWORD oldwerr = GetLastError();
- void *h = (void *)LoadLibraryA(clib_extname(L, name));
+ void *h = (void *)LoadLibraryExA(clib_extname(L, name), NULL, 0);
if (!h) clib_error(L, "cannot load module " LUA_QS ": %s", name);
SetLastError(oldwerr);
UNUSED(global);
@@ -241,9 +249,9 @@ static void *clib_getsym(CLibrary *cl, const char *name)
GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS|GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
(const char *)&_fmode, &h);
break;
- case CLIB_HANDLE_KERNEL32: h = LoadLibraryA("kernel32.dll"); break;
- case CLIB_HANDLE_USER32: h = LoadLibraryA("user32.dll"); break;
- case CLIB_HANDLE_GDI32: h = LoadLibraryA("gdi32.dll"); break;
+ case CLIB_HANDLE_KERNEL32: h = LoadLibraryExA("kernel32.dll", NULL, 0); break;
+ case CLIB_HANDLE_USER32: h = LoadLibraryExA("user32.dll", NULL, 0); break;
+ case CLIB_HANDLE_GDI32: h = LoadLibraryExA("gdi32.dll", NULL, 0); break;
}
if (!h) continue;
clib_def_handle[i] = (void *)h;
diff --git a/src/lj_cparse.c b/src/lj_cparse.c
index 1ec3230..f212bd3 100644
--- a/src/lj_cparse.c
+++ b/src/lj_cparse.c
@@ -784,6 +784,10 @@ static void cp_push_type(CPDecl *decl, CTypeID id)
cp_push(decl, info & ~CTMASK_CID, size); /* Copy type. */
break;
case CT_ARRAY:
+ if ((ct->info & (CTF_VECTOR|CTF_COMPLEX))) {
+ info |= (decl->attr & CTF_QUAL);
+ decl->attr &= ~CTF_QUAL;
+ }
cp_push_type(decl, ctype_cid(info)); /* Unroll. */
cp_push(decl, info & ~CTMASK_CID, size); /* Copy type. */
decl->stack[decl->pos].sib = 1; /* Mark as already checked and sized. */
diff --git a/src/lj_ctype.c b/src/lj_ctype.c
index 2e23c99..eda070c 100644
--- a/src/lj_ctype.c
+++ b/src/lj_ctype.c
@@ -38,6 +38,8 @@
_("uint64_t", UINT64) \
_("intptr_t", INT_PSZ) \
_("uintptr_t", UINT_PSZ) \
+ /* From POSIX. */ \
+ _("ssize_t", INT_PSZ) \
/* End of typedef list. */
/* Keywords (only the ones we actually care for). */
diff --git a/src/lj_err.c b/src/lj_err.c
index 4f13494..9ac0c98 100644
--- a/src/lj_err.c
+++ b/src/lj_err.c
@@ -58,10 +58,10 @@
** EXT cannot be enabled on WIN32 since system exceptions use code-driven SEH.
** EXT is mandatory on WIN64 since the calling convention has an abundance
** of callee-saved registers (rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15).
-** EXT is mandatory on POSIX/x64 since the interpreter doesn't save r12/r13.
+** The POSIX/x64 interpreter only saves r12/r13 for INT (e.g. PS4).
*/
-#if defined(__GNUC__) && (LJ_TARGET_X64 || defined(LUAJIT_UNWIND_EXTERNAL))
+#if defined(__GNUC__) && (LJ_TARGET_X64 || defined(LUAJIT_UNWIND_EXTERNAL)) && !LJ_NO_UNWIND
#define LJ_UNWIND_EXT 1
#elif LJ_TARGET_X64 && LJ_TARGET_WINDOWS
#define LJ_UNWIND_EXT 1
@@ -119,7 +119,7 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode)
if (errcode) {
L->base = frame_prevd(frame) + 1;
L->cframe = cframe_prev(cf);
- unwindstack(L, frame);
+ unwindstack(L, frame - LJ_FR2);
} else if (cf != stopcf) {
cf = cframe_prev(cf);
frame = frame_prevd(frame);
@@ -144,7 +144,7 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode)
if (errcode) {
L->base = frame_prevd(frame) + 1;
L->cframe = cframe_prev(cf);
- unwindstack(L, frame);
+ unwindstack(L, frame - LJ_FR2);
}
return cf;
case FRAME_CONT: /* Continuation frame. */
@@ -183,7 +183,7 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode)
/* -- External frame unwinding -------------------------------------------- */
-#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_TARGET_WINDOWS
+#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_ABI_WIN
/*
** We have to use our own definitions instead of the mandatory (!) unwind.h,
@@ -349,7 +349,7 @@ LJ_FUNCA int lj_err_unwind_arm(int state, void *ucb, _Unwind_Context *ctx)
#endif
-#elif LJ_TARGET_X64 && LJ_TARGET_WINDOWS
+#elif LJ_TARGET_X64 && LJ_ABI_WIN
/*
** Someone in Redmond owes me several days of my life. A lot of this is
@@ -414,7 +414,9 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win64(EXCEPTION_RECORD *rec,
if (cf2) { /* We catch it, so start unwinding the upper frames. */
if (rec->ExceptionCode == LJ_MSVC_EXCODE ||
rec->ExceptionCode == LJ_GCC_EXCODE) {
+#if LJ_TARGET_WINDOWS
__DestructExceptionObject(rec, 1);
+#endif
setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
} else if (!LJ_EXCODE_CHECK(rec->ExceptionCode)) {
/* Don't catch access violations etc. */
diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c
index e17f681..281f017 100644
--- a/src/lj_ffrecord.c
+++ b/src/lj_ffrecord.c
@@ -100,7 +100,6 @@ static ptrdiff_t results_wanted(jit_State *J)
static void recff_stitch(jit_State *J)
{
ASMFunction cont = lj_cont_stitch;
- TraceNo traceno = J->cur.traceno;
lua_State *L = J->L;
TValue *base = L->base;
const BCIns *pc = frame_pc(base-1);
@@ -113,7 +112,7 @@ static void recff_stitch(jit_State *J)
setframe_ftsz(base+1, ((char *)(base+1) - (char *)pframe) + FRAME_CONT);
setcont(base, cont);
setframe_pc(base, pc);
- if (LJ_DUALNUM) setintV(base-1, traceno); else base[-1].u64 = traceno;
+ setnilV(base-1); /* Incorrect, but rec_check_slots() won't run anymore. */
L->base += 2;
L->top += 2;
@@ -125,7 +124,9 @@ static void recff_stitch(jit_State *J)
trcont = lj_ir_kptr(J, (void *)cont);
#endif
J->base[0] = trcont | TREF_CONT;
- J->base[-1] = LJ_DUALNUM ? lj_ir_kint(J,traceno) : lj_ir_knum_u64(J,traceno);
+ J->ktracep = lj_ir_k64_reserve(J);
+ lua_assert(irt_toitype_(IRT_P64) == LJ_TTRACE);
+ J->base[-1] = emitir(IRT(IR_XLOAD, IRT_P64), lj_ir_kptr(J, &J->ktracep->gcr), 0);
J->base += 2;
J->baseslot += 2;
J->framedepth++;
@@ -434,11 +435,12 @@ static void LJ_FASTCALL recff_ipairs_aux(jit_State *J, RecordFFData *rd)
static void LJ_FASTCALL recff_xpairs(jit_State *J, RecordFFData *rd)
{
- if (!(LJ_52 && recff_metacall(J, rd, MM_ipairs))) {
- TRef tab = J->base[0];
- if (tref_istab(tab)) {
+ TRef tr = J->base[0];
+ if (!((LJ_52 || (LJ_HASFFI && tref_iscdata(tr))) &&
+ recff_metacall(J, rd, MM_pairs + rd->data))) {
+ if (tref_istab(tr)) {
J->base[0] = lj_ir_kfunc(J, funcV(&J->fn->c.upvalue[0]));
- J->base[1] = tab;
+ J->base[1] = tr;
J->base[2] = rd->data ? lj_ir_kint(J, 0) : TREF_NIL;
rd->nres = 3;
} /* else: Interpreter will throw. */
diff --git a/src/lj_frame.h b/src/lj_frame.h
index b9595a5..a86c36b 100644
--- a/src/lj_frame.h
+++ b/src/lj_frame.h
@@ -127,22 +127,42 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */
#elif LJ_TARGET_X64
#if LJ_ABI_WIN
#define CFRAME_OFS_PREV (13*8)
+#if LJ_GC64
+#define CFRAME_OFS_PC (12*8)
+#define CFRAME_OFS_L (11*8)
+#define CFRAME_OFS_ERRF (21*4)
+#define CFRAME_OFS_NRES (20*4)
+#define CFRAME_OFS_MULTRES (8*4)
+#else
#define CFRAME_OFS_PC (25*4)
#define CFRAME_OFS_L (24*4)
#define CFRAME_OFS_ERRF (23*4)
#define CFRAME_OFS_NRES (22*4)
#define CFRAME_OFS_MULTRES (21*4)
+#endif
#define CFRAME_SIZE (10*8)
#define CFRAME_SIZE_JIT (CFRAME_SIZE + 9*16 + 4*8)
#define CFRAME_SHIFT_MULTRES 0
#else
#define CFRAME_OFS_PREV (4*8)
+#if LJ_GC64
+#define CFRAME_OFS_PC (3*8)
+#define CFRAME_OFS_L (2*8)
+#define CFRAME_OFS_ERRF (3*4)
+#define CFRAME_OFS_NRES (2*4)
+#define CFRAME_OFS_MULTRES (0*4)
+#else
#define CFRAME_OFS_PC (7*4)
#define CFRAME_OFS_L (6*4)
#define CFRAME_OFS_ERRF (5*4)
#define CFRAME_OFS_NRES (4*4)
#define CFRAME_OFS_MULTRES (1*4)
+#endif
+#if LJ_NO_UNWIND
+#define CFRAME_SIZE (12*8)
+#else
#define CFRAME_SIZE (10*8)
+#endif
#define CFRAME_SIZE_JIT (CFRAME_SIZE + 16)
#define CFRAME_SHIFT_MULTRES 0
#endif
diff --git a/src/lj_gc.c b/src/lj_gc.c
index 99d664a..afd3997 100644
--- a/src/lj_gc.c
+++ b/src/lj_gc.c
@@ -69,7 +69,7 @@ static void gc_mark(global_State *g, GCobj *o)
gray2black(o); /* Closed upvalues are never gray. */
} else if (gct != ~LJ_TSTR && gct != ~LJ_TCDATA) {
lua_assert(gct == ~LJ_TFUNC || gct == ~LJ_TTAB ||
- gct == ~LJ_TTHREAD || gct == ~LJ_TPROTO);
+ gct == ~LJ_TTHREAD || gct == ~LJ_TPROTO || gct == ~LJ_TTRACE);
setgcrefr(o->gch.gclist, g->gc.gray);
setgcref(g->gc.gray, o);
}
diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c
index c289cd8..9b95e52 100644
--- a/src/lj_gdbjit.c
+++ b/src/lj_gdbjit.c
@@ -556,8 +556,8 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx)
DB(DW_CFA_offset|DW_REG_15); DUV(4);
DB(DW_CFA_offset|DW_REG_14); DUV(5);
/* Extra registers saved for JIT-compiled code. */
- DB(DW_CFA_offset|DW_REG_13); DUV(9);
- DB(DW_CFA_offset|DW_REG_12); DUV(10);
+ DB(DW_CFA_offset|DW_REG_13); DUV(LJ_GC64 ? 10 : 9);
+ DB(DW_CFA_offset|DW_REG_12); DUV(LJ_GC64 ? 11 : 10);
#elif LJ_TARGET_ARM
{
int i;
diff --git a/src/lj_ir.c b/src/lj_ir.c
index 9682e05..567aec8 100644
--- a/src/lj_ir.c
+++ b/src/lj_ir.c
@@ -209,24 +209,13 @@ void lj_ir_k64_freeall(jit_State *J)
lj_mem_free(J2G(J), k, sizeof(K64Array));
k = next;
}
+ setmref(J->k64, NULL);
}
-/* Find 64 bit constant in chained array or add it. */
-cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64)
+/* Get new 64 bit constant slot. */
+static TValue *ir_k64_add(jit_State *J, K64Array *kp, uint64_t u64)
{
- K64Array *k, *kp = NULL;
TValue *ntv;
- MSize idx;
- /* Search for the constant in the whole chain of arrays. */
- for (k = mref(J->k64, K64Array); k; k = mref(k->next, K64Array)) {
- kp = k; /* Remember previous element in list. */
- for (idx = 0; idx < k->numk; idx++) { /* Search one array. */
- TValue *tv = &k->k[idx];
- if (tv->u64 == u64) /* Needed for +-0/NaN/absmask. */
- return tv;
- }
- }
- /* Constant was not found, need to add it. */
if (!(kp && kp->numk < LJ_MIN_K64SZ)) { /* Allocate a new array. */
K64Array *kn = lj_mem_newt(J->L, sizeof(K64Array), K64Array);
setmref(kn->next, NULL);
@@ -242,6 +231,33 @@ cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64)
return ntv;
}
+/* Look up a 64 bit constant by its bit pattern; add it if it is missing. */
+cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64)
+{
+  K64Array *tail = NULL, *arr = mref(J->k64, K64Array);
+  /* Walk every array of the chain and scan its used slots in order. */
+  while (arr != NULL) {
+    MSize i = 0;
+    while (i < arr->numk) {
+      TValue *slot = &arr->k[i++];
+      if (slot->u64 == u64) return slot;  /* Needed for +-0/NaN/absmask. */
+    }
+    tail = arr;  /* Remember the last array visited for the add below. */
+    arr = mref(arr->next, K64Array);
+  }
+  /* No match anywhere in the chain, so add a new constant. */
+  return ir_k64_add(J, tail, u64);
+}
+
+/* Reserve a 64 bit constant slot holding 0; the final value is set later. */
+TValue *lj_ir_k64_reserve(jit_State *J)
+{
+  K64Array *last = NULL, *a;
+  (void)lj_ir_k64_find(J, 0);  /* Intern dummy 0 to protect the reserved slot. */
+  for (a = mref(J->k64, K64Array); a; a = mref(a->next, K64Array)) last = a;
+  return ir_k64_add(J, last, 0);  /* Add after the last K64Array, if any. */
+}
+
/* Intern 64 bit constant, given by its address. */
TRef lj_ir_k64(jit_State *J, IROp op, cTValue *tv)
{
diff --git a/src/lj_iropt.h b/src/lj_iropt.h
index 4e424e7..4106ef8 100644
--- a/src/lj_iropt.h
+++ b/src/lj_iropt.h
@@ -40,6 +40,7 @@ static LJ_AINLINE IRRef lj_ir_nextins(jit_State *J)
LJ_FUNC TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k);
LJ_FUNC void lj_ir_k64_freeall(jit_State *J);
LJ_FUNC TRef lj_ir_k64(jit_State *J, IROp op, cTValue *tv);
+LJ_FUNC TValue *lj_ir_k64_reserve(jit_State *J);
LJ_FUNC cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64);
LJ_FUNC TRef lj_ir_knum_u64(jit_State *J, uint64_t u64);
LJ_FUNC TRef lj_ir_knumint(jit_State *J, lua_Number n);
diff --git a/src/lj_jit.h b/src/lj_jit.h
index 4b51bae..10900bf 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -290,6 +290,16 @@ typedef struct ScEvEntry {
uint8_t dir; /* Direction. 1: +, 0: -. */
} ScEvEntry;
+/* Reverse bytecode map (IRRef -> PC). Only for selected instructions. */
+typedef struct RBCHashEntry {
+ MRef pc; /* PC of the bytecode that emitted the IR instruction. */
+ GCRef pt; /* Prototype being recorded when the entry was stored. */
+ IRRef ref; /* IR reference of that instruction; checked on lookup. */
+} RBCHashEntry;
+
+/* Number of slots in the reverse bytecode hash table. Must be a power of 2, since slots are selected by masking with RBCHASH_SLOTS-1. */
+#define RBCHASH_SLOTS 8
+
/* 128 bit SIMD constants. */
enum {
LJ_KSIMD_ABS,
@@ -364,12 +374,14 @@ typedef struct jit_State {
PostProc postproc; /* Required post-processing after execution. */
#if LJ_SOFTFP || (LJ_32 && LJ_HASFFI)
- int needsplit; /* Need SPLIT pass. */
+ uint8_t needsplit; /* Need SPLIT pass. */
#endif
+ uint8_t retryrec; /* Retry recording. */
GCRef *trace; /* Array of traces. */
TraceNo freetrace; /* Start of scan for next free trace. */
MSize sizetrace; /* Size of trace array. */
+ TValue *ktracep; /* Pointer to K64Array slot with GCtrace pointer. */
IRRef1 chain[IR__MAX]; /* IR instruction skip-list chain anchors. */
TRef slot[LJ_MAX_JSLOTS+LJ_STACK_EXTRA]; /* Stack slot map. */
@@ -382,6 +394,10 @@ typedef struct jit_State {
uint32_t penaltyslot; /* Round-robin index into penalty slots. */
uint32_t prngstate; /* PRNG state. */
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+ RBCHashEntry rbchash[RBCHASH_SLOTS]; /* Reverse bytecode map. */
+#endif
+
BPropEntry bpropcache[BPROP_SLOTS]; /* Backpropagation cache slots. */
uint32_t bpropslot; /* Round-robin index into bpropcache slots. */
diff --git a/src/lj_lex.c b/src/lj_lex.c
index 8409cd7..5a918f7 100644
--- a/src/lj_lex.c
+++ b/src/lj_lex.c
@@ -214,6 +214,33 @@ static void lex_string(LexState *ls, TValue *tv)
c += 9;
}
break;
+ case 'u': /* Unicode escape '\u{XX...}'. */
+ if (lex_next(ls) != '{') goto err_xesc;
+ lex_next(ls);
+ c = 0;
+ do {
+ c = (c << 4) | (ls->c & 15u);
+ if (!lj_char_isdigit(ls->c)) {
+ if (!lj_char_isxdigit(ls->c)) goto err_xesc;
+ c += 9;
+ }
+ if (c >= 0x110000) goto err_xesc; /* Out of Unicode range. */
+ } while (lex_next(ls) != '}');
+ if (c < 0x800) {
+ if (c < 0x80) break;
+ lex_save(ls, 0xc0 | (c >> 6));
+ } else {
+ if (c >= 0x10000) {
+ lex_save(ls, 0xf0 | (c >> 18));
+ lex_save(ls, 0x80 | ((c >> 12) & 0x3f));
+ } else {
+ if (c >= 0xd800 && c < 0xe000) goto err_xesc; /* No surrogates. */
+ lex_save(ls, 0xe0 | (c >> 12));
+ }
+ lex_save(ls, 0x80 | ((c >> 6) & 0x3f));
+ }
+ c = 0x80 | (c & 0x3f);
+ break;
case 'z': /* Skip whitespace. */
lex_next(ls);
while (lj_char_isspace(ls->c))
diff --git a/src/lj_opt_narrow.c b/src/lj_opt_narrow.c
index d221c30..d199345 100644
--- a/src/lj_opt_narrow.c
+++ b/src/lj_opt_narrow.c
@@ -205,7 +205,6 @@ typedef struct NarrowConv {
jit_State *J; /* JIT compiler state. */
NarrowIns *sp; /* Current stack pointer. */
NarrowIns *maxsp; /* Maximum stack pointer minus redzone. */
- int lim; /* Limit on the number of emitted conversions. */
IRRef mode; /* Conversion mode (IRCONV_*). */
IRType t; /* Destination type: IRT_INT or IRT_I64. */
NarrowIns stack[NARROW_MAX_STACK]; /* Stack holding stack-machine code. */
@@ -247,10 +246,16 @@ static void narrow_stripov_backprop(NarrowConv *nc, IRRef ref, int depth)
if (bp) {
ref = bp->val;
} else if (++depth < NARROW_MAX_BACKPROP && nc->sp < nc->maxsp) {
+ NarrowIns *savesp = nc->sp;
narrow_stripov_backprop(nc, ir->op1, depth);
- narrow_stripov_backprop(nc, ir->op2, depth);
- *nc->sp++ = NARROWINS(IRT(ir->o - IR_ADDOV + IR_ADD, IRT_INT), ref);
- return;
+ if (nc->sp < nc->maxsp) {
+ narrow_stripov_backprop(nc, ir->op2, depth);
+ if (nc->sp < nc->maxsp) {
+ *nc->sp++ = NARROWINS(IRT(ir->o - IR_ADDOV + IR_ADD, IRT_INT), ref);
+ return;
+ }
+ }
+ nc->sp = savesp; /* Path too deep, need to backtrack. */
}
}
*nc->sp++ = NARROWINS(NARROW_REF, ref);
@@ -263,6 +268,8 @@ static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth)
IRIns *ir = IR(ref);
IRRef cref;
+ if (nc->sp >= nc->maxsp) return 10; /* Path too deep. */
+
/* Check the easy cases first. */
if (ir->o == IR_CONV && (ir->op2 & IRCONV_SRCMASK) == IRT_INT) {
if ((nc->mode & IRCONV_CONVMASK) <= IRCONV_ANY)
@@ -334,7 +341,7 @@ static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth)
NarrowIns *savesp = nc->sp;
int count = narrow_conv_backprop(nc, ir->op1, depth);
count += narrow_conv_backprop(nc, ir->op2, depth);
- if (count <= nc->lim) { /* Limit total number of conversions. */
+ if (count <= 1) { /* Limit total number of conversions. */
*nc->sp++ = NARROWINS(IRT(ir->o, nc->t), ref);
return count;
}
@@ -406,12 +413,10 @@ TRef LJ_FASTCALL lj_opt_narrow_convert(jit_State *J)
nc.t = irt_type(fins->t);
if (fins->o == IR_TOBIT) {
nc.mode = IRCONV_TOBIT; /* Used only in the backpropagation cache. */
- nc.lim = 2; /* TOBIT can use a more optimistic rule. */
} else {
nc.mode = fins->op2;
- nc.lim = 1;
}
- if (narrow_conv_backprop(&nc, fins->op1, 0) <= nc.lim)
+ if (narrow_conv_backprop(&nc, fins->op1, 0) <= 1)
return narrow_conv_emit(J, &nc);
}
return NEXTFOLD;
diff --git a/src/lj_profile.c b/src/lj_profile.c
index 0136701..c7e5396 100644
--- a/src/lj_profile.c
+++ b/src/lj_profile.c
@@ -266,7 +266,7 @@ static void profile_timer_start(ProfileState *ps)
{
#if LJ_TARGET_WINDOWS
if (!ps->wmm) { /* Load WinMM library on-demand. */
- ps->wmm = LoadLibraryA("winmm.dll");
+ ps->wmm = LoadLibraryExA("winmm.dll", NULL, 0);
if (ps->wmm) {
ps->wmm_tbp = (WMM_TPFUNC)GetProcAddress(ps->wmm, "timeBeginPeriod");
ps->wmm_tep = (WMM_TPFUNC)GetProcAddress(ps->wmm, "timeEndPeriod");
diff --git a/src/lj_record.c b/src/lj_record.c
index 5603815..dc5f2d5 100644
--- a/src/lj_record.c
+++ b/src/lj_record.c
@@ -235,6 +235,10 @@ static void canonicalize_slots(jit_State *J)
/* Stop recording. */
void lj_record_stop(jit_State *J, TraceLink linktype, TraceNo lnk)
{
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+ if (J->retryrec)
+ lj_trace_err(J, LJ_TRERR_RETRY);
+#endif
lj_trace_end(J);
J->cur.linktype = (uint8_t)linktype;
J->cur.link = (uint16_t)lnk;
@@ -1127,6 +1131,72 @@ static void rec_mm_comp_cdata(jit_State *J, RecordIndex *ix, int op, MMS mm)
/* -- Indexed access ------------------------------------------------------ */
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+/* Bump table allocations in bytecode when they grow during recording. Sets J->retryrec whenever something was patched. */
+static void rec_idx_bump(jit_State *J, RecordIndex *ix)
+{
+ RBCHashEntry *rbc = &J->rbchash[(ix->tab & (RBCHASH_SLOTS-1))]; /* Slot picked by the low bits of the table TRef. */
+ if (tref_ref(ix->tab) == rbc->ref) { /* Only if the slot was stored for this very IR reference. */
+ const BCIns *pc = mref(rbc->pc, const BCIns); /* PC saved together with the IR reference. */
+ GCtab *tb = tabV(&ix->tabv); /* Runtime table behind ix->tab. */
+ uint32_t nhbits;
+ IRIns *ir;
+ if (!tvisnil(&ix->keyv))
+ (void)lj_tab_set(J->L, tb, &ix->keyv); /* Grow table right now. */
+ nhbits = tb->hmask > 0 ? lj_fls(tb->hmask)+1 : 0; /* Bit count derived from hmask, 0 if hmask is 0. */
+ ir = IR(tref_ref(ix->tab)); /* IR instruction the table TRef refers to. */
+ if (ir->o == IR_TNEW) {
+ uint32_t ah = bc_d(*pc); /* Packed operand: low 11 bits asize, remaining bits hbits. */
+ uint32_t asize = ah & 0x7ff, hbits = ah >> 11;
+ if (nhbits > hbits) hbits = nhbits;
+ if (tb->asize > asize) {
+ asize = tb->asize <= 0x7ff ? tb->asize : 0x7ff; /* Clamp to the 11 bit field. */
+ }
+ if ((asize | (hbits<<11)) != ah) { /* Has the size changed? */
+ /* Patch bytecode, but continue recording (for more patching). */
+ setbc_d(pc, (asize | (hbits<<11)));
+ /* Patching TNEW operands is only safe if the trace is aborted. */
+ ir->op1 = asize; ir->op2 = hbits;
+ J->retryrec = 1; /* Abort the trace at the end of recording. */
+ }
+ } else if (ir->o == IR_TDUP) {
+ GCtab *tpl = gco2tab(proto_kgc(&gcref(rbc->pt)->pt, ~(ptrdiff_t)bc_d(*pc))); /* Table constant of the saved prototype, selected by the D operand at pc. */
+ /* Grow template table, but preserve keys with nil values. */
+ if ((tb->asize > tpl->asize && (1u << nhbits)-1 == tpl->hmask) ||
+ (tb->asize == tpl->asize && (1u << nhbits)-1 > tpl->hmask)) {
+ Node *node = noderef(tpl->node);
+ uint32_t i, hmask = tpl->hmask, asize;
+ TValue *array;
+ for (i = 0; i <= hmask; i++) {
+ if (!tvisnil(&node[i].key) && tvisnil(&node[i].val))
+ settabV(J->L, &node[i].val, tpl); /* Mark nil-valued keys with the table itself; undone below. */
+ }
+ if (!tvisnil(&ix->keyv) && tref_isk(ix->key)) {
+ TValue *o = lj_tab_set(J->L, tpl, &ix->keyv);
+ if (tvisnil(o)) settabV(J->L, o, tpl); /* Same marker for the constant key being stored. */
+ }
+ lj_tab_resize(J->L, tpl, tb->asize, nhbits);
+ node = noderef(tpl->node); /* Re-read node pointer and mask after the resize. */
+ hmask = tpl->hmask;
+ for (i = 0; i <= hmask; i++) {
+ /* This is safe, since template tables only hold immutable values. */
+ if (tvistab(&node[i].val))
+ setnilV(&node[i].val);
+ }
+ /* The shape of the table may have changed. Clean up array part, too. */
+ asize = tpl->asize;
+ array = tvref(tpl->array);
+ for (i = 0; i < asize; i++) {
+ if (tvistab(&array[i]))
+ setnilV(&array[i]);
+ }
+ J->retryrec = 1; /* Abort the trace at the end of recording. */
+ }
+ }
+ }
+}
+#endif
+
/* Record bounds-check. */
static void rec_idx_abc(jit_State *J, TRef asizeref, TRef ikey, uint32_t asize)
{
@@ -1352,6 +1422,10 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix)
key = emitir(IRTN(IR_CONV), key, IRCONV_NUM_INT);
xref = emitir(IRT(IR_NEWREF, IRT_P32), ix->tab, key);
keybarrier = 0; /* NEWREF already takes care of the key barrier. */
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+ if ((J->flags & JIT_F_OPT_SINK)) /* Avoid a separate flag. */
+ rec_idx_bump(J, ix);
+#endif
}
} else if (!lj_opt_fwd_wasnonnil(J, loadop, tref_ref(xref))) {
/* Cannot derive that the previous value was non-nil, must do checks. */
@@ -1390,9 +1464,18 @@ static void rec_tsetm(jit_State *J, BCReg ra, BCReg rn, int32_t i)
{
RecordIndex ix;
cTValue *basev = J->L->base;
- copyTV(J->L, &ix.tabv, &basev[ra-1]);
+ GCtab *t = tabV(&basev[ra-1]);
+ settabV(J->L, &ix.tabv, t);
ix.tab = getslot(J, ra-1);
ix.idxchain = 0;
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+ if ((J->flags & JIT_F_OPT_SINK)) {
+ if (t->asize < i+rn-ra)
+ lj_tab_reasize(J->L, t, i+rn-ra);
+ setnilV(&ix.keyv);
+ rec_idx_bump(J, &ix);
+ }
+#endif
for (; ra < rn; i++, ra++) {
setintV(&ix.keyv, i);
ix.key = lj_ir_kint(J, i);
@@ -1712,8 +1795,15 @@ static TRef rec_tnew(jit_State *J, uint32_t ah)
{
uint32_t asize = ah & 0x7ff;
uint32_t hbits = ah >> 11;
+ TRef tr;
if (asize == 0x7ff) asize = 0x801;
- return emitir(IRTG(IR_TNEW, IRT_TAB), asize, hbits);
+ tr = emitir(IRTG(IR_TNEW, IRT_TAB), asize, hbits);
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+ J->rbchash[(tr & (RBCHASH_SLOTS-1))].ref = tref_ref(tr);
+ setmref(J->rbchash[(tr & (RBCHASH_SLOTS-1))].pc, J->pc);
+ setgcref(J->rbchash[(tr & (RBCHASH_SLOTS-1))].pt, obj2gco(J->pt));
+#endif
+ return tr;
}
/* -- Concatenation ------------------------------------------------------- */
@@ -2139,6 +2229,11 @@ void lj_record_ins(jit_State *J)
case BC_TDUP:
rc = emitir(IRTG(IR_TDUP, IRT_TAB),
lj_ir_ktab(J, gco2tab(proto_kgc(J->pt, ~(ptrdiff_t)rc))), 0);
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+ J->rbchash[(rc & (RBCHASH_SLOTS-1))].ref = tref_ref(rc);
+ setmref(J->rbchash[(rc & (RBCHASH_SLOTS-1))].pc, pc);
+ setgcref(J->rbchash[(rc & (RBCHASH_SLOTS-1))].pt, obj2gco(J->pt));
+#endif
break;
/* -- Calls and vararg handling ----------------------------------------- */
@@ -2352,6 +2447,9 @@ void lj_record_setup(jit_State *J)
/* Initialize state related to current trace. */
memset(J->slot, 0, sizeof(J->slot));
memset(J->chain, 0, sizeof(J->chain));
+#ifdef LUAJIT_ENABLE_TABLE_BUMP
+ memset(J->rbchash, 0, sizeof(J->rbchash));
+#endif
memset(J->bpropcache, 0, sizeof(J->bpropcache));
J->scev.idx = REF_NIL;
setmref(J->scev.pc, NULL);
diff --git a/src/lj_snap.c b/src/lj_snap.c
index d8e7987..7c78f8a 100644
--- a/src/lj_snap.c
+++ b/src/lj_snap.c
@@ -26,9 +26,6 @@
#include "lj_cdata.h"
#endif
-/* Some local macros to save typing. Undef'd at the end. */
-#define IR(ref) (&J->cur.ir[(ref)])
-
/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
@@ -73,7 +70,7 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots)
IRRef ref = tref_ref(tr);
if (ref) {
SnapEntry sn = SNAP_TR(s, tr);
- IRIns *ir = IR(ref);
+ IRIns *ir = &J->cur.ir[ref];
if (!(sn & (SNAP_CONT|SNAP_FRAME)) &&
ir->o == IR_SLOAD && ir->op1 == s && ref > retf) {
/* No need to snapshot unmodified non-inherited slots. */
@@ -407,24 +404,24 @@ static TRef snap_pref(jit_State *J, GCtrace *T, SnapEntry *map, MSize nmax,
}
/* Check whether a sunk store corresponds to an allocation. Slow path. */
-static int snap_sunk_store2(jit_State *J, IRIns *ira, IRIns *irs)
+static int snap_sunk_store2(GCtrace *T, IRIns *ira, IRIns *irs)
{
if (irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
irs->o == IR_FSTORE || irs->o == IR_XSTORE) {
- IRIns *irk = IR(irs->op1);
+ IRIns *irk = &T->ir[irs->op1];
if (irk->o == IR_AREF || irk->o == IR_HREFK)
- irk = IR(irk->op1);
- return (IR(irk->op1) == ira);
+ irk = &T->ir[irk->op1];
+ return (&T->ir[irk->op1] == ira);
}
return 0;
}
/* Check whether a sunk store corresponds to an allocation. Fast path. */
-static LJ_AINLINE int snap_sunk_store(jit_State *J, IRIns *ira, IRIns *irs)
+static LJ_AINLINE int snap_sunk_store(GCtrace *T, IRIns *ira, IRIns *irs)
{
if (irs->s != 255)
return (ira + irs->s == irs); /* Fast check. */
- return snap_sunk_store2(J, ira, irs);
+ return snap_sunk_store2(T, ira, irs);
}
/* Replay snapshot state to setup side trace. */
@@ -487,7 +484,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
} else {
IRIns *irs;
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
if (snap_pref(J, T, map, nent, seen, irs->op2) == 0)
snap_pref(J, T, map, nent, seen, T->ir[irs->op2].op1);
else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) &&
@@ -521,13 +518,13 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
op2 = emitir_raw(IRT(IR_HIOP, IRT_I64), op2,
snap_pref(J, T, map, nent, seen, (ir+1)->op2));
}
- J->slot[snap_slot(sn)] = emitir(ir->ot, op1, op2);
+ J->slot[snap_slot(sn)] = emitir(ir->ot & ~(IRT_MARK|IRT_ISPHI), op1, op2);
} else {
IRIns *irs;
TRef tr = emitir(ir->ot, op1, op2);
J->slot[snap_slot(sn)] = tr;
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
IRIns *irr = &T->ir[irs->op1];
TRef val, key = irr->op2, tmp = tr;
if (irr->o != IR_FREF) {
@@ -631,8 +628,8 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex,
} else if (irt_isnum(t)) {
setnumV(o, ex->fpr[r-RID_MIN_FPR]);
#endif
- } else if (LJ_64 && irt_islightud(t)) {
- /* 64 bit lightuserdata which may escape already has the tag bits. */
+ } else if (LJ_64 && irt_is64(t)) {
+ /* 64 bit values that already have the tag bits. */
o->u64 = ex->gpr[r-RID_MIN_GPR];
} else if (irt_ispri(t)) {
setpriV(o, irt_toitype(t));
@@ -729,7 +726,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
} else {
IRIns *irs, *irlast = &T->ir[T->snap[snapno].ref];
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
IRIns *iro = &T->ir[T->ir[irs->op1].op2];
uint8_t *p = (uint8_t *)cd;
CTSize szs;
@@ -762,7 +759,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
settabV(J->L, o, t);
irlast = &T->ir[T->snap[snapno].ref];
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
IRIns *irk = &T->ir[irs->op1];
TValue tmp, *val;
lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
@@ -863,7 +860,6 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
return pc;
}
-#undef IR
#undef emitir_raw
#undef emitir
diff --git a/src/lj_strscan.c b/src/lj_strscan.c
index 568f647..d3c5ba9 100644
--- a/src/lj_strscan.c
+++ b/src/lj_strscan.c
@@ -140,7 +140,7 @@ static StrScanFmt strscan_hex(const uint8_t *p, TValue *o,
break;
}
- /* Reduce range then convert to double. */
+ /* Reduce range, then convert to double. */
if ((x & U64x(c0000000,0000000))) { x = (x >> 2) | (x & 3); ex2 += 2; }
strscan_double(x, o, ex2, neg);
return fmt;
@@ -326,6 +326,49 @@ static StrScanFmt strscan_dec(const uint8_t *p, TValue *o,
return fmt;
}
+/* Parse binary number: dig digits at p. Returns format, value in o. */
+static StrScanFmt strscan_bin(const uint8_t *p, TValue *o,
+                              StrScanFmt fmt, uint32_t opt,
+                              int32_t ex2, int32_t neg, uint32_t dig)
+{
+  uint64_t x = 0;
+  uint32_t i;
+  /* A non-zero exponent is rejected and x can hold at most 64 digits. */
+  if (ex2 || dig > 64) return STRSCAN_ERROR;
+
+  /* Scan binary digits. */
+  for (i = dig; i; i--, p++) {
+    if ((*p & ~1) != '0') return STRSCAN_ERROR;  /* Only '0' or '1'. */
+    x = (x << 1) | (*p & 1);
+  }
+
+  /* Format-specific handling. */
+  switch (fmt) {
+  case STRSCAN_INT:
+    if (!(opt & STRSCAN_OPT_TONUM) && x < 0x80000000u+neg) {
+      o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;  /* Negate unsigned: no UB. */
+      return STRSCAN_INT;  /* Fast path for 32 bit integers. */
+    }
+    if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; break; }
+    /* fallthrough */
+  case STRSCAN_U32:
+    if (dig > 32) return STRSCAN_ERROR;
+    o->i = neg ? (int32_t)(~x+1u) : (int32_t)x;  /* Negate unsigned: no UB. */
+    return STRSCAN_U32;
+  case STRSCAN_I64:
+  case STRSCAN_U64:
+    o->u64 = neg ? ~x+1u : x;  /* Two's complement negation, wraps at 2^63. */
+    return fmt;
+  default:
+    break;
+  }
+
+  /* Reduce range, then convert to double. */
+  if ((x & U64x(c0000000,0000000))) { x = (x >> 2) | (x & 3); ex2 += 2; }
+  strscan_double(x, o, ex2, neg);
+  return fmt;
+}
+
/* Scan string containing a number. Returns format. Returns value in o. */
StrScanFmt lj_strscan_scan(const uint8_t *p, TValue *o, uint32_t opt)
{
@@ -364,8 +407,12 @@ StrScanFmt lj_strscan_scan(const uint8_t *p, TValue *o, uint32_t opt)
/* Determine base and skip leading zeros. */
if (LJ_UNLIKELY(*p <= '0')) {
- if (*p == '0' && casecmp(p[1], 'x'))
- base = 16, cmask = LJ_CHAR_XDIGIT, p += 2;
+ if (*p == '0') {
+ if (casecmp(p[1], 'x'))
+ base = 16, cmask = LJ_CHAR_XDIGIT, p += 2;
+ else if (casecmp(p[1], 'b'))
+ base = 2, cmask = LJ_CHAR_DIGIT, p += 2;
+ }
for ( ; ; p++) {
if (*p == '0') {
hasdig = 1;
@@ -403,7 +450,7 @@ StrScanFmt lj_strscan_scan(const uint8_t *p, TValue *o, uint32_t opt)
}
/* Parse exponent. */
- if (casecmp(*p, (uint32_t)(base == 16 ? 'p' : 'e'))) {
+ if (base >= 10 && casecmp(*p, (uint32_t)(base == 16 ? 'p' : 'e'))) {
uint32_t xx;
int negx = 0;
fmt = STRSCAN_NUM; p++;
@@ -460,6 +507,8 @@ StrScanFmt lj_strscan_scan(const uint8_t *p, TValue *o, uint32_t opt)
return strscan_oct(sp, o, fmt, neg, dig);
if (base == 16)
fmt = strscan_hex(sp, o, fmt, opt, ex, neg, dig);
+ else if (base == 2)
+ fmt = strscan_bin(sp, o, fmt, opt, ex, neg, dig);
else
fmt = strscan_dec(sp, o, fmt, opt, ex, neg, dig);
diff --git a/src/lj_tab.c b/src/lj_tab.c
index a9f4383..88bf108 100644
--- a/src/lj_tab.c
+++ b/src/lj_tab.c
@@ -246,7 +246,7 @@ void LJ_FASTCALL lj_tab_free(global_State *g, GCtab *t)
/* -- Table resizing ------------------------------------------------------ */
/* Resize a table to fit the new array/hash part sizes. */
-static void resizetab(lua_State *L, GCtab *t, uint32_t asize, uint32_t hbits)
+void lj_tab_resize(lua_State *L, GCtab *t, uint32_t asize, uint32_t hbits)
{
Node *oldnode = noderef(t->node);
uint32_t oldasize = t->asize;
@@ -383,7 +383,7 @@ static void rehashtab(lua_State *L, GCtab *t, cTValue *ek)
asize += countint(ek, bins);
na = bestasize(bins, &asize);
total -= na;
- resizetab(L, t, asize, hsize2hbits(total));
+ lj_tab_resize(L, t, asize, hsize2hbits(total));
}
#if LJ_HASFFI
@@ -395,7 +395,7 @@ void lj_tab_rehash(lua_State *L, GCtab *t)
void lj_tab_reasize(lua_State *L, GCtab *t, uint32_t nasize)
{
- resizetab(L, t, nasize+1, t->hmask > 0 ? lj_fls(t->hmask)+1 : 0);
+ lj_tab_resize(L, t, nasize+1, t->hmask > 0 ? lj_fls(t->hmask)+1 : 0);
}
/* -- Table getters ------------------------------------------------------- */
diff --git a/src/lj_tab.h b/src/lj_tab.h
index 1da28bd..7cf031b 100644
--- a/src/lj_tab.h
+++ b/src/lj_tab.h
@@ -44,6 +44,7 @@ LJ_FUNC void LJ_FASTCALL lj_tab_free(global_State *g, GCtab *t);
#if LJ_HASFFI
LJ_FUNC void lj_tab_rehash(lua_State *L, GCtab *t);
#endif
+LJ_FUNC void lj_tab_resize(lua_State *L, GCtab *t, uint32_t asize, uint32_t hbits);
LJ_FUNCA void lj_tab_reasize(lua_State *L, GCtab *t, uint32_t nasize);
/* Caveat: all getters except lj_tab_get() can return NULL! */
diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h
index 65e438f..fc9d370 100644
--- a/src/lj_target_x86.h
+++ b/src/lj_target_x86.h
@@ -132,7 +132,11 @@ enum {
#define SPS_FIXED (4*2)
#define SPS_FIRST (4*2) /* Don't use callee register save area. */
#else
+#if LJ_GC64
+#define SPS_FIXED 2
+#else
#define SPS_FIXED 4
+#endif
#define SPS_FIRST 2
#endif
#else
diff --git a/src/lj_trace.c b/src/lj_trace.c
index 39ff046..1d0c2e5 100644
--- a/src/lj_trace.c
+++ b/src/lj_trace.c
@@ -117,15 +117,22 @@ static void perftools_addtrace(GCtrace *T)
}
#endif
-/* Save current trace by copying and compacting it. */
-static void trace_save(jit_State *J)
+/* Allocate space for copy of trace. */
+static GCtrace *trace_save_alloc(jit_State *J)
{
size_t sztr = ((sizeof(GCtrace)+7)&~7);
size_t szins = (J->cur.nins-J->cur.nk)*sizeof(IRIns);
size_t sz = sztr + szins +
J->cur.nsnap*sizeof(SnapShot) +
J->cur.nsnapmap*sizeof(SnapEntry);
- GCtrace *T = lj_mem_newt(J->L, (MSize)sz, GCtrace);
+ return lj_mem_newt(J->L, (MSize)sz, GCtrace);
+}
+
+/* Save current trace by copying and compacting it. */
+static void trace_save(jit_State *J, GCtrace *T)
+{
+ size_t sztr = ((sizeof(GCtrace)+7)&~7);
+ size_t szins = (J->cur.nins-J->cur.nk)*sizeof(IRIns);
char *p = (char *)T + sztr;
memcpy(T, &J->cur, sizeof(GCtrace));
setgcrefr(T->nextgc, J2G(J)->gc.root);
@@ -267,7 +274,7 @@ int lj_trace_flushall(lua_State *L)
if (T->root == 0)
trace_flushroot(J, T);
lj_gdbjit_deltrace(J, T);
- T->traceno = 0;
+ T->traceno = T->link = 0; /* Blacklist the link for cont_stitch. */
setgcrefnull(J->trace[i]);
}
}
@@ -277,6 +284,7 @@ int lj_trace_flushall(lua_State *L)
memset(J->penalty, 0, sizeof(J->penalty));
/* Free the whole machine code and invalidate all exit stub groups. */
lj_mcode_free(J);
+ lj_ir_k64_freeall(J);
memset(J->exitstubgroup, 0, sizeof(J->exitstubgroup));
lj_vmevent_send(L, TRACE,
setstrV(L, L->top++, lj_str_newlit(L, "flush"));
@@ -394,6 +402,8 @@ static void trace_start(jit_State *J)
J->guardemit.irt = 0;
J->postproc = LJ_POST_NONE;
lj_resetsplit(J);
+ J->retryrec = 0;
+ J->ktracep = NULL;
setgcref(J->cur.startpt, obj2gco(J->pt));
L = J->L;
@@ -417,6 +427,7 @@ static void trace_stop(jit_State *J)
BCOp op = bc_op(J->cur.startins);
GCproto *pt = &gcref(J->cur.startpt)->pt;
TraceNo traceno = J->cur.traceno;
+ GCtrace *T = trace_save_alloc(J); /* Do this first. May throw OOM. */
lua_State *L;
switch (op) {
@@ -467,7 +478,10 @@ static void trace_stop(jit_State *J)
/* Commit new mcode only after all patching is done. */
lj_mcode_commit(J, J->cur.mcode);
J->postproc = LJ_POST_NONE;
- trace_save(J);
+ trace_save(J, T);
+ if (J->ktracep) { /* Patch K64Array slot with the final GCtrace pointer. */
+ setgcV(J->L, J->ktracep, obj2gco(T), LJ_TTRACE);
+ }
L = J->L;
lj_vmevent_send(L, TRACE,
@@ -510,10 +524,15 @@ static int trace_abort(jit_State *J)
}
/* Penalize or blacklist starting bytecode instruction. */
if (J->parent == 0 && !bc_isret(bc_op(J->cur.startins))) {
- if (J->exitno == 0)
- penalty_pc(J, &gcref(J->cur.startpt)->pt, mref(J->cur.startpc, BCIns), e);
- else
+ if (J->exitno == 0) {
+ BCIns *startpc = mref(J->cur.startpc, BCIns);
+ if (e == LJ_TRERR_RETRY)
+ hotcount_set(J2GG(J), startpc+1, 1); /* Immediate retry. */
+ else
+ penalty_pc(J, &gcref(J->cur.startpt)->pt, startpc, e);
+ } else {
traceref(J, J->exitno)->link = J->exitno; /* Self-link is blacklisted. */
+ }
}
/* Is there anything to abort? */
diff --git a/src/lj_traceerr.h b/src/lj_traceerr.h
index 6b377cb..d434be1 100644
--- a/src/lj_traceerr.h
+++ b/src/lj_traceerr.h
@@ -12,6 +12,7 @@ TREDEF(TRACEOV, "trace too long")
TREDEF(STACKOV, "trace too deep")
TREDEF(SNAPOV, "too many snapshots")
TREDEF(BLACKL, "blacklisted")
+TREDEF(RETRY, "retry recording")
TREDEF(NYIBC, "NYI: bytecode %d")
/* Recording loop ops. */
diff --git a/src/lua.h b/src/lua.h
index c83fd3b..352d29f 100644
--- a/src/lua.h
+++ b/src/lua.h
@@ -226,6 +226,7 @@ LUA_API int (lua_status) (lua_State *L);
#define LUA_GCSTEP 5
#define LUA_GCSETPAUSE 6
#define LUA_GCSETSTEPMUL 7
+#define LUA_GCISRUNNING 9
LUA_API int (lua_gc) (lua_State *L, int what, int data);
diff --git a/src/luaconf.h b/src/luaconf.h
index 043590b..79f5148 100644
--- a/src/luaconf.h
+++ b/src/luaconf.h
@@ -37,7 +37,7 @@
#endif
#define LUA_LROOT "/usr/local"
#define LUA_LUADIR "/lua/5.1/"
-#define LUA_LJDIR "/luajit-2.1.0-alpha/"
+#define LUA_LJDIR "/luajit-2.1.0-beta1/"
#ifdef LUA_ROOT
#define LUA_JROOT LUA_ROOT
diff --git a/src/luajit.h b/src/luajit.h
index 3db4bba..9604185 100644
--- a/src/luajit.h
+++ b/src/luajit.h
@@ -30,9 +30,9 @@
#include "lua.h"
-#define LUAJIT_VERSION "LuaJIT 2.1.0-alpha"
+#define LUAJIT_VERSION "LuaJIT 2.1.0-beta1"
#define LUAJIT_VERSION_NUM 20100 /* Version 2.1.0 = 02.01.00. */
-#define LUAJIT_VERSION_SYM luaJIT_version_2_1_0_alpha
+#define LUAJIT_VERSION_SYM luaJIT_version_2_1_0_beta1
#define LUAJIT_COPYRIGHT "Copyright (C) 2005-2015 Mike Pall"
#define LUAJIT_URL "http://luajit.org/"
diff --git a/src/ps4build.bat b/src/ps4build.bat
index 42fc9a6..337a44f 100644
--- a/src/ps4build.bat
+++ b/src/ps4build.bat
@@ -27,11 +27,11 @@ if exist minilua.exe.manifest^
@minilua
@if not errorlevel 8 goto :FAIL
-@set DASMFLAGS=-D P64
+@set DASMFLAGS=-D P64 -D NO_UNWIND
minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x86.dasc
@if errorlevel 1 goto :BAD
-%LJCOMPILE% /I "." /I %DASMDIR% -DLUAJIT_TARGET=LUAJIT_ARCH_X64 -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI host\buildvm*.c
+%LJCOMPILE% /I "." /I %DASMDIR% -DLUAJIT_TARGET=LUAJIT_ARCH_X64 -DLUAJIT_OS=LUAJIT_OS_OTHER -DLUAJIT_DISABLE_JIT -DLUAJIT_DISABLE_FFI -DLUAJIT_NO_UNWIND host\buildvm*.c
@if errorlevel 1 goto :BAD
%LJLINK% /out:buildvm.exe buildvm*.obj
@if errorlevel 1 goto :BAD
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index 0bd9b14..af722f9 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
@@ -2086,7 +2086,7 @@ static void build_subroutines(BuildCtx *ctx)
| // RA = resultptr, CARG4 = meta base
| ldr RB, SAVE_MULTRES
| ldr INS, [PC, #-4]
- | ldr CARG3, [CARG4, #-24] // Save previous trace number.
+ | ldr TRACE:CARG3, [CARG4, #-24] // Save previous trace.
| subs RB, RB, #8
| decode_RA8 RC, INS // Call base.
| beq >2
@@ -2101,23 +2101,20 @@ static void build_subroutines(BuildCtx *ctx)
| decode_RA8 RA, INS
| decode_RB8 RB, INS
| add RA, RA, RB
- | ldr CARG1, [DISPATCH, #DISPATCH_J(trace)]
|3:
| cmp RA, RC
| mvn CARG2, #~LJ_TNIL
| bhi >9 // More results wanted?
|
- | ldr TRACE:RA, [CARG1, CARG3, lsl #2]
- | cmp TRACE:RA, #0
- | beq ->cont_nop
- | ldrh RC, TRACE:RA->link
- | cmp RC, CARG3
+ | ldrh RA, TRACE:CARG3->traceno
+ | ldrh RC, TRACE:CARG3->link
+ | cmp RC, RA
| beq ->cont_nop // Blacklisted.
| cmp RC, #0
| bne =>BC_JLOOP // Jump to stitched trace.
|
| // Stitch a new trace to the previous trace.
- | str CARG3, [DISPATCH, #DISPATCH_J(exitno)]
+ | str RA, [DISPATCH, #DISPATCH_J(exitno)]
| str L, [DISPATCH, #DISPATCH_J(L)]
| str BASE, L->base
| sub CARG1, DISPATCH, #-GG_DISP2J
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index a31cbb3..f1251f2 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -853,10 +853,10 @@ static void build_subroutines(BuildCtx *ctx)
| str PC, SAVE_PC
| add CARG3, RA, NARGS8:RC
| bl extern lj_meta_call // (lua_State *L, TValue *func, TValue *top)
- | ldr LFUNC:CARG3, [RA, FRAME_FUNC] // Guaranteed to be a function here.
+ | ldr TMP1, [RA, FRAME_FUNC] // Guaranteed to be a function here.
| ldr PC, [BASE, FRAME_PC]
| add NARGS8:RC, NARGS8:RC, #8 // Got one more argument now.
- | and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+ | and LFUNC:CARG3, TMP1, #LJ_GCVMASK
| b ->BC_CALLT2_Z
|
|//-- Argument coercion for 'for' statement ------------------------------
@@ -1309,6 +1309,7 @@ static void build_subroutines(BuildCtx *ctx)
| blo ->fff_fallback
| cmp TISNUMhi, CARG1, lsr #32
| beq ->fff_restv
+ | blo ->fff_fallback
| round d0, d0
| b ->fff_resn
|.endmacro
diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc
index 7cfdf4b..134ed56 100644
--- a/src/vm_mips.dasc
+++ b/src/vm_mips.dasc
@@ -2015,7 +2015,7 @@ static void build_subroutines(BuildCtx *ctx)
|.if JIT
| // RA = resultptr, RB = meta base
| lw INS, -4(PC)
- | lw TMP3, -24+LO(RB) // Save previous trace number.
+ | lw TMP2, -24+LO(RB) // Save previous trace.
| decode_RA8a RC, INS
| addiu AT, MULTRES, -8
| decode_RA8b RC
@@ -2034,17 +2034,13 @@ static void build_subroutines(BuildCtx *ctx)
| decode_RA8b RA
| decode_RB8b RB
| addu RA, RA, RB
- | lw TMP1, DISPATCH_J(trace)(DISPATCH)
| addu RA, BASE, RA
|3:
| sltu AT, RC, RA
| bnez AT, >9 // More results wanted?
- |. sll TMP2, TMP3, 2
+ |. nop
|
- | addu TMP2, TMP1, TMP2
- | lw TRACE:TMP2, 0(TMP2)
- | beqz TRACE:TMP2, ->cont_nop
- |. nop
+ | lhu TMP3, TRACE:TMP2->traceno
| lhu RD, TRACE:TMP2->link
| beq RD, TMP3, ->cont_nop // Blacklisted.
|. load_got lj_dispatch_stitch
diff --git a/src/vm_ppc.dasc b/src/vm_ppc.dasc
index 2a7a745..0d6915f 100644
--- a/src/vm_ppc.dasc
+++ b/src/vm_ppc.dasc
@@ -320,13 +320,6 @@
|
|//-----------------------------------------------------------------------
|
-|// These basic macros should really be part of DynASM.
-|.macro srwi, rx, ry, n; rlwinm rx, ry, 32-n, n, 31; .endmacro
-|.macro slwi, rx, ry, n; rlwinm rx, ry, n, 0, 31-n; .endmacro
-|.macro rotlwi, rx, ry, n; rlwinm rx, ry, n, 0, 31; .endmacro
-|.macro rotlw, rx, ry, rn; rlwnm rx, ry, rn, 0, 31; .endmacro
-|.macro subi, rx, ry, i; addi rx, ry, -i; .endmacro
-|
|// Trap for not-yet-implemented parts.
|.macro NYI; tw 4, sp, sp; .endmacro
|
@@ -2532,7 +2525,7 @@ static void build_subroutines(BuildCtx *ctx)
|.if JIT
| // RA = resultptr, RB = meta base
| lwz INS, -4(PC)
- | lwz TMP3, -20(RB) // Save previous trace number.
+ | lwz TRACE:TMP2, -20(RB) // Save previous trace.
| addic. TMP1, MULTRES, -8
| decode_RA8 RC, INS // Call base.
| beq >2
@@ -2547,15 +2540,11 @@ static void build_subroutines(BuildCtx *ctx)
| decode_RA8 RA, INS
| decode_RB8 RB, INS
| add RA, RA, RB
- | lwz TMP1, DISPATCH_J(trace)(DISPATCH)
|3:
| cmplw RA, RC
| bgt >9 // More results wanted?
|
- | slwi TMP2, TMP3, 2
- | lwzx TRACE:TMP2, TMP1, TMP2
- | cmpwi TRACE:TMP2, 0
- | beq ->cont_nop
+ | lhz TMP3, TRACE:TMP2->traceno
| lhz RD, TRACE:TMP2->link
| cmpw RD, TMP3
| cmpwi cr1, RD, 0
@@ -4289,13 +4278,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| lwz TAB:CARG2, 4(RB)
|.if DUALNUM
| add RC, BASE, RC
- | lbz TMP3, TAB:RB->marked
+ | lbz TMP3, TAB:CARG2->marked
| lwz TMP0, TAB:CARG2->asize
| lwz CARG3, 4(RC)
| lwz TMP1, TAB:CARG2->array
|.else
| lfdx f0, BASE, RC
- | lbz TMP3, TAB:RB->marked
+ | lbz TMP3, TAB:CARG2->marked
| lwz TMP0, TAB:CARG2->asize
| toint CARG3, f0
| lwz TMP1, TAB:CARG2->array
diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc
new file mode 100644
index 0000000..bba89aa
--- /dev/null
+++ b/src/vm_x64.dasc
@@ -0,0 +1,4902 @@
+|// Low-level VM code for x64 CPUs in LJ_GC64 mode.
+|// Bytecode interpreter, fast functions and helper functions.
+|// Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
+|
+|.arch x64
+|.section code_op, code_sub
+|
+|.actionlist build_actionlist
+|.globals GLOB_
+|.globalnames globnames
+|.externnames extnames
+|
+|//-----------------------------------------------------------------------
+|
+|.if WIN
+|.define X64WIN, 1 // Windows/x64 calling conventions.
+|.endif
+|
+|// Fixed register assignments for the interpreter.
+|// This is very fragile and has many dependencies. Caveat emptor.
+|.define BASE, rdx // Not C callee-save, refetched anyway.
+|.if X64WIN
+|.define KBASE, rdi // Must be C callee-save.
+|.define PC, rsi // Must be C callee-save.
+|.define DISPATCH, rbx // Must be C callee-save.
+|.define KBASEd, edi
+|.define PCd, esi
+|.define DISPATCHd, ebx
+|.else
+|.define KBASE, r15 // Must be C callee-save.
+|.define PC, rbx // Must be C callee-save.
+|.define DISPATCH, r14 // Must be C callee-save.
+|.define KBASEd, r15d
+|.define PCd, ebx
+|.define DISPATCHd, r14d
+|.endif
+|
+|.define RA, rcx
+|.define RAd, ecx
+|.define RAH, ch
+|.define RAL, cl
+|.define RB, rbp // Must be rbp (C callee-save).
+|.define RBd, ebp
+|.define RC, rax // Must be rax.
+|.define RCd, eax
+|.define RCW, ax
+|.define RCH, ah
+|.define RCL, al
+|.define OP, RBd
+|.define RD, RC
+|.define RDd, RCd
+|.define RDW, RCW
+|.define RDL, RCL
+|.define TMPR, r10
+|.define TMPRd, r10d
+|.define ITYPE, r11
+|.define ITYPEd, r11d
+|
+|.if X64WIN
+|.define CARG1, rcx // x64/WIN64 C call arguments.
+|.define CARG2, rdx
+|.define CARG3, r8
+|.define CARG4, r9
+|.define CARG1d, ecx
+|.define CARG2d, edx
+|.define CARG3d, r8d
+|.define CARG4d, r9d
+|.else
+|.define CARG1, rdi // x64/POSIX C call arguments.
+|.define CARG2, rsi
+|.define CARG3, rdx
+|.define CARG4, rcx
+|.define CARG5, r8
+|.define CARG6, r9
+|.define CARG1d, edi
+|.define CARG2d, esi
+|.define CARG3d, edx
+|.define CARG4d, ecx
+|.define CARG5d, r8d
+|.define CARG6d, r9d
+|.endif
+|
+|// Type definitions. Some of these are only used for documentation.
+|.type L, lua_State
+|.type GL, global_State
+|.type TVALUE, TValue
+|.type GCOBJ, GCobj
+|.type STR, GCstr
+|.type TAB, GCtab
+|.type LFUNC, GCfuncL
+|.type CFUNC, GCfuncC
+|.type PROTO, GCproto
+|.type UPVAL, GCupval
+|.type NODE, Node
+|.type NARGS, int
+|.type TRACE, GCtrace
+|.type SBUF, SBuf
+|
+|// Stack layout while in interpreter. Must match with lj_frame.h.
+|//-----------------------------------------------------------------------
+|.if X64WIN // x64/Windows stack layout
+|
+|.define CFRAME_SPACE, aword*5 // Delta for rsp (see <--).
+|.macro saveregs_ // Pushes rdi, rsi, rbx, then reserves CFRAME_SPACE. Expects rbp to be pushed already.
+| push rdi; push rsi; push rbx
+| sub rsp, CFRAME_SPACE // Reserve the frame area below the saved registers.
+|.endmacro
+|.macro saveregs // Full prologue: push rbp first, then the saveregs_ sequence.
+| push rbp; saveregs_
+|.endmacro
+|.macro restoreregs // Exact inverse of saveregs: release the frame, pop in reverse push order.
+| add rsp, CFRAME_SPACE // Release the area reserved by saveregs_.
+| pop rbx; pop rsi; pop rdi; pop rbp
+|.endmacro
+|
+|.define SAVE_CFRAME, aword [rsp+aword*13]
+|.define SAVE_PC, aword [rsp+aword*12]
+|.define SAVE_L, aword [rsp+aword*11]
+|.define SAVE_ERRF, dword [rsp+dword*21]
+|.define SAVE_NRES, dword [rsp+dword*20]
+|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by interpreter
+|.define SAVE_RET, aword [rsp+aword*9] //<-- rsp entering interpreter.
+|.define SAVE_R4, aword [rsp+aword*8]
+|.define SAVE_R3, aword [rsp+aword*7]
+|.define SAVE_R2, aword [rsp+aword*6]
+|.define SAVE_R1, aword [rsp+aword*5] //<-- rsp after register saves.
+|.define ARG5, aword [rsp+aword*4]
+|.define CSAVE_4, aword [rsp+aword*3]
+|.define CSAVE_3, aword [rsp+aword*2]
+|.define CSAVE_2, aword [rsp+aword*1]
+|.define CSAVE_1, aword [rsp] //<-- rsp while in interpreter.
+|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by callee
+|
+|.define ARG5d, dword [rsp+dword*8]
+|.define TMP1, ARG5 // TMP1 overlaps ARG5
+|.define TMP1d, ARG5d
+|.define TMP1hi, dword [rsp+dword*9]
+|.define MULTRES, TMP1d // MULTRES overlaps TMP1d.
+|
+|//-----------------------------------------------------------------------
+|.else // x64/POSIX stack layout
+|
+|.define CFRAME_SPACE, aword*5 // Delta for rsp (see <--).
+|.macro saveregs_ // Pushes rbx, r15, r14 (plus r13, r12 if NO_UNWIND), then reserves CFRAME_SPACE.
+| push rbx; push r15; push r14
+|.if NO_UNWIND
+| push r13; push r12
+|.endif
+| sub rsp, CFRAME_SPACE // Reserve the frame area below the saved registers.
+|.endmacro
+|.macro saveregs // Full prologue: push rbp first, then the saveregs_ sequence.
+| push rbp; saveregs_
+|.endmacro
+|.macro restoreregs // Exact inverse of saveregs: release the frame, pop in reverse push order.
+| add rsp, CFRAME_SPACE // Release the area reserved by saveregs_.
+|.if NO_UNWIND
+| pop r12; pop r13
+|.endif
+| pop r14; pop r15; pop rbx; pop rbp
+|.endmacro
+|
+|//----- 16 byte aligned,
+|.if NO_UNWIND
+|.define SAVE_RET, aword [rsp+aword*11] //<-- rsp entering interpreter.
+|.define SAVE_R4, aword [rsp+aword*10]
+|.define SAVE_R3, aword [rsp+aword*9]
+|.define SAVE_R2, aword [rsp+aword*8]
+|.define SAVE_R1, aword [rsp+aword*7]
+|.define SAVE_RU2, aword [rsp+aword*6]
+|.define SAVE_RU1, aword [rsp+aword*5] //<-- rsp after register saves.
+|.else
+|.define SAVE_RET, aword [rsp+aword*9] //<-- rsp entering interpreter.
+|.define SAVE_R4, aword [rsp+aword*8]
+|.define SAVE_R3, aword [rsp+aword*7]
+|.define SAVE_R2, aword [rsp+aword*6]
+|.define SAVE_R1, aword [rsp+aword*5] //<-- rsp after register saves.
+|.endif
+|.define SAVE_CFRAME, aword [rsp+aword*4]
+|.define SAVE_PC, aword [rsp+aword*3]
+|.define SAVE_L, aword [rsp+aword*2]
+|.define SAVE_ERRF, dword [rsp+dword*3]
+|.define SAVE_NRES, dword [rsp+dword*2]
+|.define TMP1, aword [rsp] //<-- rsp while in interpreter.
+|//----- 16 byte aligned
+|
+|.define TMP1d, dword [rsp]
+|.define TMP1hi, dword [rsp+dword*1]
+|.define MULTRES, TMP1d // MULTRES overlaps TMP1d.
+|
+|.endif
+|
+|//-----------------------------------------------------------------------
+|
+|// Instruction headers. ins_A/ins_AD/ins_AJ expand to nothing; the B/C variants unpack RB from RCH and/or RC from RCL; ins_AND complements RD.
+|.macro ins_A; .endmacro
+|.macro ins_AD; .endmacro
+|.macro ins_AJ; .endmacro
+|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro
+|.macro ins_AB_; movzx RBd, RCH; .endmacro
+|.macro ins_A_C; movzx RCd, RCL; .endmacro
+|.macro ins_AND; not RD; .endmacro
+|
+|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
+|.macro ins_NEXT
+| mov RCd, [PC] // Fetch the 32 bit instruction word at PC.
+| movzx RAd, RCH // RA = bits 8-15 of the instruction word.
+| movzx OP, RCL // OP = bits 0-7 of the instruction word. OP aliases RBd.
+| add PC, 4 // PC now points after the fetched instruction.
+| shr RCd, 16 // RC (= RD) = upper 16 bits of the instruction word.
+| jmp aword [DISPATCH+OP*8] // Indirect jump through the 8 byte slot at DISPATCH+OP*8.
+|.endmacro
+|
+|// Instruction footer.
+|.if 1
+| // Replicated dispatch. Less unpredictable branches, but higher I-Cache use.
+| .define ins_next, ins_NEXT
+| .define ins_next_, ins_NEXT
+|.else
+| // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
+| // Affects only certain kinds of benchmarks (and only with -j off).
+| // Around 10%-30% slower on Core2, a lot more slower on P4.
+| .macro ins_next
+| jmp ->ins_next
+| .endmacro
+| .macro ins_next_
+| ->ins_next:
+| ins_NEXT
+| .endmacro
+|.endif
+|
+|// Call decode and dispatch.
+|.macro ins_callt
+| // BASE = new base, RB = LFUNC, RD = nargs+1, [BASE-8] = PC
+| mov PC, LFUNC:RB->pc // PC = pc field of the function object in RB.
+| mov RAd, [PC] // Fetch the 32 bit instruction word at that PC.
+| movzx OP, RAL // OP = bits 0-7. OP aliases RBd, so RB (LFUNC) is overwritten here.
+| movzx RAd, RAH // RA = bits 8-15.
+| add PC, 4 // PC now points after the fetched instruction.
+| jmp aword [DISPATCH+OP*8] // Dispatch. RD is not modified by this macro.
+|.endmacro
+|
+|.macro ins_call
+| // BASE = new base, RB = LFUNC, RD = nargs+1
+| mov [BASE-8], PC // Store the current PC in the frame slot at [BASE-8], as ins_callt expects.
+| ins_callt
+|.endmacro
+|
+|//-----------------------------------------------------------------------
+|
+|// Macros to clear or set tags. Tags are placed at bit 47 and up; cleartp zeroes the top 17 bits and keeps the low 47.
+|.macro cleartp, reg; shl reg, 17; shr reg, 17; .endmacro
+|.macro settp, reg, tp // In-place form: reg |= tp<<47. Clobbers ITYPE.
+| mov64 ITYPE, ((int64_t)tp<<47)
+| or reg, ITYPE
+|.endmacro
+|.macro settp, dst, reg, tp // Three-operand form: dst = (tp<<47) | reg. Leaves reg and ITYPE alone.
+| mov64 dst, ((int64_t)tp<<47)
+| or dst, reg
+|.endmacro
+|.macro setint, reg // In-place settp with LJ_TISNUM. Clobbers ITYPE.
+| settp reg, LJ_TISNUM
+|.endmacro
+|.macro setint, dst, reg // Three-operand settp with LJ_TISNUM.
+| settp dst, reg, LJ_TISNUM
+|.endmacro
+|
+|// Macros to test operand types. All of them clobber ITYPE and branch to target on a tag mismatch.
+|.macro checktp_nc, reg, tp, target // Non-clearing form: reg is left unchanged.
+| mov ITYPE, reg
+| sar ITYPE, 47 // Arithmetic shift: ITYPE = sign-extended bits 47..63 of reg.
+| cmp ITYPEd, tp
+| jne target
+|.endmacro
+|.macro checktp, reg, tp, target // Clearing form: reg loses its tag even if the branch is taken.
+| mov ITYPE, reg
+| cleartp reg // Runs before the cmp, so jne only sees the flags set by cmp.
+| sar ITYPE, 47
+| cmp ITYPEd, tp
+| jne target
+|.endmacro
+|.macro checktptp, src, tp, target // Same instruction sequence as checktp_nc; src is only read.
+| mov ITYPE, src
+| sar ITYPE, 47
+| cmp ITYPEd, tp
+| jne target
+|.endmacro
+|.macro checkstr, reg, target; checktp reg, LJ_TSTR, target; .endmacro
+|.macro checktab, reg, target; checktp reg, LJ_TTAB, target; .endmacro
+|.macro checkfunc, reg, target; checktp reg, LJ_TFUNC, target; .endmacro
+|
+|.macro checknumx, reg, target, jump // Shared body of the number checks below. Clobbers ITYPE; reg is only read.
+| mov ITYPE, reg
+| sar ITYPE, 47 // ITYPE = sign-extended bits 47..63 of reg.
+| cmp ITYPEd, LJ_TISNUM
+| jump target // Condition comes from the wrapper: jne (checkint*), jae (checknum*), ja (checknumber).
+|.endmacro
+|.macro checkint, reg, target; checknumx reg, target, jne; .endmacro
+|.macro checkinttp, src, target; checknumx src, target, jne; .endmacro
+|.macro checknum, reg, target; checknumx reg, target, jae; .endmacro
+|.macro checknumtp, src, target; checknumx src, target, jae; .endmacro
+|.macro checknumber, src, target; checknumx src, target, ja; .endmacro
+|
+|.macro mov_false, reg; mov64 reg, (int64_t)~((uint64_t)1<<47); .endmacro
+|.macro mov_true, reg; mov64 reg, (int64_t)~((uint64_t)2<<47); .endmacro
+|
+|// These operands must be used with movzx.
+|.define PC_OP, byte [PC-4]
+|.define PC_RA, byte [PC-3]
+|.define PC_RB, byte [PC-1]
+|.define PC_RC, byte [PC-2]
+|.define PC_RD, word [PC-2]
+|
+|.macro branchPC, reg
+| lea PC, [PC+reg*4-BCBIAS_J*4]
+|.endmacro
+|
+|// Assumes DISPATCH is relative to GL.
+#define DISPATCH_GL(field) (GG_DISP2G + (int)offsetof(global_State, field))
+#define DISPATCH_J(field) (GG_DISP2J + (int)offsetof(jit_State, field))
+|
+#define PC2PROTO(field) ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
+|
+|// Decrement hashed hotcount and trigger trace recorder if zero.
+|.macro hotloop, reg
+| mov reg, PCd // Hash input is the low 32 bits of PC.
+| shr reg, 1
+| and reg, HOTCOUNT_PCMASK // reg = byte offset into the hotcount table.
+| sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP // 16 bit counter at DISPATCH+GG_DISP2HOT+reg.
+| jb ->vm_hotloop // Taken when the subtraction borrows, i.e. counter < HOTCOUNT_LOOP.
+|.endmacro
+|
+|.macro hotcall, reg
+| mov reg, PCd // Same hashing as hotloop.
+| shr reg, 1
+| and reg, HOTCOUNT_PCMASK
+| sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_CALL // Same table, but decrements by HOTCOUNT_CALL.
+| jb ->vm_hotcall // Taken when the subtraction borrows, i.e. counter < HOTCOUNT_CALL.
+|.endmacro
+|
+|// Set current VM state.
+|.macro set_vmstate, st
+| mov dword [DISPATCH+DISPATCH_GL(vmstate)], ~LJ_VMST_..st // Stores the complement of LJ_VMST_<st>; '..' pastes the macro argument onto the prefix.
+|.endmacro
+|
+|.macro fpop1; fstp st1; .endmacro
+|
+|// Synthesize SSE FP constants. Each macro builds a 64 bit immediate in tmp and copies it to reg; tmp is clobbered.
+|.macro sseconst_abs, reg, tmp // Synthesize abs mask.
+| mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp
+|.endmacro
+|
+|.macro sseconst_hi, reg, tmp, val // Synthesize hi-32 bit const. val = upper 32 bits as hex digits; lower 32 bits are zero.
+| mov64 tmp, U64x(val,00000000); movd reg, tmp
+|.endmacro
+|
+|.macro sseconst_sign, reg, tmp // Synthesize sign mask.
+| sseconst_hi reg, tmp, 80000000
+|.endmacro
+|.macro sseconst_1, reg, tmp // Synthesize 1.0.
+| sseconst_hi reg, tmp, 3ff00000
+|.endmacro
+|.macro sseconst_m1, reg, tmp // Synthesize -1.0.
+| sseconst_hi reg, tmp, bff00000
+|.endmacro
+|.macro sseconst_2p52, reg, tmp // Synthesize 2^52.
+| sseconst_hi reg, tmp, 43300000
+|.endmacro
+|.macro sseconst_tobit, reg, tmp // Synthesize 2^52 + 2^51.
+| sseconst_hi reg, tmp, 43380000
+|.endmacro
+|
+|// Move table write barrier back. Overwrites reg.
+|.macro barrierback, tab, reg
+| and byte tab->marked, (uint8_t)~LJ_GC_BLACK // black2gray(tab)
+| mov reg, [DISPATCH+DISPATCH_GL(gc.grayagain)] // reg = previous value of g->gc.grayagain.
+| mov [DISPATCH+DISPATCH_GL(gc.grayagain)], tab // g->gc.grayagain now points at tab.
+| mov tab->gclist, reg // tab->gclist = previous value, so tab is linked in front of it.
+|.endmacro
+|
+|//-----------------------------------------------------------------------
+
+/* Generate subroutines used by opcodes and other parts of the VM. */
+/* The .code_sub section should be last to help static branch prediction. */
+static void build_subroutines(BuildCtx *ctx)
+{
+ |.code_sub
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Return handling ----------------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |->vm_returnp:
+ | test PCd, FRAME_P
+ | jz ->cont_dispatch
+ |
+ | // Return from pcall or xpcall fast func.
+ | and PC, -8
+ | sub BASE, PC // Restore caller base.
+ | lea RA, [RA+PC-8] // Rebase RA and prepend one result.
+ | mov PC, [BASE-8] // Fetch PC of previous frame.
+ | // Prepending may overwrite the pcall frame, so do it at the end.
+ | mov_true ITYPE
+ | mov aword [BASE+RA], ITYPE // Prepend true to results.
+ |
+ |->vm_returnc:
+ | add RDd, 1 // RD = nresults+1
+ | jz ->vm_unwind_yield
+ | mov MULTRES, RDd
+ | test PC, FRAME_TYPE
+ | jz ->BC_RET_Z // Handle regular return to Lua.
+ |
+ |->vm_return:
+ | // BASE = base, RA = resultofs, RD = nresults+1 (= MULTRES), PC = return
+ | xor PC, FRAME_C
+ | test PCd, FRAME_TYPE
+ | jnz ->vm_returnp
+ |
+ | // Return to C.
+ | set_vmstate C
+ | and PC, -8
+ | sub PC, BASE
+ | neg PC // Previous base = BASE - delta.
+ |
+ | sub RDd, 1
+ | jz >2
+ |1: // Move results down.
+ | mov RB, [BASE+RA]
+ | mov [BASE-16], RB
+ | add BASE, 8
+ | sub RDd, 1
+ | jnz <1
+ |2:
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, PC
+ |3:
+ | mov RDd, MULTRES
+ | mov RAd, SAVE_NRES // RA = wanted nresults+1
+ |4:
+ | cmp RAd, RDd
+ | jne >6 // More/less results wanted?
+ |5:
+ | sub BASE, 16
+ | mov L:RB->top, BASE
+ |
+ |->vm_leave_cp:
+ | mov RA, SAVE_CFRAME // Restore previous C frame.
+ | mov L:RB->cframe, RA
+ | xor eax, eax // Ok return status for vm_pcall.
+ |
+ |->vm_leave_unw:
+ | restoreregs
+ | ret
+ |
+ |6:
+ | jb >7 // Less results wanted?
+ | // More results wanted. Check stack size and fill up results with nil.
+ | cmp BASE, L:RB->maxstack
+ | ja >8
+ | mov aword [BASE-16], LJ_TNIL
+ | add BASE, 8
+ | add RDd, 1
+ | jmp <4
+ |
+ |7: // Less results wanted.
+ | test RAd, RAd
+ | jz <5 // But check for LUA_MULTRET+1.
+ | sub RA, RD // Negative result!
+ | lea BASE, [BASE+RA*8] // Correct top.
+ | jmp <5
+ |
+ |8: // Corner case: need to grow stack for filling up results.
+ | // This can happen if:
+ | // - A C function grows the stack (a lot).
+ | // - The GC shrinks the stack in between.
+ | // - A return back from a lua_call() with (high) nresults adjustment.
+ | mov L:RB->top, BASE // Save current top held in BASE (yes).
+ | mov MULTRES, RDd // Need to fill only remainder with nil.
+ | mov CARG2d, RAd
+ | mov CARG1, L:RB
+ | call extern lj_state_growstack // (lua_State *L, int n)
+ | mov BASE, L:RB->top // Need the (realloced) L->top in BASE.
+ | jmp <3
+ |
+ |->vm_unwind_yield:
+ | mov al, LUA_YIELD
+ | jmp ->vm_unwind_c_eh
+ |
+ |->vm_unwind_c: // Unwind C stack, return from vm_pcall.
+ | // (void *cframe, int errcode)
+ | mov eax, CARG2d // Error return status for vm_pcall.
+ | mov rsp, CARG1
+ |->vm_unwind_c_eh: // Landing pad for external unwinder.
+ | mov L:RB, SAVE_L
+ | mov GL:RB, L:RB->glref
+ | mov dword GL:RB->vmstate, ~LJ_VMST_C
+ | jmp ->vm_leave_unw
+ |
+ |->vm_unwind_rethrow:
+ |.if not X64WIN
+ | mov CARG1, SAVE_L
+ | mov CARG2d, eax
+ | restoreregs
+ | jmp extern lj_err_throw // (lua_State *L, int errcode)
+ |.endif
+ |
+ |->vm_unwind_ff: // Unwind C stack, return from ff pcall.
+ | // (void *cframe)
+ | and CARG1, CFRAME_RAWMASK
+ | mov rsp, CARG1
+ |->vm_unwind_ff_eh: // Landing pad for external unwinder.
+ | mov L:RB, SAVE_L
+ | mov RDd, 1+1 // Really 1+2 results, incr. later.
+ | mov BASE, L:RB->base
+ | mov DISPATCH, L:RB->glref // Setup pointer to dispatch table.
+ | add DISPATCH, GG_G2DISP
+ | mov PC, [BASE-8] // Fetch PC of previous frame.
+ | mov_false RA
+ | mov RB, [BASE]
+ | mov [BASE-16], RA // Prepend false to error message.
+ | mov [BASE-8], RB
+ | mov RA, -16 // Results start at BASE+RA = BASE-16.
+ | set_vmstate INTERP
+ | jmp ->vm_returnc // Increments RD/MULTRES and returns.
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Grow stack for calls -----------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |->vm_growstack_c: // Grow stack for C function.
+ | mov CARG2d, LUA_MINSTACK
+ | jmp >2
+ |
+ |->vm_growstack_v: // Grow stack for vararg Lua function.
+ | sub RD, 16 // LJ_FR2
+ | jmp >1
+ |
+ |->vm_growstack_f: // Grow stack for fixarg Lua function.
+ | // BASE = new base, RD = nargs+1, RB = L, PC = first PC
+ | lea RD, [BASE+NARGS:RD*8-8]
+ |1:
+ | movzx RAd, byte [PC-4+PC2PROTO(framesize)]
+ | add PC, 4 // Must point after first instruction.
+ | mov L:RB->base, BASE
+ | mov L:RB->top, RD
+ | mov SAVE_PC, PC
+ | mov CARG2, RA
+ |2:
+ | // RB = L, L->base = new base, L->top = top
+ | mov CARG1, L:RB
+ | call extern lj_state_growstack // (lua_State *L, int n)
+ | mov BASE, L:RB->base
+ | mov RD, L:RB->top
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | sub RD, BASE
+ | shr RDd, 3
+ | add NARGS:RDd, 1
+ | // BASE = new base, RB = LFUNC, RD = nargs+1
+ | ins_callt // Just retry the call.
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Entry points into the assembler VM ---------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |->vm_resume: // Setup C frame and resume thread.
+ | // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
+ | saveregs
+ | mov L:RB, CARG1 // Caveat: CARG1 may be RA.
+ | mov SAVE_L, CARG1
+ | mov RA, CARG2
+ | mov PCd, FRAME_CP
+ | xor RDd, RDd
+ | lea KBASE, [esp+CFRAME_RESUME]
+ | mov DISPATCH, L:RB->glref // Setup pointer to dispatch table.
+ | add DISPATCH, GG_G2DISP
+ | mov SAVE_PC, RD // Any value outside of bytecode is ok.
+ | mov SAVE_CFRAME, RD
+ | mov SAVE_NRES, RDd
+ | mov SAVE_ERRF, RDd
+ | mov L:RB->cframe, KBASE
+ | cmp byte L:RB->status, RDL
+ | je >2 // Initial resume (like a call).
+ |
+ | // Resume after yield (like a return).
+ | mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+ | set_vmstate INTERP
+ | mov byte L:RB->status, RDL
+ | mov BASE, L:RB->base
+ | mov RD, L:RB->top
+ | sub RD, RA
+ | shr RDd, 3
+ | add RDd, 1 // RD = nresults+1
+ | sub RA, BASE // RA = resultofs
+ | mov PC, [BASE-8]
+ | mov MULTRES, RDd
+ | test PCd, FRAME_TYPE
+ | jz ->BC_RET_Z
+ | jmp ->vm_return
+ |
+ |->vm_pcall: // Setup protected C frame and enter VM.
+ | // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
+ | saveregs
+ | mov PCd, FRAME_CP
+ | mov SAVE_ERRF, CARG4d
+ | jmp >1
+ |
+ |->vm_call: // Setup C frame and enter VM.
+ | // (lua_State *L, TValue *base, int nres1)
+ | saveregs
+ | mov PCd, FRAME_C
+ |
+ |1: // Entry point for vm_pcall above (PC = ftype).
+ | mov SAVE_NRES, CARG3d
+ | mov L:RB, CARG1 // Caveat: CARG1 may be RA.
+ | mov SAVE_L, CARG1
+ | mov RA, CARG2
+ |
+ | mov DISPATCH, L:RB->glref // Setup pointer to dispatch table.
+ | mov KBASE, L:RB->cframe // Add our C frame to cframe chain.
+ | mov SAVE_CFRAME, KBASE
+ | mov SAVE_PC, L:RB // Any value outside of bytecode is ok.
+ | add DISPATCH, GG_G2DISP
+ | mov L:RB->cframe, rsp
+ |
+ |2: // Entry point for vm_resume/vm_cpcall (RA = base, RB = L, PC = ftype).
+ | mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+ | set_vmstate INTERP
+ | mov BASE, L:RB->base // BASE = old base (used in vmeta_call).
+ | add PC, RA
+ | sub PC, BASE // PC = frame delta + frame type
+ |
+ | mov RD, L:RB->top
+ | sub RD, RA
+ | shr NARGS:RDd, 3
+ | add NARGS:RDd, 1 // RD = nargs+1
+ |
+ |->vm_call_dispatch:
+ | mov LFUNC:RB, [RA-16]
+ | checkfunc LFUNC:RB, ->vmeta_call // Ensure KBASE defined and != BASE.
+ |
+ |->vm_call_dispatch_f:
+ | mov BASE, RA
+ | ins_call
+ | // BASE = new base, RB = func, RD = nargs+1, PC = caller PC
+ |
+ |->vm_cpcall: // Setup protected C frame, call C.
+ | // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
+ | saveregs
+ | mov L:RB, CARG1 // Caveat: CARG1 may be RA.
+ | mov SAVE_L, CARG1
+ | mov SAVE_PC, L:RB // Any value outside of bytecode is ok.
+ |
+ | mov KBASE, L:RB->stack // Compute -savestack(L, L->top).
+ | sub KBASE, L:RB->top
+ | mov DISPATCH, L:RB->glref // Setup pointer to dispatch table.
+ | mov SAVE_ERRF, 0 // No error function.
+ | mov SAVE_NRES, KBASEd // Neg. delta means cframe w/o frame.
+ | add DISPATCH, GG_G2DISP
+ | // Handler may change cframe_nres(L->cframe) or cframe_errfunc(L->cframe).
+ |
+ | mov KBASE, L:RB->cframe // Add our C frame to cframe chain.
+ | mov SAVE_CFRAME, KBASE
+ | mov L:RB->cframe, rsp
+ | mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+ |
+ | call CARG4 // (lua_State *L, lua_CFunction func, void *ud)
+ | // TValue * (new base) or NULL returned in eax (RC).
+ | test RC, RC
+ | jz ->vm_leave_cp // No base? Just remove C frame.
+ | mov RA, RC
+ | mov PCd, FRAME_CP
+ | jmp <2 // Else continue with the call.
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Metamethod handling ------------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |//-- Continuation dispatch ----------------------------------------------
+ |
+ |->cont_dispatch:
+ | // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
+ | add RA, BASE
+ | and PC, -8
+ | mov RB, BASE
+ | sub BASE, PC // Restore caller BASE.
+ | mov aword [RA+RD*8-8], LJ_TNIL // Ensure one valid arg.
+ | mov RC, RA // ... in [RC]
+ | mov PC, [RB-24] // Restore PC from [cont|PC].
+ | mov RA, qword [RB-32] // May be negative on WIN64 with debug.
+ |.if FFI
+ | cmp RA, 1
+ | jbe >1
+ |.endif
+ | mov LFUNC:KBASE, [BASE-16]
+ | cleartp LFUNC:KBASE
+ | mov KBASE, LFUNC:KBASE->pc
+ | mov KBASE, [KBASE+PC2PROTO(k)]
+ | // BASE = base, RC = result, RB = meta base
+ | jmp RA // Jump to continuation.
+ |
+ |.if FFI
+ |1:
+ | je ->cont_ffi_callback // cont = 1: return from FFI callback.
+ | // cont = 0: Tail call from C function.
+ | sub RB, BASE
+ | shr RBd, 3
+ | lea RDd, [RBd-3]
+ | jmp ->vm_call_tail
+ |.endif
+ |
+ |->cont_cat: // BASE = base, RC = result, RB = mbase
+ | movzx RAd, PC_RB
+ | sub RB, 32
+ | lea RA, [BASE+RA*8]
+ | sub RA, RB
+ | je ->cont_ra
+ | neg RA
+ | shr RAd, 3
+ |.if X64WIN
+ | mov CARG3d, RAd
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE
+ | mov RC, [RC]
+ | mov [RB], RC
+ | mov CARG2, RB
+ |.else
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE
+ | mov CARG3d, RAd
+ | mov RA, [RC]
+ | mov [RB], RA
+ | mov CARG2, RB
+ |.endif
+ | jmp ->BC_CAT_Z
+ |
+ |//-- Table indexing metamethods -----------------------------------------
+ |
+ |->vmeta_tgets: // String-key entry: builds a temporary key TValue in TMP1.
+ | settp STR:RC, LJ_TSTR // STR:RC = GCstr *
+ | mov TMP1, STR:RC
+ | lea RC, TMP1 // RC = &TMP1, passed as TValue *k below.
+ | cmp PC_OP, BC_GGET
+ | jne >1 // Not GGET: reload the table operand from slot RB at 1.
+ | settp TAB:RA, TAB:RB, LJ_TTAB // TAB:RB = GCtab *
+ | lea RB, [DISPATCH+DISPATCH_GL(tmptv)] // Store fn->l.env in g->tmptv.
+ | mov [RB], TAB:RA
+ | jmp >2
+ |
+ |->vmeta_tgetb: // Byte-constant key entry: key is materialized in TMP1.
+ | movzx RCd, PC_RC
+ |.if DUALNUM
+ | setint RC
+ | mov TMP1, RC
+ |.else
+ | cvtsi2sd xmm0, RCd // Key stored as a double when not DUALNUM.
+ | movsd TMP1, xmm0
+ |.endif
+ | lea RC, TMP1
+ | jmp >1
+ |
+ |->vmeta_tgetv: // Variable key entry: key is the stack slot named by RC.
+ | movzx RCd, PC_RC // Reload TValue *k from RC.
+ | lea RC, [BASE+RC*8]
+ |1:
+ | movzx RBd, PC_RB // Reload TValue *t from RB.
+ | lea RB, [BASE+RB*8]
+ |2: // Common tail: RB = TValue *t, RC = TValue *k.
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE // Caveat: CARG2/CARG3 may be BASE.
+ | mov CARG2, RB
+ | mov CARG3, RC
+ | mov L:RB, L:CARG1 // Keep L in RB across the call.
+ | mov SAVE_PC, PC
+ | call extern lj_meta_tget // (lua_State *L, TValue *o, TValue *k)
+ | // TValue * (finished) or NULL (metamethod) returned in eax (RC).
+ | mov BASE, L:RB->base // Reload BASE from L->base after the call.
+ | test RC, RC
+ | jz >3
+ |->cont_ra: // BASE = base, RC = result
+ | movzx RAd, PC_RA
+ | mov RB, [RC]
+ | mov [BASE+RA*8], RB // Copy *RC into destination slot RA.
+ | ins_next
+ |
+ |3: // Call __index metamethod.
+ | // BASE = base, L->top = new base, stack = cont/func/t/k
+ | mov RA, L:RB->top
+ | mov [RA-24], PC // [cont|PC]
+ | lea PC, [RA+FRAME_CONT]
+ | sub PC, BASE // PC = frame link: (new base - BASE) + FRAME_CONT.
+ | mov LFUNC:RB, [RA-16] // Guaranteed to be a function here.
+ | mov NARGS:RDd, 2+1 // 2 args for func(t, k).
+ | cleartp LFUNC:RB
+ | jmp ->vm_call_dispatch_f
+ |
+ |->vmeta_tgetr: // Integer-key lookup through lj_tab_getinth; TAB:RB = table, RCd = key.
+ | mov CARG1, TAB:RB
+ | mov RB, BASE // Save BASE.
+ | mov CARG2d, RCd // Caveat: CARG2 == BASE
+ | call extern lj_tab_getinth // (GCtab *t, int32_t key)
+ | // cTValue * or NULL returned in eax (RC).
+ | movzx RAd, PC_RA
+ | mov BASE, RB // Restore BASE.
+ | test RC, RC
+ | jnz ->BC_TGETR_Z // Non-NULL: continue at BC_TGETR_Z with RC = slot pointer.
+ | mov ITYPE, LJ_TNIL // NULL: continue at BC_TGETR2_Z with nil in ITYPE.
+ | jmp ->BC_TGETR2_Z
+ |
+ |//-----------------------------------------------------------------------
+ |
+ |->vmeta_tsets: // String-key entry: builds a temporary key TValue in TMP1.
+ | settp STR:RC, LJ_TSTR // STR:RC = GCstr *
+ | mov TMP1, STR:RC
+ | lea RC, TMP1 // RC = &TMP1, passed as TValue *k below.
+ | cmp PC_OP, BC_GSET
+ | jne >1 // Not GSET: reload the table operand from slot RB at 1.
+ | settp TAB:RA, TAB:RB, LJ_TTAB // TAB:RB = GCtab *
+ | lea RB, [DISPATCH+DISPATCH_GL(tmptv)] // Store fn->l.env in g->tmptv.
+ | mov [RB], TAB:RA
+ | jmp >2
+ |
+ |->vmeta_tsetb: // Byte-constant key entry: key is materialized in TMP1.
+ | movzx RCd, PC_RC
+ |.if DUALNUM
+ | setint RC
+ | mov TMP1, RC
+ |.else
+ | cvtsi2sd xmm0, RCd // Key stored as a double when not DUALNUM.
+ | movsd TMP1, xmm0
+ |.endif
+ | lea RC, TMP1
+ | jmp >1
+ |
+ |->vmeta_tsetv: // Variable key entry: key is the stack slot named by RC.
+ | movzx RCd, PC_RC // Reload TValue *k from RC.
+ | lea RC, [BASE+RC*8]
+ |1:
+ | movzx RBd, PC_RB // Reload TValue *t from RB.
+ | lea RB, [BASE+RB*8]
+ |2: // Common tail: RB = TValue *t, RC = TValue *k.
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE // Caveat: CARG2/CARG3 may be BASE.
+ | mov CARG2, RB
+ | mov CARG3, RC
+ | mov L:RB, L:CARG1 // Keep L in RB across the call.
+ | mov SAVE_PC, PC
+ | call extern lj_meta_tset // (lua_State *L, TValue *o, TValue *k)
+ | // TValue * (finished) or NULL (metamethod) returned in eax (RC).
+ | mov BASE, L:RB->base // Reload BASE from L->base after the call.
+ | test RC, RC
+ | jz >3
+ | // NOBARRIER: lj_meta_tset ensures the table is not black.
+ | movzx RAd, PC_RA
+ | mov RB, [BASE+RA*8]
+ | mov [RC], RB // Store the value from slot RA into the returned slot.
+ |->cont_nop: // BASE = base, (RC = result)
+ | ins_next
+ |
+ |3: // Call __newindex metamethod.
+ | // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
+ | mov RA, L:RB->top
+ | mov [RA-24], PC // [cont|PC]
+ | movzx RCd, PC_RA
+ | // Copy value to third argument.
+ | mov RB, [BASE+RC*8]
+ | mov [RA+16], RB
+ | lea PC, [RA+FRAME_CONT]
+ | sub PC, BASE // PC = frame link: (new base - BASE) + FRAME_CONT.
+ | mov LFUNC:RB, [RA-16] // Guaranteed to be a function here.
+ | mov NARGS:RDd, 3+1 // 3 args for func(t, k, v).
+ | cleartp LFUNC:RB
+ | jmp ->vm_call_dispatch_f
+ |
+ |->vmeta_tsetr: // Integer-key store through lj_tab_setinth; TAB:RB = table, RCd = key.
+ |.if X64WIN
+ | mov L:CARG1, SAVE_L
+ | mov CARG3d, RCd
+ | mov L:CARG1->base, BASE
+ | xchg CARG2, TAB:RB // Caveat: CARG2 == BASE.
+ |.else
+ | mov L:CARG1, SAVE_L
+ | mov CARG2, TAB:RB
+ | mov L:CARG1->base, BASE
+ | mov RB, BASE // Save BASE.
+ | mov CARG3d, RCd // Caveat: CARG3 == BASE.
+ |.endif
+ | mov SAVE_PC, PC // Both variants leave the old BASE in RB at this point.
+ | call extern lj_tab_setinth // (lua_State *L, GCtab *t, int32_t key)
+ | // TValue * returned in eax (RC).
+ | movzx RAd, PC_RA
+ | mov BASE, RB // Restore BASE.
+ | jmp ->BC_TSETR_Z
+ |
+ |//-- Comparison metamethods ---------------------------------------------
+ |
+ |->vmeta_comp: // Comparison fallback: calls lj_meta_comp on slots RA and RD.
+ | movzx RDd, PC_RD
+ | movzx RAd, PC_RA
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Caveat: CARG2/CARG3 == BASE.
+ |.if X64WIN
+ | lea CARG3, [BASE+RD*8]
+ | lea CARG2, [BASE+RA*8]
+ |.else
+ | lea CARG2, [BASE+RA*8]
+ | lea CARG3, [BASE+RD*8]
+ |.endif
+ | mov CARG1, L:RB // Caveat: CARG1/CARG4 == RA.
+ | movzx CARG4d, PC_OP
+ | mov SAVE_PC, PC
+ | call extern lj_meta_comp // (lua_State *L, TValue *o1, *o2, int op)
+ | // 0/1 or TValue * (metamethod) returned in eax (RC).
+ |3: // Shared result handling for comp/equal/equal_cd.
+ | mov BASE, L:RB->base
+ | cmp RC, 1
+ | ja ->vmeta_binop // RC > 1 (unsigned): TValue *, call the metamethod.
+ |4: // Entered with flags set: "below" means do not branch.
+ | lea PC, [PC+4] // lea leaves the flags from the preceding cmp intact.
+ | jb >6
+ |5: // Take the branch encoded in operand D.
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |6:
+ | ins_next
+ |
+ |->cont_condt: // BASE = base, RC = result
+ | add PC, 4
+ | mov ITYPE, [RC]
+ | sar ITYPE, 47 // Extract the type tag from the upper bits of the TValue.
+ | cmp ITYPEd, LJ_TISTRUECOND // Branch if result is true.
+ | jb <5
+ | jmp <6
+ |
+ |->cont_condf: // BASE = base, RC = result
+ | mov ITYPE, [RC]
+ | sar ITYPE, 47 // Extract the type tag from the upper bits of the TValue.
+ | cmp ITYPEd, LJ_TISTRUECOND // Branch if result is false.
+ | jmp <4 // Flags from the cmp are consumed by the jb at 4.
+ |
+ |->vmeta_equal: // Equality fallback: RA/RD = objects, RBd = "ne" flag.
+ | cleartp TAB:RD
+ | sub PC, 4
+ |.if X64WIN
+ | mov CARG3, RD
+ | mov CARG4d, RBd
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Caveat: CARG2 == BASE.
+ | mov CARG2, RA
+ | mov CARG1, L:RB // Caveat: CARG1 == RA.
+ |.else
+ | mov CARG2, RA
+ | mov CARG4d, RBd // Caveat: CARG4 == RA.
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Caveat: CARG3 == BASE.
+ | mov CARG3, RD
+ | mov CARG1, L:RB
+ |.endif
+ | mov SAVE_PC, PC
+ | call extern lj_meta_equal // (lua_State *L, GCobj *o1, *o2, int ne)
+ | // 0/1 or TValue * (metamethod) returned in eax (RC).
+ | jmp <3
+ |
+ |->vmeta_equal_cd: // Label always exists; body is only assembled with FFI.
+ |.if FFI
+ | sub PC, 4
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov CARG1, L:RB
+ | mov CARG2d, dword [PC-4] // Pass the 32 bit instruction word preceding PC.
+ | mov SAVE_PC, PC
+ | call extern lj_meta_equal_cd // (lua_State *L, BCIns ins)
+ | // 0/1 or TValue * (metamethod) returned in eax (RC).
+ | jmp <3
+ |.endif
+ |
+ |->vmeta_istype: // Calls lj_meta_istype with operands RA and RD, then resumes at 6.
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Caveat: CARG2/CARG3 may be BASE.
+ | mov CARG2d, RAd
+ | mov CARG3d, RDd
+ | mov L:CARG1, L:RB
+ | mov SAVE_PC, PC
+ | call extern lj_meta_istype // (lua_State *L, BCReg ra, BCReg tp)
+ | mov BASE, L:RB->base
+ | jmp <6
+ |
+ |//-- Arithmetic metamethods ---------------------------------------------
+ |
+ |->vmeta_arith_vno: // Like vmeta_arith_vn, but reloads RB/RC when DUALNUM.
+ |.if DUALNUM
+ | movzx RBd, PC_RB
+ | movzx RCd, PC_RC
+ |.endif
+ |->vmeta_arith_vn: // Left operand = stack slot RB, right operand = constant RC.
+ | lea RC, [KBASE+RC*8]
+ | jmp >1
+ |
+ |->vmeta_arith_nvo: // Like vmeta_arith_nv, but reloads RB/RC when DUALNUM.
+ |.if DUALNUM
+ | movzx RBd, PC_RB
+ | movzx RCd, PC_RC
+ |.endif
+ |->vmeta_arith_nv: // Left operand = constant RC, right operand = stack slot RB.
+ | lea TMPR, [KBASE+RC*8]
+ | lea RC, [BASE+RB*8]
+ | mov RB, TMPR // Swap: RB = &constant, RC = &stack slot.
+ | jmp >2
+ |
+ |->vmeta_unm: // Unary minus: both operand pointers refer to slot RD.
+ | lea RC, [BASE+RD*8]
+ | mov RB, RC
+ | jmp >2
+ |
+ |->vmeta_arith_vvo: // Like vmeta_arith_vv, but reloads RB/RC when DUALNUM.
+ |.if DUALNUM
+ | movzx RBd, PC_RB
+ | movzx RCd, PC_RC
+ |.endif
+ |->vmeta_arith_vv: // Both operands are stack slots.
+ | lea RC, [BASE+RC*8]
+ |1:
+ | lea RB, [BASE+RB*8]
+ |2: // Common tail: RB = TValue *rb, RC = TValue *rc, RA = destination slot index.
+ | lea RA, [BASE+RA*8]
+ |.if X64WIN
+ | mov CARG3, RB
+ | mov CARG4, RC
+ | movzx RCd, PC_OP
+ | mov ARG5d, RCd // Fifth argument is passed via ARG5d on X64WIN.
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Caveat: CARG2 == BASE.
+ | mov CARG2, RA
+ | mov CARG1, L:RB // Caveat: CARG1 == RA.
+ |.else
+ | movzx CARG5d, PC_OP
+ | mov CARG2, RA
+ | mov CARG4, RC // Caveat: CARG4 == RA.
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE // Caveat: CARG3 == BASE.
+ | mov CARG3, RB
+ | mov L:RB, L:CARG1
+ |.endif
+ | mov SAVE_PC, PC
+ | call extern lj_meta_arith // (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
+ | // NULL (finished) or TValue * (metamethod) returned in eax (RC).
+ | mov BASE, L:RB->base
+ | test RC, RC
+ | jz ->cont_nop
+ |
+ | // Call metamethod for binary op.
+ |->vmeta_binop:
+ | // BASE = base, RC = new base, stack = cont/func/o1/o2
+ | mov RA, RC
+ | sub RC, BASE
+ | mov [RA-24], PC // [cont|PC]
+ | lea PC, [RC+FRAME_CONT] // PC = frame link: (new base - BASE) + FRAME_CONT.
+ | mov NARGS:RDd, 2+1 // 2 args for func(o1, o2).
+ | jmp ->vm_call_dispatch
+ |
+ |->vmeta_len: // Length fallback: calls lj_meta_len on slot RD.
+ | movzx RDd, PC_RD
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | lea CARG2, [BASE+RD*8] // Caveat: CARG2 == BASE
+ | mov L:CARG1, L:RB
+ | mov SAVE_PC, PC
+ | call extern lj_meta_len // (lua_State *L, TValue *o)
+ | // NULL (retry) or TValue * (metamethod) returned in eax (RC).
+ | mov BASE, L:RB->base
+#if LJ_52
+ | test RC, RC
+ | jne ->vmeta_binop // Binop call for compatibility.
+ | movzx RDd, PC_RD // NULL result: reload the operand and retry at BC_LEN_Z.
+ | mov TAB:CARG1, [BASE+RD*8]
+ | cleartp TAB:CARG1
+ | jmp ->BC_LEN_Z
+#else
+ | jmp ->vmeta_binop // Binop call for compatibility.
+#endif
+ |
+ |//-- Call metamethod ----------------------------------------------------
+ |
+ |->vmeta_call_ra: // Entry with RA = slot index: convert to new base pointer.
+ | lea RA, [BASE+RA*8+16]
+ |->vmeta_call: // Resolve and call __call metamethod.
+ | // BASE = old base, RA = new base, RC = nargs+1, PC = return
+ | mov TMP1d, NARGS:RDd // Save RA, RC for us.
+ | mov RB, RA
+ |.if X64WIN
+ | mov L:TMPR, SAVE_L
+ | mov L:TMPR->base, BASE // Caveat: CARG2 is BASE.
+ | lea CARG2, [RA-16]
+ | lea CARG3, [RA+NARGS:RD*8-8]
+ | mov CARG1, L:TMPR // Caveat: CARG1 is RA.
+ |.else
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE // Caveat: CARG3 is BASE.
+ | lea CARG2, [RA-16]
+ | lea CARG3, [RA+NARGS:RD*8-8]
+ |.endif
+ | mov SAVE_PC, PC
+ | call extern lj_meta_call // (lua_State *L, TValue *func, TValue *top)
+ | mov RA, RB // Restore the new base saved in RB before the call.
+ | mov L:RB, SAVE_L
+ | mov BASE, L:RB->base
+ | mov NARGS:RDd, TMP1d
+ | mov LFUNC:RB, [RA-16] // Reload the function slot after lj_meta_call.
+ | cleartp LFUNC:RB
+ | add NARGS:RDd, 1 // One more argument than before the call.
+ | // This is fragile. L->base must not move, KBASE must always be defined.
+ | cmp KBASE, BASE // Continue with CALLT if flag set.
+ | je ->BC_CALLT_Z
+ | mov BASE, RA
+ | ins_call // Otherwise call resolved metamethod.
+ |
+ |//-- Argument coercion for 'for' statement ------------------------------
+ |
+ |->vmeta_for: // Calls lj_meta_for on the slots at RA, then re-dispatches the instruction.
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov CARG2, RA // Caveat: CARG2 == BASE
+ | mov L:CARG1, L:RB // Caveat: CARG1 == RA
+ | mov SAVE_PC, PC
+ | call extern lj_meta_for // (lua_State *L, TValue *base)
+ | mov BASE, L:RB->base
+ | mov RCd, [PC-4] // Re-decode the instruction word preceding PC.
+ | movzx RAd, RCH // Operand A = second byte.
+ | movzx OP, RCL // Opcode = low byte.
+ | shr RCd, 16 // Remaining upper 16 bits stay in RCd.
+ | jmp aword [DISPATCH+OP*8+GG_DISP2STATIC] // Retry FORI or JFORI.
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Fast functions -----------------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |.macro .ffunc, name // Emits only the global label ->ff_<name>; no argument check.
+ |->ff_ .. name:
+ |.endmacro
+ |
+ |.macro .ffunc_1, name // Label plus check for at least 1 argument.
+ |->ff_ .. name:
+ | cmp NARGS:RDd, 1+1; jb ->fff_fallback // Flags stay valid for the caller (e.g. je = exactly 1 arg).
+ |.endmacro
+ |
+ |.macro .ffunc_2, name // Label plus check for at least 2 arguments.
+ |->ff_ .. name:
+ | cmp NARGS:RDd, 2+1; jb ->fff_fallback
+ |.endmacro
+ |
+ |.macro .ffunc_n, name, op // 1 number argument, loaded into xmm0 using instruction op.
+ | .ffunc_1 name
+ | checknumtp [BASE], ->fff_fallback
+ | op xmm0, qword [BASE]
+ |.endmacro
+ |
+ |.macro .ffunc_n, name // Two-argument form defaults op to movsd.
+ | .ffunc_n name, movsd
+ |.endmacro
+ |
+ |.macro .ffunc_nn, name // 2 number arguments, loaded into xmm0 and xmm1.
+ | .ffunc_2 name
+ | checknumtp [BASE], ->fff_fallback
+ | checknumtp [BASE+8], ->fff_fallback
+ | movsd xmm0, qword [BASE]
+ | movsd xmm1, qword [BASE+8]
+ |.endmacro
+ |
+ |// Inlined GC threshold check. Caveat: uses label 1.
+ |.macro ffgccheck
+ | mov RB, [DISPATCH+DISPATCH_GL(gc.total)]
+ | cmp RB, [DISPATCH+DISPATCH_GL(gc.threshold)]
+ | jb >1 // gc.total below gc.threshold: skip the GC step.
+ | call ->fff_gcstep
+ |1:
+ |.endmacro
+ |
+ |//-- Base library: checks -----------------------------------------------
+ |
+ |.ffunc_1 assert // Fast path: first argument is truthy, return all arguments.
+ | mov ITYPE, [BASE]
+ | mov RB, ITYPE
+ | sar ITYPE, 47
+ | cmp ITYPEd, LJ_TISTRUECOND; jae ->fff_fallback // Tag >= LJ_TISTRUECOND: use the fallback.
+ | mov PC, [BASE-8]
+ | mov MULTRES, RDd // Remember nargs+1; it doubles as nresults+1.
+ | mov RB, [BASE] // NOTE(review): RB already holds [BASE] from above; looks redundant.
+ | mov [BASE-16], RB
+ | sub RDd, 2
+ | jz >2 // Only one argument: nothing more to move.
+ | mov RA, BASE
+ |1: // Move the remaining arguments down by two slots.
+ | add RA, 8
+ | mov RB, [RA]
+ | mov [RA-16], RB
+ | sub RDd, 1
+ | jnz <1
+ |2:
+ | mov RDd, MULTRES
+ | jmp ->fff_res_ // MULTRES is already set, so enter after its store.
+ |
+ |.ffunc_1 type // Returns a string taken from the function's own upvalue array.
+ | mov RC, [BASE]
+ | sar RC, 47 // Extract the type tag of the argument.
+ | mov RBd, LJ_TISNUM
+ | cmp RCd, RBd
+ | cmovb RCd, RBd // Clamp tags below LJ_TISNUM (unsigned) to LJ_TISNUM.
+ | not RCd // Inverted tag is the upvalue index.
+ |2:
+ | mov CFUNC:RB, [BASE-16]
+ | cleartp CFUNC:RB
+ | mov STR:RC, [CFUNC:RB+RC*8+((char *)(&((GCfuncC *)0)->upvalue))]
+ | mov PC, [BASE-8]
+ | settp STR:RC, LJ_TSTR
+ | mov [BASE-16], STR:RC
+ | jmp ->fff_res1
+ |
+ |//-- Base library: getters and setters ---------------------------------
+ |
+ |.ffunc_1 getmetatable // Returns mt.__metatable if present and non-nil, else the metatable or nil.
+ | mov TAB:RB, [BASE]
+ | mov PC, [BASE-8]
+ | checktab TAB:RB, >6 // Not a table: handled at 6.
+ |1: // Field metatable must be at same offset for GCtab and GCudata!
+ | mov TAB:RB, TAB:RB->metatable
+ |2:
+ | test TAB:RB, TAB:RB
+ | mov aword [BASE-16], LJ_TNIL // mov does not alter the flags from test.
+ | jz ->fff_res1 // No metatable: result is nil.
+ | settp TAB:RC, TAB:RB, LJ_TTAB
+ | mov [BASE-16], TAB:RC // Store metatable as default result.
+ | mov STR:RC, [DISPATCH+DISPATCH_GL(gcroot)+8*(GCROOT_MMNAME+MM_metatable)]
+ | mov RAd, TAB:RB->hmask
+ | and RAd, STR:RC->hash // Hash slot index = hmask & hash of the key string.
+ | settp STR:RC, LJ_TSTR // Tag the key so it can be compared against node keys.
+ | imul RAd, #NODE
+ | add NODE:RA, TAB:RB->node
+ |3: // Rearranged logic, because we expect _not_ to find the key.
+ | cmp NODE:RA->key, STR:RC
+ | je >5
+ |4:
+ | mov NODE:RA, NODE:RA->next // Follow the collision chain.
+ | test NODE:RA, NODE:RA
+ | jnz <3
+ | jmp ->fff_res1 // Not found, keep default result.
+ |5:
+ | mov RB, NODE:RA->val
+ | cmp RB, LJ_TNIL; je ->fff_res1 // Ditto for nil value.
+ | mov [BASE-16], RB // Return value of mt.__metatable.
+ | jmp ->fff_res1
+ |
+ |6: // Non-table argument; ITYPE is presumably left set by checktab.
+ | cmp ITYPEd, LJ_TUDATA; je <1
+ | cmp ITYPEd, LJ_TISNUM; ja >7
+ | mov ITYPEd, LJ_TISNUM // Tags <= LJ_TISNUM (unsigned) share the LJ_TISNUM entry.
+ |7:
+ | not ITYPEd
+ | mov TAB:RB, [DISPATCH+ITYPE*8+DISPATCH_GL(gcroot[GCROOT_BASEMT])] // Base metatable for this type.
+ | jmp <2
+ |
+ |.ffunc_2 setmetatable // Inline only for table without metatable and a table as 2nd argument.
+ | mov TAB:RB, [BASE]
+ | mov TAB:TMPR, TAB:RB // Keep the original TValue for the result store below.
+ | checktab TAB:RB, ->fff_fallback
+ | // Fast path: no mt for table yet and not clearing the mt.
+ | cmp aword TAB:RB->metatable, 0; jne ->fff_fallback
+ | mov TAB:RA, [BASE+8]
+ | checktab TAB:RA, ->fff_fallback
+ | mov TAB:RB->metatable, TAB:RA
+ | mov PC, [BASE-8]
+ | mov [BASE-16], TAB:TMPR // Return original table.
+ | test byte TAB:RB->marked, LJ_GC_BLACK // isblack(table)
+ | jz >1
+ | // Possible write barrier. Table is black, but skip iswhite(mt) check.
+ | barrierback TAB:RB, RC
+ |1:
+ | jmp ->fff_res1
+ |
+ |.ffunc_2 rawget // rawget(t, k) via lj_tab_get; falls back if t is not a table.
+ |.if X64WIN
+ | mov TAB:RA, [BASE]
+ | checktab TAB:RA, ->fff_fallback
+ | mov RB, BASE // Save BASE.
+ | lea CARG3, [BASE+8]
+ | mov CARG2, TAB:RA // Caveat: CARG2 == BASE.
+ | mov CARG1, SAVE_L
+ |.else
+ | mov TAB:CARG2, [BASE]
+ | checktab TAB:CARG2, ->fff_fallback
+ | mov RB, BASE // Save BASE.
+ | lea CARG3, [BASE+8] // Caveat: CARG3 == BASE.
+ | mov CARG1, SAVE_L
+ |.endif
+ | call extern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key)
+ | // cTValue * returned in eax (RD).
+ | mov BASE, RB // Restore BASE.
+ | // Copy table slot.
+ | mov RB, [RD]
+ | mov PC, [BASE-8]
+ | mov [BASE-16], RB
+ | jmp ->fff_res1
+ |
+ |//-- Base library: conversions ------------------------------------------
+ |
+ |.ffunc tonumber // Returns the argument unchanged if it is already a number.
+ | // Only handles the number case inline (without a base argument).
+ | cmp NARGS:RDd, 1+1; jne ->fff_fallback // Exactly one argument.
+ | mov RB, [BASE]
+ | checknumber RB, ->fff_fallback
+ | mov PC, [BASE-8]
+ | mov [BASE-16], RB
+ | jmp ->fff_res1
+ |
+ |.ffunc_1 tostring // Strings are returned as-is; numbers are formatted by a C helper.
+ | // Only handles the string or number case inline.
+ | mov PC, [BASE-8]
+ | mov STR:RB, [BASE]
+ | checktp_nc STR:RB, LJ_TSTR, >3
+ | // A __tostring method in the string base metatable is ignored.
+ |2:
+ | mov [BASE-16], STR:RB // Store the (tagged) string as the result.
+ | jmp ->fff_res1
+ |3: // Handle numbers inline, unless a number base metatable is present.
+ | cmp ITYPEd, LJ_TISNUM; ja ->fff_fallback_1
+ | cmp aword [DISPATCH+DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])], 0
+ | jne ->fff_fallback
+ | ffgccheck // Caveat: uses label 1.
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Add frame since C call can throw.
+ | mov SAVE_PC, PC // Redundant (but a defined value).
+ |.if not X64WIN
+ | mov CARG2, BASE // Otherwise: CARG2 == BASE
+ |.endif
+ | mov L:CARG1, L:RB
+ |.if DUALNUM
+ | call extern lj_strfmt_number // (lua_State *L, cTValue *o)
+ |.else
+ | call extern lj_strfmt_num // (lua_State *L, lua_Number *np)
+ |.endif
+ | // GCstr returned in eax (RD).
+ | mov BASE, L:RB->base
+ | settp STR:RB, RD, LJ_TSTR // Tag the returned GCstr and reuse the store at 2.
+ | jmp <2
+ |
+ |//-- Base library: iterators -------------------------------------------
+ |
+ |.ffunc_1 next // next(t [, k]) via lj_tab_next; returns key, value or nil.
+ | je >2 // Missing 2nd arg?
+ |1:
+ |.if X64WIN
+ | mov RA, [BASE]
+ | checktab RA, ->fff_fallback
+ |.else
+ | mov CARG2, [BASE]
+ | checktab CARG2, ->fff_fallback
+ |.endif
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Add frame since C call can throw.
+ | mov L:RB->top, BASE // Dummy frame length is ok.
+ | mov PC, [BASE-8]
+ |.if X64WIN
+ | lea CARG3, [BASE+8]
+ | mov CARG2, RA // Caveat: CARG2 == BASE.
+ | mov CARG1, L:RB
+ |.else
+ | lea CARG3, [BASE+8] // Caveat: CARG3 == BASE.
+ | mov CARG1, L:RB
+ |.endif
+ | mov SAVE_PC, PC // Needed for ITERN fallback.
+ | call extern lj_tab_next // (lua_State *L, GCtab *t, TValue *key)
+ | // Flag returned in eax (RD).
+ | mov BASE, L:RB->base
+ | test RDd, RDd; jz >3 // End of traversal?
+ | // Copy key and value to results.
+ | mov RB, [BASE+8] // Key and value are read back from the slots at BASE+8/BASE+16.
+ | mov RD, [BASE+16]
+ | mov [BASE-16], RB
+ | mov [BASE-8], RD
+ |->fff_res2: // Shared exit: return 2 results.
+ | mov RDd, 1+2
+ | jmp ->fff_res
+ |2: // Set missing 2nd arg to nil.
+ | mov aword [BASE+8], LJ_TNIL
+ | jmp <1
+ |3: // End of traversal: return nil.
+ | mov aword [BASE-16], LJ_TNIL
+ | jmp ->fff_res1
+ |
+ |.ffunc_1 pairs // Returns upvalue[0] of this function, the table and nil.
+ | mov TAB:RB, [BASE]
+ | mov TMPR, TAB:RB // Keep the original TValue for the result store below.
+ | checktab TAB:RB, ->fff_fallback
+#if LJ_52
+ | cmp aword TAB:RB->metatable, 0; jne ->fff_fallback
+#endif
+ | mov CFUNC:RD, [BASE-16]
+ | cleartp CFUNC:RD
+ | mov CFUNC:RD, CFUNC:RD->upvalue[0]
+ | settp CFUNC:RD, LJ_TFUNC
+ | mov PC, [BASE-8]
+ | mov [BASE-16], CFUNC:RD // Result 1: function from upvalue[0].
+ | mov [BASE-8], TMPR // Result 2: the table.
+ | mov aword [BASE], LJ_TNIL // Result 3: nil.
+ | mov RDd, 1+3
+ | jmp ->fff_res
+ |
+ |.ffunc_2 ipairs_aux // Iterator step: returns i+1, t[i+1] or nothing when t[i+1] is nil.
+ | mov TAB:RB, [BASE]
+ | checktab TAB:RB, ->fff_fallback
+ |.if DUALNUM
+ | mov RA, [BASE+8]
+ | checkint RA, ->fff_fallback
+ |.else
+ | checknumtp [BASE+8], ->fff_fallback
+ | movsd xmm0, qword [BASE+8]
+ |.endif
+ | mov PC, [BASE-8]
+ |.if DUALNUM
+ | add RAd, 1
+ | setint ITYPE, RA
+ | mov [BASE-16], ITYPE // Result 1: incremented index.
+ |.else
+ | sseconst_1 xmm1, TMPR
+ | addsd xmm0, xmm1
+ | cvttsd2si RAd, xmm0 // Integer copy of the incremented index for the lookup.
+ | movsd qword [BASE-16], xmm0 // Result 1: incremented index.
+ |.endif
+ | cmp RAd, TAB:RB->asize; jae >2 // Not in array part?
+ | mov RD, TAB:RB->array
+ | lea RD, [RD+RA*8]
+ |1:
+ | cmp aword [RD], LJ_TNIL; je ->fff_res0
+ | // Copy array slot.
+ | mov RB, [RD]
+ | mov [BASE-8], RB
+ | jmp ->fff_res2
+ |2: // Check for empty hash part first. Otherwise call C function.
+ | cmp dword TAB:RB->hmask, 0; je ->fff_res0
+ |.if X64WIN
+ | mov TMPR, BASE
+ | mov CARG2d, RAd
+ | mov CARG1, TAB:RB
+ | mov RB, TMPR // RB = saved BASE (CARG2 clobbered it above).
+ |.else
+ | mov CARG1, TAB:RB
+ | mov RB, BASE // Save BASE.
+ | mov CARG2d, RAd // Caveat: CARG2 == BASE
+ |.endif
+ | call extern lj_tab_getinth // (GCtab *t, int32_t key)
+ | // cTValue * or NULL returned in eax (RD).
+ | mov BASE, RB
+ | test RD, RD
+ | jnz <1 // Found a slot: check it for nil and copy it at 1.
+ |->fff_res0: // Shared exit: return no results.
+ | mov RDd, 1+0
+ | jmp ->fff_res
+ |
+ |.ffunc_1 ipairs // Returns upvalue[0] of this function, the table and 0.
+ | mov TAB:RB, [BASE]
+ | mov TMPR, TAB:RB // Keep the original TValue for the result store below.
+ | checktab TAB:RB, ->fff_fallback
+#if LJ_52
+ | cmp aword TAB:RB->metatable, 0; jne ->fff_fallback
+#endif
+ | mov CFUNC:RD, [BASE-16]
+ | cleartp CFUNC:RD
+ | mov CFUNC:RD, CFUNC:RD->upvalue[0]
+ | settp CFUNC:RD, LJ_TFUNC
+ | mov PC, [BASE-8]
+ | mov [BASE-16], CFUNC:RD // Result 1: function from upvalue[0].
+ | mov [BASE-8], TMPR // Result 2: the table.
+ |.if DUALNUM
+ | mov64 RD, ((int64_t)LJ_TISNUM<<47) // Integer 0 with the LJ_TISNUM tag.
+ | mov [BASE], RD
+ |.else
+ | mov qword [BASE], 0 // All-zero bits = double 0.0.
+ |.endif
+ | mov RDd, 1+3
+ | jmp ->fff_res
+ |
+ |//-- Base library: catch errors ----------------------------------------
+ |
+ |.ffunc_1 pcall // pcall(f, ...): shift the arguments up one slot and call f in a FRAME_PCALL frame.
+ | // The copy loop at 2: writes up to [BASE+nargs*8], i.e. one slot above the
+ | // last argument. Make sure the end of that slot does not exceed
+ | // L->maxstack before touching it; otherwise take the generic fallback.
+ | // NOTE(review): fff_fallback is expected to grow the stack and retry -- confirm.
+ | mov L:RB, SAVE_L
+ | lea RA, [BASE+NARGS:RD*8] // End of the highest slot written below.
+ | cmp RA, L:RB->maxstack; ja ->fff_fallback
+ | lea RA, [BASE+16] // RA = new base for the called function.
+ | sub NARGS:RDd, 1 // The function itself is no longer an argument.
+ | mov PCd, 16+FRAME_PCALL // Frame link: delta of 16 bytes plus the pcall frame type.
+ |1: // Shared with xpcall: RA = new base, RD = nargs+1, PC = frame link.
+ | movzx RBd, byte [DISPATCH+DISPATCH_GL(hookmask)]
+ | shr RB, HOOK_ACTIVE_SHIFT
+ | and RB, 1
+ | add PC, RB // Remember active hook before pcall.
+ | // Note: this does a (harmless) copy of the function to the PC slot, too.
+ | mov KBASE, RD
+ |2: // Move slots up by 8 bytes, highest slot first.
+ | mov RB, [RA+KBASE*8-24]
+ | mov [RA+KBASE*8-16], RB
+ | sub KBASE, 1
+ | ja <2
+ | jmp ->vm_call_dispatch
+ |
+ |.ffunc_2 xpcall // xpcall(f, handler, ...): swap f and handler, then share the pcall setup.
+ | // The shared copy loop (label 2 in pcall) writes up to [BASE+nargs*8], one
+ | // slot above the last argument. Check that against L->maxstack first and
+ | // take the generic fallback if it does not fit.
+ | // NOTE(review): fff_fallback is expected to grow the stack and retry -- confirm.
+ | mov L:RB, SAVE_L
+ | lea RA, [BASE+NARGS:RD*8] // End of the highest slot written by the copy loop.
+ | cmp RA, L:RB->maxstack; ja ->fff_fallback
+ | mov LFUNC:RA, [BASE+8]
+ | checktp_nc LFUNC:RA, LJ_TFUNC, ->fff_fallback // 2nd argument must be a function.
+ | mov LFUNC:RB, [BASE] // Swap function and traceback.
+ | mov [BASE], LFUNC:RA
+ | mov [BASE+8], LFUNC:RB
+ | lea RA, [BASE+24] // RA = new base for the called function.
+ | sub NARGS:RDd, 2 // Function and handler are no longer arguments.
+ | mov PCd, 24+FRAME_PCALL // Frame link: delta of 24 bytes plus the pcall frame type.
+ | jmp <1 // Continue at label 1 of pcall.
+ |
+ |//-- Coroutine library --------------------------------------------------
+ |
+ |.macro coroutine_resume_wrap, resume // resume=1: coroutine.resume, resume=0: wrap helper.
+ |.if resume
+ |.ffunc_1 coroutine_resume
+ | mov L:RB, [BASE] // Coroutine is the first argument (type checked below).
+ | cleartp L:RB
+ |.else
+ |.ffunc coroutine_wrap_aux
+ | mov CFUNC:RB, [BASE-16]
+ | cleartp CFUNC:RB
+ | mov L:RB, CFUNC:RB->upvalue[0].gcr // Coroutine is upvalue[0] of the wrapper.
+ | cleartp L:RB
+ |.endif
+ | mov PC, [BASE-8]
+ | mov SAVE_PC, PC
+ | mov TMP1, L:RB // TMP1 = coroutine state, reloaded after the calls below.
+ |.if resume
+ | checktptp [BASE], LJ_TTHREAD, ->fff_fallback
+ |.endif
+ | cmp aword L:RB->cframe, 0; jne ->fff_fallback
+ | cmp byte L:RB->status, LUA_YIELD; ja ->fff_fallback
+ | mov RA, L:RB->top
+ | je >1 // Status != LUA_YIELD (i.e. 0)?
+ | cmp RA, L:RB->base // Check for presence of initial func.
+ | je ->fff_fallback
+ | mov PC, [RA-8] // Move initial function up.
+ | mov [RA], PC
+ | add RA, 8
+ |1:
+ |.if resume
+ | lea PC, [RA+NARGS:RD*8-16] // Check stack space (-1-thread).
+ |.else
+ | lea PC, [RA+NARGS:RD*8-8] // Check stack space (-1).
+ |.endif
+ | cmp PC, L:RB->maxstack; ja ->fff_fallback
+ | mov L:RB->top, PC // New top of the coroutine stack after the arguments.
+ |
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ |.if resume
+ | add BASE, 8 // Keep resumed thread in stack for GC.
+ |.endif
+ | mov L:RB->top, BASE
+ |.if resume
+ | lea RB, [BASE+NARGS:RD*8-24] // RB = end of source for stack move.
+ |.else
+ | lea RB, [BASE+NARGS:RD*8-16] // RB = end of source for stack move.
+ |.endif
+ | sub RB, PC // Relative to PC.
+ |
+ | cmp PC, RA
+ | je >3 // No arguments to move.
+ |2: // Move args to coroutine.
+ | mov RC, [PC+RB]
+ | mov [PC-8], RC
+ | sub PC, 8
+ | cmp PC, RA
+ | jne <2
+ |3:
+ | mov CARG2, RA
+ | mov CARG1, TMP1
+ | call ->vm_resume // (lua_State *L, TValue *base, 0, 0)
+ |
+ | mov L:RB, SAVE_L
+ | mov L:PC, TMP1 // PC temporarily holds the coroutine state.
+ | mov BASE, L:RB->base
+ | mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+ | set_vmstate INTERP
+ |
+ | cmp eax, LUA_YIELD
+ | ja >8 // Status above LUA_YIELD: error path.
+ |4:
+ | mov RA, L:PC->base
+ | mov KBASE, L:PC->top
+ | mov L:PC->top, RA // Clear coroutine stack.
+ | mov PC, KBASE
+ | sub PC, RA // PC = size of the results in bytes.
+ | je >6 // No results?
+ | lea RD, [BASE+PC]
+ | shr PCd, 3 // Bytes -> number of results.
+ | cmp RD, L:RB->maxstack
+ | ja >9 // Need to grow stack?
+ |
+ | mov RB, BASE
+ | sub RB, RA // RB = offset from coroutine slots to our slots.
+ |5: // Move results from coroutine.
+ | mov RD, [RA]
+ | mov [RA+RB], RD
+ | add RA, 8
+ | cmp RA, KBASE
+ | jne <5
+ |6:
+ |.if resume
+ | lea RDd, [PCd+2] // nresults+1 = 1 + true + results.
+ | mov_true ITYPE // Prepend true to results.
+ | mov [BASE-8], ITYPE
+ |.else
+ | lea RDd, [PCd+1] // nresults+1 = 1 + results.
+ |.endif
+ |7: // Common return: RD = nresults+1.
+ | mov PC, SAVE_PC
+ | mov MULTRES, RDd
+ |.if resume
+ | mov RA, -8 // Results start at BASE-8 (the prepended status).
+ |.else
+ | xor RAd, RAd // Results start at BASE.
+ |.endif
+ | test PCd, FRAME_TYPE
+ | jz ->BC_RET_Z // No frame type bits set: return via BC_RET_Z.
+ | jmp ->vm_return
+ |
+ |8: // Coroutine returned with error (at co->top-1).
+ |.if resume
+ | mov_false ITYPE // Prepend false to results.
+ | mov [BASE-8], ITYPE
+ | mov RA, L:PC->top
+ | sub RA, 8
+ | mov L:PC->top, RA // Clear error from coroutine stack.
+ | // Copy error message.
+ | mov RD, [RA]
+ | mov [BASE], RD
+ | mov RDd, 1+2 // nresults+1 = 1 + false + error.
+ | jmp <7
+ |.else
+ | mov CARG2, L:PC
+ | mov CARG1, L:RB
+ | call extern lj_ffh_coroutine_wrap_err // (lua_State *L, lua_State *co)
+ | // Error function does not return.
+ |.endif
+ |
+ |9: // Handle stack expansion on return from yield.
+ | mov L:RA, TMP1
+ | mov L:RA->top, KBASE // Undo coroutine stack clearing.
+ | mov CARG2, PC // PC = number of results at this point (see shr above).
+ | mov CARG1, L:RB
+ | call extern lj_state_growstack // (lua_State *L, int n)
+ | mov L:PC, TMP1
+ | mov BASE, L:RB->base
+ | jmp <4 // Retry the stack move.
+ |.endmacro
+ |
+ | coroutine_resume_wrap 1 // coroutine.resume
+ | coroutine_resume_wrap 0 // coroutine.wrap
+ |
+ |.ffunc coroutine_yield // Inline yield, only when CFRAME_RESUME is set in L->cframe.
+ | mov L:RB, SAVE_L
+ | test aword L:RB->cframe, CFRAME_RESUME
+ | jz ->fff_fallback
+ | mov L:RB->base, BASE
+ | lea RD, [BASE+NARGS:RD*8-8]
+ | mov L:RB->top, RD // top = BASE + nargs slots.
+ | xor RDd, RDd
+ | mov aword L:RB->cframe, RD // Clear L->cframe.
+ | mov al, LUA_YIELD // al is the low byte of RD/eax: status doubles as return value.
+ | mov byte L:RB->status, al
+ | jmp ->vm_leave_unw
+ |
+ |//-- Math library -------------------------------------------------------
+ |
+ | .ffunc_1 math_abs // Also defines the shared exits fff_resbit/fff_resi/fff_resRB when DUALNUM.
+ | mov RB, [BASE]
+ |.if DUALNUM
+ | checkint RB, >3
+ | cmp RBd, 0; jns ->fff_resi // Non-negative integer: return as-is.
+ | neg RBd; js >2 // Still negative after neg: overflow case handled at 2.
+ |->fff_resbit:
+ |->fff_resi: // Return the integer in RBd.
+ | setint RB
+ |->fff_resRB: // Return the TValue in RB.
+ | mov PC, [BASE-8]
+ | mov [BASE-16], RB
+ | jmp ->fff_res1
+ |2:
+ | mov64 RB, U64x(41e00000,00000000) // 2^31.
+ | jmp ->fff_resRB
+ |3:
+ | ja ->fff_fallback
+ |.else
+ | checknum RB, ->fff_fallback
+ |.endif
+ | shl RB, 1 // Shift out the top (sign) bit ...
+ | shr RB, 1 // ... and shift a zero back in.
+ | mov PC, [BASE-8]
+ | mov [BASE-16], RB
+ | jmp ->fff_res1
+ |
+ |.ffunc_n math_sqrt, sqrtsd // The macro's load instruction is sqrtsd, so xmm0 = sqrt(arg).
+ |->fff_resxmm0: // Shared exit: return the double in xmm0.
+ | mov PC, [BASE-8]
+ | movsd qword [BASE-16], xmm0
+ | // fallthrough
+ |
+ |->fff_res1: // Shared exit: return 1 result stored at [BASE-16].
+ | mov RDd, 1+1
+ |->fff_res: // Shared exit: RD = nresults+1, PC = frame link of this call.
+ | mov MULTRES, RDd
+ |->fff_res_: // Entry for callers that have already set MULTRES.
+ | test PCd, FRAME_TYPE
+ | jnz >7 // Frame type bits set: non-standard return at 7.
+ |5:
+ | cmp PC_RB, RDL // More results expected?
+ | ja >6
+ | // Adjust BASE. KBASE is assumed to be set for the calling frame.
+ | movzx RAd, PC_RA
+ | neg RA
+ | lea BASE, [BASE+RA*8-16] // base = base - (RA+2)*8
+ | ins_next
+ |
+ |6: // Fill up results with nil.
+ | mov aword [BASE+RD*8-24], LJ_TNIL
+ | add RD, 1
+ | jmp <5
+ |
+ |7: // Non-standard return case.
+ | mov RA, -16 // Results start at BASE+RA = BASE-16.
+ | jmp ->vm_return
+ |
+ |.macro math_round, func // Defines ff_math_<func> on top of ->vm_<func>_sse.
+ | .ffunc math_ .. func
+ |.if DUALNUM
+ | mov RB, [BASE]
+ | checknumx RB, ->fff_resRB, je // Presumably: integer argument is returned unchanged -- confirm against checknumx.
+ | ja ->fff_fallback
+ |.else
+ | checknumtp [BASE], ->fff_fallback
+ |.endif
+ | movsd xmm0, qword [BASE]
+ | call ->vm_ .. func .. _sse
+ |.if DUALNUM
+ | cvttsd2si RBd, xmm0
+ | cmp RBd, 0x80000000 // 0x80000000 is what cvttsd2si yields when out of range.
+ | jne ->fff_resi
+ | cvtsi2sd xmm1, RBd
+ | ucomisd xmm0, xmm1 // Disambiguate: result may really be -2^31.
+ | jp ->fff_resxmm0
+ | je ->fff_resi
+ |.endif
+ | jmp ->fff_resxmm0
+ |.endmacro
+ |
+ | math_round floor
+ | math_round ceil
+ |
+ |.ffunc math_log // Inline only for the single-argument form; calls C log().
+ | cmp NARGS:RDd, 1+1; jne ->fff_fallback // Exactly one argument.
+ | checknumtp [BASE], ->fff_fallback
+ | movsd xmm0, qword [BASE]
+ | mov RB, BASE // Save BASE across the C call.
+ | call extern log
+ | mov BASE, RB
+ | jmp ->fff_resxmm0
+ |
+ |.macro math_extern, func // ff_math_<func>: one number argument in xmm0, calls C func.
+ | .ffunc_n math_ .. func
+ | mov RB, BASE // Save BASE across the C call.
+ | call extern func
+ | mov BASE, RB
+ | jmp ->fff_resxmm0
+ |.endmacro
+ |
+ |.macro math_extern2, func // ff_math_<func>: two number arguments in xmm0/xmm1, calls C func.
+ | .ffunc_nn math_ .. func
+ | mov RB, BASE // Save BASE across the C call.
+ | call extern func
+ | mov BASE, RB
+ | jmp ->fff_resxmm0
+ |.endmacro
+ |
+ | math_extern log10
+ | math_extern exp
+ | math_extern sin
+ | math_extern cos
+ | math_extern tan
+ | math_extern asin
+ | math_extern acos
+ | math_extern atan
+ | math_extern sinh
+ | math_extern cosh
+ | math_extern tanh
+ | math_extern2 pow
+ | math_extern2 atan2
+ | math_extern2 fmod
+ |
+ |.ffunc_2 math_ldexp // Computed on the x87 stack with fscale.
+ | checknumtp [BASE], ->fff_fallback
+ | checknumtp [BASE+8], ->fff_fallback
+ | fld qword [BASE+8] // Exponent argument is pushed first ...
+ | fld qword [BASE] // ... then the value, which ends up on top.
+ | fscale
+ | fpop1
+ | mov PC, [BASE-8]
+ | fstp qword [BASE-16] // Store the result and pop the x87 stack.
+ | jmp ->fff_res1
+ |
+ |.ffunc_n math_frexp // math.frexp(x): returns mantissa and exponent via C frexp(double, int *).
+ | mov RB, BASE // Save BASE first: on X64WIN the CARG2 write below clobbers it.
+ |.if X64WIN
+ | // The int * is the second positional argument, so it goes in CARG2 here.
+ | lea CARG2, TMP1 // Caveat: CARG2 == BASE
+ |.else
+ | lea CARG1, TMP1 // First integer-class argument; x is already in xmm0.
+ |.endif
+ | call extern frexp
+ | mov BASE, RB // Restore BASE.
+ | mov RBd, TMP1d // Exponent written by frexp.
+ | mov PC, [BASE-8]
+ | movsd qword [BASE-16], xmm0 // Result 1: mantissa.
+ |.if DUALNUM
+ | setint RB
+ | mov [BASE-8], RB // Result 2: exponent as integer.
+ |.else
+ | cvtsi2sd xmm1, RBd
+ | movsd qword [BASE-8], xmm1 // Result 2: exponent as double.
+ |.endif
+ | mov RDd, 1+2
+ | jmp ->fff_res
+ |
+ |.ffunc_n math_modf // math.modf(x): returns integral and fractional part via C modf(double, double *).
+ | mov RB, BASE // Save BASE first: on X64WIN the CARG2 write below clobbers it.
+ |.if X64WIN
+ | // The double * is the second positional argument, so it goes in CARG2 here.
+ | lea CARG2, [BASE-16] // Caveat: CARG2 == BASE
+ |.else
+ | lea CARG1, [BASE-16] // First integer-class argument; x is already in xmm0.
+ |.endif
+ | call extern modf // Writes the integral part straight into result slot 1.
+ | mov BASE, RB // Restore BASE.
+ | mov PC, [BASE-8] // Load PC before the slot is overwritten with result 2.
+ | movsd qword [BASE-8], xmm0 // Result 2: fractional part.
+ | mov RDd, 1+2
+ | jmp ->fff_res
+ |
+ |.macro math_minmax, name, cmovop, sseop // cmovop: integer select, sseop: double min/max instruction.
+ | .ffunc name
+ | mov RAd, 2 // RA = index+1 of the next argument to fold in.
+ |.if DUALNUM
+ | mov RB, [BASE]
+ | checkint RB, >4
+ |1: // Handle integers.
+ | cmp RAd, RDd; jae ->fff_resRB // All arguments consumed: return RB.
+ | mov TMPR, [BASE+RA*8-8]
+ | checkint TMPR, >3
+ | cmp RBd, TMPRd
+ | cmovop RB, TMPR
+ | add RAd, 1
+ | jmp <1
+ |3:
+ | ja ->fff_fallback
+ | // Convert intermediate result to number and continue below.
+ | cvtsi2sd xmm0, RBd
+ | jmp >6
+ |4:
+ | ja ->fff_fallback
+ |.else
+ | checknumtp [BASE], ->fff_fallback
+ |.endif
+ |
+ | movsd xmm0, qword [BASE]
+ |5: // Handle numbers or integers.
+ | cmp RAd, RDd; jae ->fff_resxmm0 // All arguments consumed: return xmm0.
+ |.if DUALNUM
+ | mov RB, [BASE+RA*8-8]
+ | checknumx RB, >6, jb
+ | ja ->fff_fallback
+ | cvtsi2sd xmm1, RBd // Integer argument: convert to double.
+ | jmp >7
+ |.else
+ | checknumtp [BASE+RA*8-8], ->fff_fallback
+ |.endif
+ |6:
+ | movsd xmm1, qword [BASE+RA*8-8]
+ |7:
+ | sseop xmm0, xmm1
+ | add RAd, 1
+ | jmp <5
+ |.endmacro
+ |
+ | math_minmax math_min, cmovg, minsd
+ | math_minmax math_max, cmovl, maxsd
+ |
+ |//-- String library -----------------------------------------------------
+ |
+ |.ffunc string_byte // Only handle the 1-arg case here.
+ | cmp NARGS:RDd, 1+1; jne ->fff_fallback
+ | mov STR:RB, [BASE]
+ | checkstr STR:RB, ->fff_fallback
+ | mov PC, [BASE-8]
+ | cmp dword STR:RB->len, 1
+ | jb ->fff_res0 // Return no results for empty string.
+ | movzx RBd, byte STR:RB[1] // First byte stored after the string header.
+ |.if DUALNUM
+ | jmp ->fff_resi
+ |.else
+ | cvtsi2sd xmm0, RBd; jmp ->fff_resxmm0
+ |.endif
+ |
+ |.ffunc string_char // Only handle the 1-arg case here.
+ | ffgccheck
+ | cmp NARGS:RDd, 1+1; jne ->fff_fallback // *Exactly* 1 arg.
+ |.if DUALNUM
+ | mov RB, [BASE]
+ | checkint RB, ->fff_fallback
+ |.else
+ | checknumtp [BASE], ->fff_fallback
+ | cvttsd2si RBd, qword [BASE]
+ |.endif
+ | cmp RBd, 255; ja ->fff_fallback // Unsigned compare: also rejects negative values.
+ | mov TMP1d, RBd
+ | mov TMPRd, 1 // Length of the new string.
+ | lea RD, TMP1 // Points to stack. Little-endian.
+ |->fff_newstr: // Shared: create a string from RD = char *, TMPRd = length.
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov CARG3d, TMPRd // Zero-extended to size_t.
+ | mov CARG2, RD
+ | mov CARG1, L:RB
+ | mov SAVE_PC, PC
+ | call extern lj_str_new // (lua_State *L, char *str, size_t l)
+ |->fff_resstr: // Shared: return the GCstr * in RD; expects L in RB.
+ | // GCstr * returned in eax (RD).
+ | mov BASE, L:RB->base
+ | mov PC, [BASE-8]
+ | settp STR:RD, LJ_TSTR
+ | mov [BASE-16], STR:RD
+ | jmp ->fff_res1
+ |
+ |.ffunc string_sub // string.sub(s, start [, end]) for integer-convertible arguments.
+ | ffgccheck
+ | mov TMPRd, -1 // Default end = -1.
+ | cmp NARGS:RDd, 1+2; jb ->fff_fallback
+ | jna >1 // Exactly 2 args: keep the default end.
+ |.if DUALNUM
+ | mov TMPR, [BASE+16]
+ | checkint TMPR, ->fff_fallback
+ |.else
+ | checknumtp [BASE+16], ->fff_fallback
+ | cvttsd2si TMPRd, qword [BASE+16]
+ |.endif
+ |1:
+ | mov STR:RB, [BASE]
+ | checkstr STR:RB, ->fff_fallback
+ |.if DUALNUM
+ | mov ITYPE, [BASE+8]
+ | mov RAd, ITYPEd // Must clear hiword for lea below.
+ | sar ITYPE, 47
+ | cmp ITYPEd, LJ_TISNUM
+ | jne ->fff_fallback
+ |.else
+ | checknumtp [BASE+8], ->fff_fallback
+ | cvttsd2si RAd, qword [BASE+8]
+ |.endif
+ | mov RCd, STR:RB->len
+ | cmp RCd, TMPRd // len < end? (unsigned compare)
+ | jb >5
+ |2:
+ | test RAd, RAd // start <= 0?
+ | jle >7
+ |3:
+ | sub TMPRd, RAd // start > end?
+ | jl ->fff_emptystr
+ | lea RD, [STR:RB+RAd+#STR-1] // RD = pointer to character number start.
+ | add TMPRd, 1 // Length = end - start + 1.
+ |4:
+ | jmp ->fff_newstr
+ |
+ |5: // Negative end or overflow.
+ | jl >6 // Uses the flags of the cmp above, now read as a signed compare.
+ | lea TMPRd, [TMPRd+RCd+1] // end = end+(len+1)
+ | jmp <2
+ |6: // Overflow.
+ | mov TMPRd, RCd // end = len
+ | jmp <2
+ |
+ |7: // Negative start or underflow.
+ | je >8
+ | add RAd, RCd // start = start+(len+1)
+ | add RAd, 1
+ | jg <3 // start > 0?
+ |8: // Underflow.
+ | mov RAd, 1 // start = 1
+ | jmp <3
+ |
+ |->fff_emptystr: // Range underflow.
+ | xor TMPRd, TMPRd // Zero length. Any ptr in RD is ok.
+ | jmp <4
+ |
+ |.macro ffstring_op, name // ff_string_<name>: runs lj_buf_putstr_<name> on g->tmpbuf, then lj_buf_tostr.
+ | .ffunc_1 string_ .. name
+ | ffgccheck
+ |.if X64WIN
+ | mov STR:TMPR, [BASE]
+ | checkstr STR:TMPR, ->fff_fallback
+ |.else
+ | mov STR:CARG2, [BASE]
+ | checkstr STR:CARG2, ->fff_fallback
+ |.endif
+ | mov L:RB, SAVE_L
+ | lea SBUF:CARG1, [DISPATCH+DISPATCH_GL(tmpbuf)]
+ | mov L:RB->base, BASE
+ |.if X64WIN
+ | mov STR:CARG2, STR:TMPR // Caveat: CARG2 == BASE
+ |.endif
+ | mov RC, SBUF:CARG1->b
+ | mov SBUF:CARG1->L, L:RB
+ | mov SBUF:CARG1->p, RC // Reset the buffer: p = b.
+ | mov SAVE_PC, PC
+ | call extern lj_buf_putstr_ .. name
+ | mov CARG1, rax // Pass the returned value on as the only argument.
+ | call extern lj_buf_tostr
+ | jmp ->fff_resstr // RB still holds L, as fff_resstr requires.
+ |.endmacro
+ |
+ |ffstring_op reverse
+ |ffstring_op lower
+ |ffstring_op upper
+ |
+ |//-- Bit library --------------------------------------------------------
+ |
+ |.macro .ffunc_bit, name, kind, fdef // Leaves the 1st argument as a 32 bit integer in RBd at label 2.
+ | fdef name
+ |.if kind == 2
+ | sseconst_tobit xmm1, RB // kind 2: load the conversion constant up front, before RB is used.
+ |.endif
+ |.if DUALNUM
+ | mov RB, [BASE]
+ | checkint RB, >1
+ |.if kind > 0
+ | jmp >2
+ |.else
+ | jmp ->fff_resbit // kind 0: an integer argument is already the result.
+ |.endif
+ |1:
+ | ja ->fff_fallback
+ | movd xmm0, RB
+ |.else
+ | checknumtp [BASE], ->fff_fallback
+ | movsd xmm0, qword [BASE]
+ |.endif
+ |.if kind < 2
+ | sseconst_tobit xmm1, RB
+ |.endif
+ | addsd xmm0, xmm1 // Adding the constant leaves the integer in the low 32 bits.
+ | movd RBd, xmm0
+ |2:
+ |.endmacro
+ |
+ |.macro .ffunc_bit, name, kind // Two-argument form defaults fdef to .ffunc_1.
+ | .ffunc_bit name, kind, .ffunc_1
+ |.endmacro
+ |
+ |.ffunc_bit bit_tobit, 0
+ | jmp ->fff_resbit
+ |
+ |.macro .ffunc_bit_op, name, ins // Variadic bit op: folds every argument into RBd with 'ins', walking from the last argument down.
+ | .ffunc_bit name, 2
+ | mov TMPRd, NARGS:RDd // Save for fallback.
+ | lea RD, [BASE+NARGS:RD*8-16] // RD = address of the last argument slot (NARGS:RD is nargs+1).
+ |1:
+ | cmp RD, BASE
+ | jbe ->fff_resbit // Reached arg 1, which is already in RBd: done.
+ |.if DUALNUM
+ | mov RA, [RD]
+ | checkint RA, >2
+ | ins RBd, RAd
+ | sub RD, 8 // Step down to the previous argument.
+ | jmp <1
+ |2:
+ | ja ->fff_fallback_bit_op // Presumably a non-number: fall back, which restores nargs from TMPRd.
+ | movd xmm0, RA
+ |.else
+ | checknumtp [RD], ->fff_fallback_bit_op
+ | movsd xmm0, qword [RD]
+ |.endif
+ | addsd xmm0, xmm1 // Same conversion as in .ffunc_bit; xmm1 was loaded there for kind 2.
+ | movd RAd, xmm0
+ | ins RBd, RAd
+ | sub RD, 8
+ | jmp <1
+ |.endmacro
+ |
+ |.ffunc_bit_op bit_band, and
+ |.ffunc_bit_op bit_bor, or
+ |.ffunc_bit_op bit_bxor, xor
+ |
+ |.ffunc_bit bit_bswap, 1
+ | bswap RBd
+ | jmp ->fff_resbit
+ |
+ |.ffunc_bit bit_bnot, 1
+ | not RBd
+ |.if DUALNUM
+ | jmp ->fff_resbit
+ |.else
+ |->fff_resbit:
+ | cvtsi2sd xmm0, RBd
+ | jmp ->fff_resxmm0
+ |.endif
+ |
+ |->fff_fallback_bit_op:
+ | mov NARGS:RDd, TMPRd // Restore for fallback
+ | jmp ->fff_fallback
+ |
+ |.macro .ffunc_bit_sh, name, ins // Shift/rotate fast function: applies 'ins' to arg 1 (RBd) with the count taken from arg 2 in cl.
+ |.if DUALNUM
+ | .ffunc_bit name, 1, .ffunc_2
+ | // Note: no inline conversion from number for 2nd argument!
+ | mov RA, [BASE+8]
+ | checkint RA, ->fff_fallback
+ |.else
+ | .ffunc_nn name
+ | sseconst_tobit xmm2, RB
+ | addsd xmm0, xmm2 // Convert both arguments (expected in xmm0/xmm1 after .ffunc_nn) with the tobit constant.
+ | addsd xmm1, xmm2
+ | movd RBd, xmm0
+ | movd RAd, xmm1
+ |.endif
+ | ins RBd, cl // Assumes RA is ecx.
+ | jmp ->fff_resbit
+ |.endmacro
+ |
+ |.ffunc_bit_sh bit_lshift, shl
+ |.ffunc_bit_sh bit_rshift, shr
+ |.ffunc_bit_sh bit_arshift, sar
+ |.ffunc_bit_sh bit_rol, rol
+ |.ffunc_bit_sh bit_ror, ror
+ |
+ |//-----------------------------------------------------------------------
+ |
+ |->fff_fallback_2: // Entry that forces nargs+1 = 3 before joining fff_fallback.
+ | mov NARGS:RDd, 1+2 // Other args are ignored, anyway.
+ | jmp ->fff_fallback
+ |->fff_fallback_1: // Entry that forces nargs+1 = 2; falls through into fff_fallback.
+ | mov NARGS:RDd, 1+1 // Other args are ignored, anyway.
+ |->fff_fallback: // Call fast function fallback handler.
+ | // BASE = new base, RD = nargs+1
+ | mov L:RB, SAVE_L
+ | mov PC, [BASE-8] // Fallback may overwrite PC.
+ | mov SAVE_PC, PC // Redundant (but a defined value).
+ | mov L:RB->base, BASE
+ | lea RD, [BASE+NARGS:RD*8-8] // RD = BASE + nargs*8; stored as L->top below.
+ | lea RA, [RD+8*LUA_MINSTACK] // Ensure enough space for handler.
+ | mov L:RB->top, RD
+ | mov CFUNC:RD, [BASE-16] // Function value of this frame; its tag is cleared on the next line.
+ | cleartp CFUNC:RD
+ | cmp RA, L:RB->maxstack
+ | ja >5 // Need to grow stack.
+ | mov CARG1, L:RB
+ | call aword CFUNC:RD->f // (lua_State *L)
+ | mov BASE, L:RB->base
+ | // Either throws an error, or recovers and returns -1, 0 or nresults+1.
+ | test RDd, RDd; jg ->fff_res // Returned nresults+1?
+ |1:
+ | mov RA, L:RB->top
+ | sub RA, BASE
+ | shr RAd, 3
+ | test RDd, RDd // Flags set here survive the lea/mov below and feed the jne.
+ | lea NARGS:RDd, [RAd+1] // nargs+1 = (top - base)/8 + 1.
+ | mov LFUNC:RB, [BASE-16]
+ | jne ->vm_call_tail // Returned -1?
+ | cleartp LFUNC:RB
+ | ins_callt // Returned 0: retry fast path.
+ |
+ |// Reconstruct previous base for vmeta_call during tailcall.
+ |->vm_call_tail:
+ | mov RA, BASE
+ | test PCd, FRAME_TYPE
+ | jnz >3 // Frame type bits set in PC: take the delta path at 3.
+ | movzx RBd, PC_RA
+ | neg RB
+ | lea BASE, [BASE+RB*8-16] // base = base - (RB+2)*8
+ | jmp ->vm_call_dispatch // Resolve again for tailcall.
+ |3:
+ | mov RB, PC
+ | and RB, -8 // Drop the low 3 bits; the remainder is subtracted from BASE.
+ | sub BASE, RB
+ | jmp ->vm_call_dispatch // Resolve again for tailcall.
+ |
+ |5: // Grow stack for fallback handler.
+ | mov CARG2d, LUA_MINSTACK
+ | mov CARG1, L:RB
+ | call extern lj_state_growstack // (lua_State *L, int n)
+ | mov BASE, L:RB->base // Reload BASE from L->base after the call.
+ | xor RDd, RDd // Simulate a return 0.
+ | jmp <1 // Dumb retry (goes through ff first).
+ |
+ |->fff_gcstep: // Call GC step function.
+ | // BASE = new base, RD = nargs+1
+ | pop RB // Must keep stack at same level.
+ | mov TMP1, RB // Save return address
+ | mov L:RB, SAVE_L
+ | mov SAVE_PC, PC // Redundant (but a defined value).
+ | mov L:RB->base, BASE
+ | lea RD, [BASE+NARGS:RD*8-8] // RD = BASE + nargs*8; stored as L->top below.
+ | mov CARG1, L:RB
+ | mov L:RB->top, RD
+ | call extern lj_gc_step // (lua_State *L)
+ | mov BASE, L:RB->base // Reload BASE, then recompute nargs+1 from L->top.
+ | mov RD, L:RB->top
+ | sub RD, BASE
+ | shr RDd, 3
+ | add NARGS:RDd, 1
+ | mov RB, TMP1
+ | push RB // Restore return address.
+ | ret
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Special dispatch targets -------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |->vm_record: // Dispatch target for recording phase.
+ |.if JIT
+ | movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
+ | test RDL, HOOK_VMEVENT // No recording while in vmevent.
+ | jnz >5
+ | // Decrement the hookcount for consistency, but always do the call.
+ | test RDL, HOOK_ACTIVE
+ | jnz >1
+ | test RDL, LUA_MASKLINE|LUA_MASKCOUNT
+ | jz >1
+ | dec dword [DISPATCH+DISPATCH_GL(hookcount)]
+ | jmp >1
+ |.endif
+ |
+ |->vm_rethook: // Dispatch target for return hooks.
+ | movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
+ | test RDL, HOOK_ACTIVE // Hook already active?
+ | jnz >5
+ | jmp >1
+ |
+ |->vm_inshook: // Dispatch target for instr/line hooks.
+ | movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
+ | test RDL, HOOK_ACTIVE // Hook already active?
+ | jnz >5
+ |
+ | test RDL, LUA_MASKLINE|LUA_MASKCOUNT
+ | jz >5
+ | dec dword [DISPATCH+DISPATCH_GL(hookcount)]
+ | jz >1
+ | test RDL, LUA_MASKLINE
+ | jz >5
+ |1:
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov CARG2, PC // Caveat: CARG2 == BASE
+ | mov CARG1, L:RB
+ | // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
+ | call extern lj_dispatch_ins // (lua_State *L, const BCIns *pc)
+ |3:
+ | mov BASE, L:RB->base
+ |4:
+ | movzx RAd, PC_RA
+ |5:
+ | movzx OP, PC_OP
+ | movzx RDd, PC_RD
+ | jmp aword [DISPATCH+OP*8+GG_DISP2STATIC] // Re-dispatch to static ins.
+ |
+ |->cont_hook: // Continue from hook yield.
+ | add PC, 4
+ | mov RA, [RB-40]
+ | mov MULTRES, RAd // Restore MULTRES for *M ins.
+ | jmp <4
+ |
+ |->vm_hotloop: // Hot loop counter underflow.
+ |.if JIT
+ | mov LFUNC:RB, [BASE-16] // Same as curr_topL(L).
+ | cleartp LFUNC:RB
+ | mov RB, LFUNC:RB->pc
+ | movzx RDd, byte [RB+PC2PROTO(framesize)]
+ | lea RD, [BASE+RD*8]
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov L:RB->top, RD
+ | mov CARG2, PC
+ | lea CARG1, [DISPATCH+GG_DISP2J]
+ | mov aword [DISPATCH+DISPATCH_J(L)], L:RB
+ | mov SAVE_PC, PC
+ | call extern lj_trace_hot // (jit_State *J, const BCIns *pc)
+ | jmp <3
+ |.endif
+ |
+ |->vm_callhook: // Dispatch target for call hooks.
+ | mov SAVE_PC, PC
+ |.if JIT
+ | jmp >1
+ |.endif
+ |
+ |->vm_hotcall: // Hot call counter underflow.
+ |.if JIT
+ | mov SAVE_PC, PC
+ | or PC, 1 // Marker for hot call.
+ |1:
+ |.endif
+ | lea RD, [BASE+NARGS:RD*8-8]
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov L:RB->top, RD
+ | mov CARG2, PC
+ | mov CARG1, L:RB
+ | call extern lj_dispatch_call // (lua_State *L, const BCIns *pc)
+ | // ASMFunction returned in eax/rax (RD).
+ | mov SAVE_PC, 0 // Invalidate for subsequent line hook.
+ |.if JIT
+ | and PC, -2
+ |.endif
+ | mov BASE, L:RB->base
+ | mov RA, RD
+ | mov RD, L:RB->top
+ | sub RD, BASE
+ | mov RB, RA
+ | movzx RAd, PC_RA
+ | shr RDd, 3
+ | add NARGS:RDd, 1
+ | jmp RB
+ |
+ |->cont_stitch: // Trace stitching.
+ |.if JIT
+ | // BASE = base, RC = result, RB = mbase
+ | mov ITYPEd, [RB-24] // Save previous trace number.
+ | mov TMPRd, MULTRES
+ | movzx RAd, PC_RA
+ | lea RA, [BASE+RA*8] // Call base.
+ | sub TMPRd, 1
+ | jz >2
+ |1: // Move results down.
+ | mov RB, [RC]
+ | mov [RA], RB
+ | add RC, 8
+ | add RA, 8
+ | sub TMPRd, 1
+ | jnz <1
+ |2:
+ | movzx RCd, PC_RA
+ | movzx RBd, PC_RB
+ | add RC, RB
+ | lea RC, [BASE+RC*8-8]
+ |3:
+ | cmp RC, RA
+ | ja >9 // More results wanted?
+ |
+ | mov RA, [DISPATCH+DISPATCH_J(trace)]
+ | mov TRACE:RD, [RA+ITYPE*8]
+ | test TRACE:RD, TRACE:RD
+ | jz ->cont_nop
+ | movzx RDd, word TRACE:RD->link
+ | cmp RDd, RBd
+ | je ->cont_nop // Blacklisted.
+ | test RDd, RDd
+ | jne =>BC_JLOOP // Jump to stitched trace.
+ |
+ | // Stitch a new trace to the previous trace.
+ | mov [DISPATCH+DISPATCH_J(exitno)], RB
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov CARG2, PC
+ | lea CARG1, [DISPATCH+GG_DISP2J]
+ | mov aword [DISPATCH+DISPATCH_J(L)], L:RB
+ | call extern lj_dispatch_stitch // (jit_State *J, const BCIns *pc)
+ | mov BASE, L:RB->base
+ | jmp ->cont_nop
+ |
+ |9: // Fill up results with nil.
+ | mov aword [RA], LJ_TNIL
+ | add RA, 8
+ | jmp <3
+ |.endif
+ |
+ |->vm_profhook: // Dispatch target for profiler hook.
+#if LJ_HASPROFILE
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov CARG2, PC // Caveat: CARG2 == BASE
+ | mov CARG1, L:RB
+ | call extern lj_dispatch_profile // (lua_State *L, const BCIns *pc)
+ | mov BASE, L:RB->base
+ | // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction.
+ | sub PC, 4
+ | jmp ->cont_nop
+#endif
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Trace exit handler -------------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |// Called from an exit stub with the exit number on the stack.
+ |// The 16 bit exit number is stored with two (sign-extended) push imm8.
+ |->vm_exit_handler:
+ |.if JIT
+ | push r13; push r12
+ | push r11; push r10; push r9; push r8
+ | push rdi; push rsi; push rbp; lea rbp, [rsp+88]; push rbp
+ | push rbx; push rdx; push rcx; push rax
+ | movzx RCd, byte [rbp-8] // Reconstruct exit number.
+ | mov RCH, byte [rbp-16]
+ | mov [rbp-8], r15; mov [rbp-16], r14
+ | // Caveat: DISPATCH is rbx.
+ | mov DISPATCH, [ebp]
+ | mov RA, [DISPATCH+DISPATCH_GL(vmstate)] // Get trace number.
+ | set_vmstate EXIT
+ | mov [DISPATCH+DISPATCH_J(exitno)], RC
+ | mov [DISPATCH+DISPATCH_J(parent)], RA
+ |.if X64WIN
+ | sub rsp, 16*8+4*8 // Room for SSE regs + save area.
+ |.else
+ | sub rsp, 16*8 // Room for SSE regs.
+ |.endif
+ | add rbp, -128
+ | movsd qword [rbp-8], xmm15; movsd qword [rbp-16], xmm14
+ | movsd qword [rbp-24], xmm13; movsd qword [rbp-32], xmm12
+ | movsd qword [rbp-40], xmm11; movsd qword [rbp-48], xmm10
+ | movsd qword [rbp-56], xmm9; movsd qword [rbp-64], xmm8
+ | movsd qword [rbp-72], xmm7; movsd qword [rbp-80], xmm6
+ | movsd qword [rbp-88], xmm5; movsd qword [rbp-96], xmm4
+ | movsd qword [rbp-104], xmm3; movsd qword [rbp-112], xmm2
+ | movsd qword [rbp-120], xmm1; movsd qword [rbp-128], xmm0
+ | // Caveat: RB is rbp.
+ | mov L:RB, [DISPATCH+DISPATCH_GL(cur_L)]
+ | mov BASE, [DISPATCH+DISPATCH_GL(jit_base)]
+ | mov aword [DISPATCH+DISPATCH_J(L)], L:RB
+ | mov L:RB->base, BASE
+ |.if X64WIN
+ | lea CARG2, [rsp+4*8]
+ |.else
+ | mov CARG2, rsp
+ |.endif
+ | lea CARG1, [DISPATCH+GG_DISP2J]
+ | mov dword [DISPATCH+DISPATCH_GL(jit_base)], 0
+ | call extern lj_trace_exit // (jit_State *J, ExitState *ex)
+ | // MULTRES or negated error code returned in eax (RD).
+ | mov RA, L:RB->cframe
+ | and RA, CFRAME_RAWMASK
+ | mov [RA+CFRAME_OFS_L], L:RB // Set SAVE_L (on-trace resume/yield).
+ | mov BASE, L:RB->base
+ | mov PC, [RA+CFRAME_OFS_PC] // Get SAVE_PC.
+ | jmp >1
+ |.endif
+ |->vm_exit_interp:
+ | // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
+ |.if JIT
+ | // Restore additional callee-save registers only used in compiled code.
+ |.if X64WIN
+ | lea RA, [rsp+10*16+4*8]
+ |1:
+ | movdqa xmm15, [RA-10*16]
+ | movdqa xmm14, [RA-9*16]
+ | movdqa xmm13, [RA-8*16]
+ | movdqa xmm12, [RA-7*16]
+ | movdqa xmm11, [RA-6*16]
+ | movdqa xmm10, [RA-5*16]
+ | movdqa xmm9, [RA-4*16]
+ | movdqa xmm8, [RA-3*16]
+ | movdqa xmm7, [RA-2*16]
+ | mov rsp, RA // Reposition stack to C frame.
+ | movdqa xmm6, [RA-1*16]
+ | mov r15, CSAVE_1
+ | mov r14, CSAVE_2
+ | mov r13, CSAVE_3
+ | mov r12, CSAVE_4
+ |.else
+ | lea RA, [rsp+16]
+ |1:
+ | mov r13, [RA-8]
+ | mov r12, [RA]
+ | mov rsp, RA // Reposition stack to C frame.
+ |.endif
+ | test RDd, RDd; js >9 // Check for error from exit.
+ | mov L:RB, SAVE_L
+ | mov MULTRES, RDd
+ | mov LFUNC:KBASE, [BASE-16]
+ | cleartp LFUNC:KBASE
+ | mov KBASE, LFUNC:KBASE->pc
+ | mov KBASE, [KBASE+PC2PROTO(k)]
+ | mov L:RB->base, BASE
+ | mov dword [DISPATCH+DISPATCH_GL(jit_base)], 0
+ | set_vmstate INTERP
+ | // Modified copy of ins_next which handles function header dispatch, too.
+ | mov RCd, [PC]
+ | movzx RAd, RCH
+ | movzx OP, RCL
+ | add PC, 4
+ | shr RCd, 16
+ | cmp OP, BC_FUNCF // Function header?
+ | jb >3
+ | cmp OP, BC_FUNCC+2 // Fast function?
+ | jae >4
+ |2:
+ | mov RCd, MULTRES // RC/RD holds nres+1.
+ |3:
+ | jmp aword [DISPATCH+OP*8]
+ |
+ |4: // Check frame below fast function.
+ | mov RC, [BASE-8]
+ | test RCd, FRAME_TYPE
+ | jnz <2 // Trace stitching continuation?
+ | // Otherwise set KBASE for Lua function below fast function.
+ | movzx RCd, byte [RC-3]
+ | neg RC
+ | mov LFUNC:KBASE, [BASE+RC*8-24]
+ | cleartp LFUNC:KBASE
+ | mov KBASE, LFUNC:KBASE->pc
+ | mov KBASE, [KBASE+PC2PROTO(k)]
+ | jmp <2
+ |
+ |9: // Rethrow error from the right C frame.
+ | neg RD
+ | mov CARG1, L:RB
+ | mov CARG2, RD
+ | call extern lj_err_throw // (lua_State *L, int errcode)
+ |.endif
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Math helper functions ----------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |// FP value rounding. Called by math.floor/math.ceil fast functions
+ |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
+ |.macro vm_round, name, mode, cond // mode: 0 = floor, 1 = ceil, 2 = trunc. 'cond' is not referenced in the body.
+ |->name:
+ |->name .. _sse: // Second global label (<name>_sse) for the same code.
+ | sseconst_abs xmm2, RD
+ | sseconst_2p52 xmm3, RD
+ | movaps xmm1, xmm0
+ | andpd xmm1, xmm2 // |x|
+ | ucomisd xmm3, xmm1 // No truncation if 2^52 <= |x|.
+ | jbe >1 // Also taken for an unordered compare (NaN): x is returned unchanged.
+ | andnpd xmm2, xmm0 // Isolate sign bit.
+ |.if mode == 2 // trunc(x)?
+ | movaps xmm0, xmm1
+ | addsd xmm1, xmm3 // (|x| + 2^52) - 2^52
+ | subsd xmm1, xmm3
+ | sseconst_1 xmm3, RD
+ | cmpsd xmm0, xmm1, 1 // |x| < result?
+ | andpd xmm0, xmm3
+ | subsd xmm1, xmm0 // If yes, subtract 1.0.
+ | orpd xmm1, xmm2 // Merge sign bit back in.
+ |.else
+ | addsd xmm1, xmm3 // (|x| + 2^52) - 2^52
+ | subsd xmm1, xmm3
+ | orpd xmm1, xmm2 // Merge sign bit back in.
+ | .if mode == 1 // ceil(x)?
+ | sseconst_m1 xmm2, RD // Must subtract -1 to preserve -0.
+ | cmpsd xmm0, xmm1, 6 // x > result?
+ | .else // floor(x)?
+ | sseconst_1 xmm2, RD
+ | cmpsd xmm0, xmm1, 1 // x < result?
+ | .endif
+ | andpd xmm0, xmm2
+ | subsd xmm1, xmm0 // If yes, subtract +-1.
+ |.endif
+ | movaps xmm0, xmm1 // Move the rounded value into the return register.
+ |1:
+ | ret
+ |.endmacro
+ |
+ | vm_round vm_floor, 0, 1
+ | vm_round vm_ceil, 1, JIT
+ | vm_round vm_trunc, 2, JIT
+ |
+ |// FP modulo x%y. Called by BC_MOD* and vm_arith.
+ |->vm_mod:
+ |// Args in xmm0/xmm1, return value in xmm0.
+ |// Caveat: xmm0-xmm5 and RC (eax) modified!
+ | movaps xmm5, xmm0 // Keep x for the final subtraction.
+ | divsd xmm0, xmm1 // xmm0 = x/y
+ | sseconst_abs xmm2, RD
+ | sseconst_2p52 xmm3, RD
+ | movaps xmm4, xmm0
+ | andpd xmm4, xmm2 // |x/y|
+ | ucomisd xmm3, xmm4 // No truncation if 2^52 <= |x/y|.
+ | jbe >1 // Then (and for an unordered compare) use x/y unrounded.
+ | andnpd xmm2, xmm0 // Isolate sign bit.
+ | addsd xmm4, xmm3 // (|x/y| + 2^52) - 2^52
+ | subsd xmm4, xmm3
+ | orpd xmm4, xmm2 // Merge sign bit back in.
+ | sseconst_1 xmm2, RD
+ | cmpsd xmm0, xmm4, 1 // x/y < result?
+ | andpd xmm0, xmm2
+ | subsd xmm4, xmm0 // If yes, subtract 1.0.
+ | movaps xmm0, xmm5
+ | mulsd xmm1, xmm4 // y * floor(x/y)
+ | subsd xmm0, xmm1 // x - y*floor(x/y)
+ | ret
+ |1:
+ | mulsd xmm1, xmm0 // y * (x/y)
+ | movaps xmm0, xmm5
+ | subsd xmm0, xmm1 // x - y*(x/y)
+ | ret
+ |
+ |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
+ |->vm_powi_sse:
+ | cmp eax, 1; jle >6 // i<=1?
+ | // Now 1 < (unsigned)i <= 0x80000000.
+ |1: // Handle leading zeros.
+ | test eax, 1; jnz >2
+ | mulsd xmm0, xmm0 // Low bit of i is clear: square x and shift i right.
+ | shr eax, 1
+ | jmp <1
+ |2:
+ | shr eax, 1; jz >5 // No further bits set: xmm0 already holds the result.
+ | movaps xmm1, xmm0 // xmm1 accumulates the product of the squares selected by set bits.
+ |3: // Handle trailing bits.
+ | mulsd xmm0, xmm0
+ | shr eax, 1; jz >4 // Shifted-out bit lands in CF; ZF is set once no bits remain.
+ | jnc <3 // Bit was 0: keep squaring.
+ | mulsd xmm1, xmm0 // Bit was 1: fold this square into the accumulator.
+ | jmp <3
+ |4:
+ | mulsd xmm0, xmm1 // Final square times accumulator.
+ |5:
+ | ret
+ |6:
+ | je <5 // x^1 ==> x
+ | jb >7 // x^0 ==> 1
+ | neg eax // Negative exponent: compute x^(-i) first.
+ | call <1 // Reuses the loop above as a subroutine (it ends in ret).
+ | sseconst_1 xmm1, RD
+ | divsd xmm1, xmm0 // 1 / x^(-i)
+ | movaps xmm0, xmm1
+ | ret
+ |7:
+ | sseconst_1 xmm0, RD
+ | ret
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Miscellaneous functions --------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |// int lj_vm_cpuid(uint32_t f, uint32_t res[4])
+ |// Executes CPUID for leaf f, subleaf 0, and stores eax/ebx/ecx/edx
+ |// into res[0..3]. eax is left holding CPUID's eax output on return.
+ |->vm_cpuid:
+ | mov eax, CARG1d
+ | .if X64WIN; push rsi; mov rsi, CARG2; .endif
+ | push rbx // cpuid overwrites ebx; it is restored before returning.
+ | // ecx selects the subleaf for leaves that have one. Zero it, so the
+ | // result does not depend on whatever the caller left in the register.
+ | // This is done after f has been copied to eax.
+ | xor ecx, ecx
+ | cpuid
+ | mov [rsi], eax
+ | mov [rsi+4], ebx
+ | mov [rsi+8], ecx
+ | mov [rsi+12], edx
+ | pop rbx
+ | .if X64WIN; pop rsi; .endif
+ | ret
+ |
+ |//-----------------------------------------------------------------------
+ |//-- Assertions ---------------------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |->assert_bad_for_arg_type:
+#ifdef LUA_USE_ASSERT
+ | int3
+#endif
+ | int3
+ |
+ |//-----------------------------------------------------------------------
+ |//-- FFI helper functions -----------------------------------------------
+ |//-----------------------------------------------------------------------
+ |
+ |// Handler for callback functions. Callback slot number in ah/al.
+ |->vm_ffi_callback:
+ |.if FFI
+ |.type CTSTATE, CTState, PC
+ | saveregs_ // ebp/rbp already saved. ebp now holds global_State *.
+ | lea DISPATCH, [ebp+GG_G2DISP]
+ | mov CTSTATE, GL:ebp->ctype_state
+ | movzx eax, ax
+ | mov CTSTATE->cb.slot, eax
+ | mov CTSTATE->cb.gpr[0], CARG1
+ | mov CTSTATE->cb.gpr[1], CARG2
+ | mov CTSTATE->cb.gpr[2], CARG3
+ | mov CTSTATE->cb.gpr[3], CARG4
+ | movsd qword CTSTATE->cb.fpr[0], xmm0
+ | movsd qword CTSTATE->cb.fpr[1], xmm1
+ | movsd qword CTSTATE->cb.fpr[2], xmm2
+ | movsd qword CTSTATE->cb.fpr[3], xmm3
+ |.if X64WIN
+ | lea rax, [rsp+CFRAME_SIZE+4*8]
+ |.else
+ | lea rax, [rsp+CFRAME_SIZE]
+ | mov CTSTATE->cb.gpr[4], CARG5
+ | mov CTSTATE->cb.gpr[5], CARG6
+ | movsd qword CTSTATE->cb.fpr[4], xmm4
+ | movsd qword CTSTATE->cb.fpr[5], xmm5
+ | movsd qword CTSTATE->cb.fpr[6], xmm6
+ | movsd qword CTSTATE->cb.fpr[7], xmm7
+ |.endif
+ | mov CTSTATE->cb.stack, rax
+ | mov CARG2, rsp
+ | mov SAVE_PC, CTSTATE // Any value outside of bytecode is ok.
+ | mov CARG1, CTSTATE
+ | call extern lj_ccallback_enter // (CTState *cts, void *cf)
+ | // lua_State * returned in eax (RD).
+ | set_vmstate INTERP
+ | mov BASE, L:RD->base
+ | mov RD, L:RD->top
+ | sub RD, BASE
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | shr RD, 3
+ | add RD, 1
+ | ins_callt
+ |.endif
+ |
+ |->cont_ffi_callback: // Return from FFI callback.
+ |.if FFI
+ | mov L:RA, SAVE_L
+ | mov CTSTATE, [DISPATCH+DISPATCH_GL(ctype_state)]
+ | mov aword CTSTATE->L, L:RA
+ | mov L:RA->base, BASE
+ | mov L:RA->top, RB
+ | mov CARG1, CTSTATE
+ | mov CARG2, RC
+ | call extern lj_ccallback_leave // (CTState *cts, TValue *o)
+ | mov rax, CTSTATE->cb.gpr[0]
+ | movsd xmm0, qword CTSTATE->cb.fpr[0]
+ | jmp ->vm_leave_unw
+ |.endif
+ |
+ |->vm_ffi_call: // Call C function via FFI.
+ | // Caveat: needs special frame unwinding, see below.
+ |.if FFI
+ | .type CCSTATE, CCallState, rbx
+ | push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1 // rbp frame; rbx is saved at [rbp-8] and then holds the CCallState *.
+ |
+ | // Readjust stack.
+ | mov eax, CCSTATE->spadj
+ | sub rsp, rax
+ |
+ | // Copy stack slots.
+ | movzx ecx, byte CCSTATE->nsp
+ | sub ecx, 1
+ | js >2 // nsp == 0: nothing to copy.
+ |1:
+ | mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)]
+ | mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax
+ | sub ecx, 1
+ | jns <1 // Copies slots nsp-1 down to 0.
+ |2:
+ |
+ | movzx eax, byte CCSTATE->nfpr // NOTE(review): eax still holds nfpr at the call; presumably the SysV varargs convention (al = vector regs used) -- confirm.
+ | mov CARG1, CCSTATE->gpr[0]
+ | mov CARG2, CCSTATE->gpr[1]
+ | mov CARG3, CCSTATE->gpr[2]
+ | mov CARG4, CCSTATE->gpr[3]
+ |.if not X64WIN
+ | mov CARG5, CCSTATE->gpr[4]
+ | mov CARG6, CCSTATE->gpr[5]
+ |.endif
+ | test eax, eax; jz >5 // nfpr == 0: skip loading the xmm argument registers.
+ | movaps xmm0, CCSTATE->fpr[0]
+ | movaps xmm1, CCSTATE->fpr[1]
+ | movaps xmm2, CCSTATE->fpr[2]
+ | movaps xmm3, CCSTATE->fpr[3]
+ |.if not X64WIN
+ | cmp eax, 4; jbe >5 // nfpr <= 4: xmm4-xmm7 are not loaded.
+ | movaps xmm4, CCSTATE->fpr[4]
+ | movaps xmm5, CCSTATE->fpr[5]
+ | movaps xmm6, CCSTATE->fpr[6]
+ | movaps xmm7, CCSTATE->fpr[7]
+ |.endif
+ |5:
+ |
+ | call aword CCSTATE->func
+ |
+ | mov CCSTATE->gpr[0], rax // Store the return registers back into the CCallState.
+ | movaps CCSTATE->fpr[0], xmm0
+ |.if not X64WIN
+ | mov CCSTATE->gpr[1], rdx
+ | movaps CCSTATE->fpr[1], xmm1
+ |.endif
+ |
+ | mov rbx, [rbp-8]; leave; ret // Reload rbx from its save slot, then tear down the frame.
+ |.endif
+ |// Note: vm_ffi_call must be the last function in this object file!
+ |
+ |//-----------------------------------------------------------------------
+}
+
+/* Generate the code for a single instruction. */
+static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+{
+ int vk = 0;
+ |// Note: aligning all instructions does not pay off.
+ |=>defop:
+
+ switch (op) {
+
+ /* -- Comparison ops ---------------------------------------------------- */
+
+ /* Remember: all ops branch for a true comparison, fall through otherwise. */
+
+ |.macro jmp_comp, lt, ge, le, gt, target
+ ||switch (op) {
+ ||case BC_ISLT:
+ | lt target
+ ||break;
+ ||case BC_ISGE:
+ | ge target
+ ||break;
+ ||case BC_ISLE:
+ | le target
+ ||break;
+ ||case BC_ISGT:
+ | gt target
+ ||break;
+ ||default: break; /* Shut up GCC. */
+ ||}
+ |.endmacro
+
+ case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
+ | // RA = src1, RD = src2, JMP with RD = target
+ | ins_AD
+ | mov ITYPE, [BASE+RA*8]
+ | mov RB, [BASE+RD*8]
+ | mov RA, ITYPE
+ | mov RD, RB
+ | sar ITYPE, 47
+ | sar RB, 47
+ |.if DUALNUM
+ | cmp ITYPEd, LJ_TISNUM; jne >7
+ | cmp RBd, LJ_TISNUM; jne >8
+ | add PC, 4
+ | cmp RAd, RDd
+ | jmp_comp jge, jl, jg, jle, >9
+ |6:
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |9:
+ | ins_next
+ |
+ |7: // RA is not an integer.
+ | ja ->vmeta_comp
+ | // RA is a number.
+ | cmp RBd, LJ_TISNUM; jb >1; jne ->vmeta_comp
+ | // RA is a number, RD is an integer.
+ | cvtsi2sd xmm0, RDd
+ | jmp >2
+ |
+ |8: // RA is an integer, RD is not an integer.
+ | ja ->vmeta_comp
+ | // RA is an integer, RD is a number.
+ | cvtsi2sd xmm1, RAd
+ | movd xmm0, RD
+ | jmp >3
+ |.else
+ | cmp ITYPEd, LJ_TISNUM; jae ->vmeta_comp
+ | cmp RBd, LJ_TISNUM; jae ->vmeta_comp
+ |.endif
+ |1:
+ | movd xmm0, RD
+ |2:
+ | movd xmm1, RA
+ |3:
+ | add PC, 4
+ | ucomisd xmm0, xmm1
+ | // Unordered: all of ZF CF PF set, ordered: PF clear.
+ | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
+ |.if DUALNUM
+ | jmp_comp jbe, ja, jb, jae, <9
+ | jmp <6
+ |.else
+ | jmp_comp jbe, ja, jb, jae, >1
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |1:
+ | ins_next
+ |.endif
+ break;
+
+ case BC_ISEQV: case BC_ISNEV:
+ vk = op == BC_ISEQV;
+ | ins_AD // RA = src1, RD = src2, JMP with RD = target
+ | mov RB, [BASE+RD*8]
+ | mov ITYPE, [BASE+RA*8]
+ | add PC, 4
+ | mov RD, RB
+ | mov RA, ITYPE
+ | sar RB, 47
+ | sar ITYPE, 47
+ |.if DUALNUM
+ | cmp RBd, LJ_TISNUM; jne >7
+ | cmp ITYPEd, LJ_TISNUM; jne >8
+ | cmp RDd, RAd
+ if (vk) {
+ | jne >9
+ } else {
+ | je >9
+ }
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |9:
+ | ins_next
+ |
+ |7: // RD is not an integer.
+ | ja >5
+ | // RD is a number.
+ | movd xmm1, RD
+ | cmp ITYPEd, LJ_TISNUM; jb >1; jne >5
+ | // RD is a number, RA is an integer.
+ | cvtsi2sd xmm0, RAd
+ | jmp >2
+ |
+ |8: // RD is an integer, RA is not an integer.
+ | ja >5
+ | // RD is an integer, RA is a number.
+ | cvtsi2sd xmm1, RDd
+ | jmp >1
+ |
+ |.else
+ | cmp RBd, LJ_TISNUM; jae >5
+ | cmp ITYPEd, LJ_TISNUM; jae >5
+ | movd xmm1, RD
+ |.endif
+ |1:
+ | movd xmm0, RA
+ |2:
+ | ucomisd xmm0, xmm1
+ |4:
+ iseqne_fp:
+ if (vk) {
+ | jp >2 // Unordered means not equal.
+ | jne >2
+ } else {
+ | jp >2 // Unordered means not equal.
+ | je >1
+ }
+ iseqne_end:
+ if (vk) {
+ |1: // EQ: Branch to the target.
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |2: // NE: Fallthrough to next instruction.
+ |.if not FFI
+ |3:
+ |.endif
+ } else {
+ |.if not FFI
+ |3:
+ |.endif
+ |2: // NE: Branch to the target.
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |1: // EQ: Fallthrough to next instruction.
+ }
+ if (LJ_DUALNUM && (op == BC_ISEQV || op == BC_ISNEV ||
+ op == BC_ISEQN || op == BC_ISNEN)) {
+ | jmp <9
+ } else {
+ | ins_next
+ }
+ |
+ if (op == BC_ISEQV || op == BC_ISNEV) {
+ |5: // Either or both types are not numbers.
+ |.if FFI
+ | cmp RBd, LJ_TCDATA; je ->vmeta_equal_cd
+ | cmp ITYPEd, LJ_TCDATA; je ->vmeta_equal_cd
+ |.endif
+ | cmp RA, RD
+ | je <1 // Same GCobjs or pvalues?
+ | cmp RBd, ITYPEd
+ | jne <2 // Not the same type?
+ | cmp RBd, LJ_TISTABUD
+ | ja <2 // Different objects and not table/ud?
+ |
+ | // Different tables or userdatas. Need to check __eq metamethod.
+ | // Field metatable must be at same offset for GCtab and GCudata!
+ | cleartp TAB:RA
+ | mov TAB:RB, TAB:RA->metatable
+ | test TAB:RB, TAB:RB
+ | jz <2 // No metatable?
+ | test byte TAB:RB->nomm, 1<<MM_eq
+ | jnz <2 // Or 'no __eq' flag set?
+ if (vk) {
+ | xor RBd, RBd // ne = 0
+ } else {
+ | mov RBd, 1 // ne = 1
+ }
+ | jmp ->vmeta_equal // Handle __eq metamethod.
+ } else {
+ |.if FFI
+ |3:
+ | cmp ITYPEd, LJ_TCDATA
+ if (LJ_DUALNUM && vk) {
+ | jne <9
+ } else {
+ | jne <2
+ }
+ | jmp ->vmeta_equal_cd
+ |.endif
+ }
+ break;
+ case BC_ISEQS: case BC_ISNES:
+ vk = op == BC_ISEQS;
+ | ins_AND // RA = src, RD = str const, JMP with RD = target
+ | mov RB, [BASE+RA*8]
+ | add PC, 4
+ | checkstr RB, >3
+ | cmp RB, [KBASE+RD*8]
+ iseqne_test:
+ if (vk) {
+ | jne >2
+ } else {
+ | je >1
+ }
+ goto iseqne_end;
+ case BC_ISEQN: case BC_ISNEN:
+ vk = op == BC_ISEQN;
+ | ins_AD // RA = src, RD = num const, JMP with RD = target
+ | mov RB, [BASE+RA*8]
+ | add PC, 4
+ |.if DUALNUM
+ | checkint RB, >7
+ | mov RD, [KBASE+RD*8]
+ | checkint RD, >8
+ | cmp RBd, RDd
+ if (vk) {
+ | jne >9
+ } else {
+ | je >9
+ }
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |9:
+ | ins_next
+ |
+ |7: // RA is not an integer.
+ | ja >3
+ | // RA is a number.
+ | mov RD, [KBASE+RD*8]
+ | checkint RD, >1
+ | // RA is a number, RD is an integer.
+ | cvtsi2sd xmm0, RDd
+ | jmp >2
+ |
+ |8: // RA is an integer, RD is a number.
+ | cvtsi2sd xmm0, RBd
+ | movd xmm1, RD
+ | ucomisd xmm0, xmm1
+ | jmp >4
+ |1:
+ | movd xmm0, RD
+ |.else
+ | checknum RB, >3
+ |1:
+ | movsd xmm0, qword [KBASE+RD*8]
+ |.endif
+ |2:
+ | ucomisd xmm0, qword [BASE+RA*8]
+ |4:
+ goto iseqne_fp;
+ case BC_ISEQP: case BC_ISNEP:
+ vk = op == BC_ISEQP;
+ | ins_AND // RA = src, RD = primitive type (~), JMP with RD = target
+ | mov RB, [BASE+RA*8]
+ | sar RB, 47
+ | add PC, 4
+ | cmp RBd, RDd
+ if (!LJ_HASFFI) goto iseqne_test;
+ if (vk) {
+ | jne >3
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |2:
+ | ins_next
+ |3:
+ | cmp RBd, LJ_TCDATA; jne <2
+ | jmp ->vmeta_equal_cd
+ } else {
+ | je >2
+ | cmp RBd, LJ_TCDATA; je ->vmeta_equal_cd
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |2:
+ | ins_next
+ }
+ break;
+
+ /* -- Unary test and copy ops ------------------------------------------- */
+
+ case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
+ | ins_AD // RA = dst or unused, RD = src, JMP with RD = target
+ | mov ITYPE, [BASE+RD*8]
+ | add PC, 4 // Advance PC by one instruction word (presumably past the JMP that carries the target).
+ if (op == BC_ISTC || op == BC_ISFC) {
+ | mov RB, ITYPE // Copy variants: keep the full value before ITYPE is reduced to the tag.
+ }
+ | sar ITYPE, 47
+ | cmp ITYPEd, LJ_TISTRUECOND
+ if (op == BC_IST || op == BC_ISTC) {
+ | jae >1 // IST/ISTC: tag >= LJ_TISTRUECOND (unsigned) means no branch.
+ } else {
+ | jb >1 // ISF/ISFC: tag < LJ_TISTRUECOND (unsigned) means no branch.
+ }
+ if (op == BC_ISTC || op == BC_ISFC) {
+ | mov [BASE+RA*8], RB // The copy to dst happens only on the branch-taken path.
+ }
+ | movzx RDd, PC_RD
+ | branchPC RD
+ |1: // Fallthrough to the next instruction.
+ | ins_next
+ break;
+
+ case BC_ISTYPE:
+ | ins_AD // RA = src, RD = -type
+ | mov RB, [BASE+RA*8]
+ | sar RB, 47
+ | add RBd, RDd
+ | jne ->vmeta_istype
+ | ins_next
+ break;
+ case BC_ISNUM:
+ | ins_AD // RA = src, RD = -(TISNUM-1)
+ | checknumtp [BASE+RA*8], ->vmeta_istype
+ | ins_next
+ break;
+
+ /* -- Unary ops --------------------------------------------------------- */
+
+ case BC_MOV:
+ | ins_AD // RA = dst, RD = src
+ | mov RB, [BASE+RD*8]
+ | mov [BASE+RA*8], RB
+ | ins_next_
+ break;
+ case BC_NOT:
+ | ins_AD // RA = dst, RD = src
+ | mov RB, [BASE+RD*8]
+ | sar RB, 47 // Reduce the value to its type tag.
+ | mov RCd, 2
+ | cmp RB, LJ_TISTRUECOND
+ | sbb RCd, 0 // RC = 1 if tag < LJ_TISTRUECOND (unsigned, the IST-branches case), else 2.
+ | shl RC, 47
+ | not RC // Value = ~(RC << 47), the same encoding BC_KPRI uses for primitives.
+ | mov [BASE+RA*8], RC
+ | ins_next
+ break;
+ case BC_UNM:
+ | ins_AD // RA = dst, RD = src
+ | mov RB, [BASE+RD*8]
+ |.if DUALNUM
+ | checkint RB, >5
+ | neg RBd
+ | jo >4
+ | setint RB
+ |9:
+ | mov [BASE+RA*8], RB
+ | ins_next
+ |4:
+ | mov64 RB, U64x(41e00000,00000000) // 2^31.
+ | jmp <9
+ |5:
+ | ja ->vmeta_unm
+ |.else
+ | checknum RB, ->vmeta_unm
+ |.endif
+ | mov64 RD, U64x(80000000,00000000)
+ | xor RB, RD
+ |.if DUALNUM
+ | jmp <9
+ |.else
+ | mov [BASE+RA*8], RB
+ | ins_next
+ |.endif
+ break;
+ case BC_LEN:
+ | ins_AD // RA = dst, RD = src
+ | mov RD, [BASE+RD*8]
+ | checkstr RD, >2
+ |.if DUALNUM
+ | mov RDd, dword STR:RD->len
+ |1:
+ | setint RD
+ | mov [BASE+RA*8], RD
+ |.else
+ | xorps xmm0, xmm0
+ | cvtsi2sd xmm0, dword STR:RD->len
+ |1:
+ | movsd qword [BASE+RA*8], xmm0
+ |.endif
+ | ins_next
+ |2:
+ | cmp ITYPEd, LJ_TTAB; jne ->vmeta_len
+ | mov TAB:CARG1, TAB:RD
+#if LJ_52
+ | mov TAB:RB, TAB:RD->metatable
+ | cmp TAB:RB, 0
+ | jnz >9
+ |3:
+#endif
+ |->BC_LEN_Z:
+ | mov RB, BASE // Save BASE.
+ | call extern lj_tab_len // (GCtab *t)
+ | // Length of table returned in eax (RD).
+ |.if DUALNUM
+ | // Nothing to do.
+ |.else
+ | cvtsi2sd xmm0, RDd
+ |.endif
+ | mov BASE, RB // Restore BASE.
+ | movzx RAd, PC_RA
+ | jmp <1
+#if LJ_52
+ |9: // Check for __len.
+ | test byte TAB:RB->nomm, 1<<MM_len
+ | jnz <3
+ | jmp ->vmeta_len // 'no __len' flag NOT set: check.
+#endif
+ break;
+
+ /* -- Binary ops -------------------------------------------------------- */
+
+ |.macro ins_arithpre, sseins, ssereg
+ | ins_ABC
+ ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+ ||switch (vk) {
+ ||case 0:
+ | checknumtp [BASE+RB*8], ->vmeta_arith_vn
+ | .if DUALNUM
+ | checknumtp [KBASE+RC*8], ->vmeta_arith_vn
+ | .endif
+ | movsd xmm0, qword [BASE+RB*8]
+ | sseins ssereg, qword [KBASE+RC*8]
+ || break;
+ ||case 1:
+ | checknumtp [BASE+RB*8], ->vmeta_arith_nv
+ | .if DUALNUM
+ | checknumtp [KBASE+RC*8], ->vmeta_arith_nv
+ | .endif
+ | movsd xmm0, qword [KBASE+RC*8]
+ | sseins ssereg, qword [BASE+RB*8]
+ || break;
+ ||default:
+ | checknumtp [BASE+RB*8], ->vmeta_arith_vv
+ | checknumtp [BASE+RC*8], ->vmeta_arith_vv
+ | movsd xmm0, qword [BASE+RB*8]
+ | sseins ssereg, qword [BASE+RC*8]
+ || break;
+ ||}
+ |.endmacro
+ |
+ |.macro ins_arithdn, intins
+ | ins_ABC
+ ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+ ||switch (vk) {
+ ||case 0:
+ | mov RB, [BASE+RB*8]
+ | mov RC, [KBASE+RC*8]
+ | checkint RB, ->vmeta_arith_vno
+ | checkint RC, ->vmeta_arith_vno
+ | intins RBd, RCd; jo ->vmeta_arith_vno
+ || break;
+ ||case 1:
+ | mov RB, [BASE+RB*8]
+ | mov RC, [KBASE+RC*8]
+ | checkint RB, ->vmeta_arith_nvo
+ | checkint RC, ->vmeta_arith_nvo
+ | intins RCd, RBd; jo ->vmeta_arith_nvo
+ || break;
+ ||default:
+ | mov RB, [BASE+RB*8]
+ | mov RC, [BASE+RC*8]
+ | checkint RB, ->vmeta_arith_vvo
+ | checkint RC, ->vmeta_arith_vvo
+ | intins RBd, RCd; jo ->vmeta_arith_vvo
+ || break;
+ ||}
+ ||if (vk == 1) {
+ | setint RC
+ | mov [BASE+RA*8], RC
+ ||} else {
+ | setint RB
+ | mov [BASE+RA*8], RB
+ ||}
+ | ins_next
+ |.endmacro
+ |
+ |.macro ins_arithpost
+ | movsd qword [BASE+RA*8], xmm0
+ |.endmacro
+ |
+ |.macro ins_arith, sseins
+ | ins_arithpre sseins, xmm0
+ | ins_arithpost
+ | ins_next
+ |.endmacro
+ |
+ |.macro ins_arith, intins, sseins
+ |.if DUALNUM
+ | ins_arithdn intins
+ |.else
+ | ins_arith, sseins
+ |.endif
+ |.endmacro
+
+ | // RA = dst, RB = src1 or num const, RC = src2 or num const
+ case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
+ | ins_arith add, addsd
+ break;
+ case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
+ | ins_arith sub, subsd
+ break;
+ case BC_MULVN: case BC_MULNV: case BC_MULVV:
+ | ins_arith imul, mulsd
+ break;
+ case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
+ | ins_arith divsd
+ break;
+ case BC_MODVN:
+ | ins_arithpre movsd, xmm1
+ |->BC_MODVN_Z:
+ | call ->vm_mod
+ | ins_arithpost
+ | ins_next
+ break;
+ case BC_MODNV: case BC_MODVV:
+ | ins_arithpre movsd, xmm1
+ | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
+ break;
+ case BC_POW:
+ | ins_arithpre movsd, xmm1
+ | mov RB, BASE
+ | call extern pow
+ | movzx RAd, PC_RA
+ | mov BASE, RB
+ | ins_arithpost
+ | ins_next
+ break;
+
+ case BC_CAT:
+ | ins_ABC // RA = dst, RB = src_start, RC = src_end
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE
+ | lea CARG2, [BASE+RC*8] // top = address of slot src_end.
+ | mov CARG3d, RCd
+ | sub CARG3d, RBd // left = src_end - src_start.
+ |->BC_CAT_Z: // Global label; from here on CARG1 = L, CARG2 = top, CARG3 = left are expected to be set.
+ | mov L:RB, L:CARG1
+ | mov SAVE_PC, PC
+ | call extern lj_meta_cat // (lua_State *L, TValue *top, int left)
+ | // NULL (finished) or TValue * (metamethod) returned in eax (RC).
+ | mov BASE, L:RB->base
+ | test RC, RC
+ | jnz ->vmeta_binop // Non-NULL return: continue at vmeta_binop.
+ | movzx RBd, PC_RB // Copy result to Stk[RA] from Stk[RB].
+ | movzx RAd, PC_RA
+ | mov RC, [BASE+RB*8]
+ | mov [BASE+RA*8], RC
+ | ins_next
+ break;
+
+ /* -- Constant ops ------------------------------------------------------ */
+
+ case BC_KSTR:
+ | ins_AND // RA = dst, RD = str const (~)
+ | mov RD, [KBASE+RD*8]
+ | settp RD, LJ_TSTR
+ | mov [BASE+RA*8], RD
+ | ins_next
+ break;
+ case BC_KCDATA:
+ |.if FFI
+ | ins_AND // RA = dst, RD = cdata const (~)
+ | mov RD, [KBASE+RD*8]
+ | settp RD, LJ_TCDATA
+ | mov [BASE+RA*8], RD
+ | ins_next
+ |.endif
+ break;
+ case BC_KSHORT:
+ | ins_AD // RA = dst, RD = signed int16 literal
+ |.if DUALNUM
+ | movsx RDd, RDW
+ | setint RD
+ | mov [BASE+RA*8], RD
+ |.else
+ | movsx RDd, RDW // Sign-extend literal.
+ | cvtsi2sd xmm0, RDd
+ | movsd qword [BASE+RA*8], xmm0
+ |.endif
+ | ins_next
+ break;
+ case BC_KNUM:
+ | ins_AD // RA = dst, RD = num const
+ | movsd xmm0, qword [KBASE+RD*8]
+ | movsd qword [BASE+RA*8], xmm0
+ | ins_next
+ break;
+ case BC_KPRI:
+ | ins_AD // RA = dst, RD = primitive type (~)
+ | shl RD, 47
+ | not RD
+ | mov [BASE+RA*8], RD
+ | ins_next
+ break;
+ case BC_KNIL:
+ | ins_AD // RA = dst_start, RD = dst_end
+ | lea RA, [BASE+RA*8+8]
+ | lea RD, [BASE+RD*8]
+ | mov RB, LJ_TNIL
+ | mov [RA-8], RB // Sets minimum 2 slots.
+ |1:
+ | mov [RA], RB
+ | add RA, 8
+ | cmp RA, RD
+ | jbe <1
+ | ins_next
+ break;
+
+ /* -- Upvalue and function ops ------------------------------------------ */
+
+ case BC_UGET:
+ | ins_AD // RA = dst, RD = upvalue #
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | mov UPVAL:RB, [LFUNC:RB+RD*8+offsetof(GCfuncL, uvptr)]
+ | mov RB, UPVAL:RB->v
+ | mov RD, [RB]
+ | mov [BASE+RA*8], RD
+ | ins_next
+ break;
+ case BC_USETV:
+#define TV2MARKOFS \
+ ((int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv))
+ | ins_AD // RA = upvalue #, RD = src
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
+ | cmp byte UPVAL:RB->closed, 0
+ | mov RB, UPVAL:RB->v
+ | mov RA, [BASE+RD*8]
+ | mov [RB], RA
+ | jz >1
+ | // Check barrier for closed upvalue.
+ | test byte [RB+TV2MARKOFS], LJ_GC_BLACK // isblack(uv)
+ | jnz >2
+ |1:
+ | ins_next
+ |
+ |2: // Upvalue is black. Check if new value is collectable and white.
+ | mov RD, RA
+ | sar RD, 47
+ | sub RDd, LJ_TISGCV
+ | cmp RDd, LJ_TNUMX - LJ_TISGCV // tvisgcv(v)
+ | jbe <1
+ | cleartp GCOBJ:RA
+ | test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES // iswhite(v)
+ | jz <1
+ | // Crossed a write barrier. Move the barrier forward.
+ |.if not X64WIN
+ | mov CARG2, RB
+ | mov RB, BASE // Save BASE.
+ |.else
+ | xchg CARG2, RB // Save BASE (CARG2 == BASE).
+ |.endif
+ | lea GL:CARG1, [DISPATCH+GG_DISP2G]
+ | call extern lj_gc_barrieruv // (global_State *g, TValue *tv)
+ | mov BASE, RB // Restore BASE.
+ | jmp <1
+ break;
+#undef TV2MARKOFS
+ case BC_USETS:
+ | ins_AND // RA = upvalue #, RD = str const (~)
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
+ | mov STR:RA, [KBASE+RD*8]
+ | mov RD, UPVAL:RB->v
+ | settp STR:ITYPE, STR:RA, LJ_TSTR
+ | mov [RD], STR:ITYPE
+ | test byte UPVAL:RB->marked, LJ_GC_BLACK // isblack(uv)
+ | jnz >2
+ |1:
+ | ins_next
+ |
+ |2: // Check if string is white and ensure upvalue is closed.
+ | test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES // iswhite(str)
+ | jz <1
+ | cmp byte UPVAL:RB->closed, 0
+ | jz <1
+ | // Crossed a write barrier. Move the barrier forward.
+ | mov RB, BASE // Save BASE (CARG2 == BASE).
+ | mov CARG2, RD
+ | lea GL:CARG1, [DISPATCH+GG_DISP2G]
+ | call extern lj_gc_barrieruv // (global_State *g, TValue *tv)
+ | mov BASE, RB // Restore BASE.
+ | jmp <1
+ break;
+ case BC_USETN:
+ | ins_AD // RA = upvalue #, RD = num const
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | movsd xmm0, qword [KBASE+RD*8]
+ | mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
+ | mov RA, UPVAL:RB->v
+ | movsd qword [RA], xmm0
+ | ins_next
+ break;
+ case BC_USETP:
+ | ins_AD // RA = upvalue #, RD = primitive type (~)
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
+ | shl RD, 47
+ | not RD
+ | mov RA, UPVAL:RB->v
+ | mov [RA], RD
+ | ins_next
+ break;
+ case BC_UCLO:
+ | // Close upvalues at or above stack level RA, then branch to the target.
+ | ins_AD // RA = level, RD = target
+ | branchPC RD // Do this first to free RD.
+ | mov L:RB, SAVE_L
+ | // Fast path: nothing to close if the thread has no open upvalues.
+ | // openupval is a GC reference, which is pointer-sized in this VM, so the
+ | // whole address-sized word must be compared. A dword compare only tests
+ | // the low 32 bits and would wrongly skip the close for a non-NULL
+ | // reference whose low half happens to be zero.
+ | cmp aword L:RB->openupval, 0
+ | je >1
+ | mov L:RB->base, BASE // Sync BASE into L before calling into C.
+ | lea CARG2, [BASE+RA*8] // Caveat: CARG2 == BASE
+ | mov L:CARG1, L:RB // Caveat: CARG1 == RA
+ | call extern lj_func_closeuv // (lua_State *L, TValue *level)
+ | mov BASE, L:RB->base // Reload BASE (clobbered as CARG2 above).
+ |1:
+ | ins_next
+ break;
+
+ case BC_FNEW:
+ | ins_AND // RA = dst, RD = proto const (~) (holding function prototype)
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE // Caveat: CARG2/CARG3 may be BASE.
+ | mov CARG3, [BASE-16]
+ | cleartp CARG3
+ | mov CARG2, [KBASE+RD*8] // Fetch GCproto *.
+ | mov CARG1, L:RB
+ | mov SAVE_PC, PC
+ | // (lua_State *L, GCproto *pt, GCfuncL *parent)
+ | call extern lj_func_newL_gc
+ | // GCfuncL * returned in eax (RC).
+ | mov BASE, L:RB->base
+ | movzx RAd, PC_RA
+ | settp LFUNC:RC, LJ_TFUNC
+ | mov [BASE+RA*8], LFUNC:RC
+ | ins_next
+ break;
+
+ /* -- Table ops --------------------------------------------------------- */
+
+ case BC_TNEW:
+ | ins_AD // RA = dst, RD = hbits|asize
+ | mov L:RB, SAVE_L
+ | mov L:RB->base, BASE
+ | mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
+ | cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
+ | mov SAVE_PC, PC
+ | jae >5
+ |1:
+ | mov CARG3d, RDd
+ | and RDd, 0x7ff
+ | shr CARG3d, 11
+ | cmp RDd, 0x7ff
+ | je >3
+ |2:
+ | mov L:CARG1, L:RB
+ | mov CARG2d, RDd
+ | call extern lj_tab_new // (lua_State *L, int32_t asize, uint32_t hbits)
+ | // Table * returned in eax (RC).
+ | mov BASE, L:RB->base
+ | movzx RAd, PC_RA
+ | settp TAB:RC, LJ_TTAB
+ | mov [BASE+RA*8], TAB:RC
+ | ins_next
+ |3: // Turn 0x7ff into 0x801.
+ | mov RDd, 0x801
+ | jmp <2
+ |5:
+ | mov L:CARG1, L:RB
+ | call extern lj_gc_step_fixtop // (lua_State *L)
+ | movzx RDd, PC_RD
+ | jmp <1
+ break;
+ case BC_TDUP:
+ | ins_AND // RA = dst, RD = table const (~) (holding template table)
+ | mov L:RB, SAVE_L
+ | mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
+ | mov SAVE_PC, PC
+ | cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
+ | mov L:RB->base, BASE
+ | jae >3
+ |2:
+ | mov TAB:CARG2, [KBASE+RD*8] // Caveat: CARG2 == BASE
+ | mov L:CARG1, L:RB // Caveat: CARG1 == RA
+ | call extern lj_tab_dup // (lua_State *L, Table *kt)
+ | // Table * returned in eax (RC).
+ | mov BASE, L:RB->base
+ | movzx RAd, PC_RA
+ | settp TAB:RC, LJ_TTAB
+ | mov [BASE+RA*8], TAB:RC
+ | ins_next
+ |3:
+ | mov L:CARG1, L:RB
+ | call extern lj_gc_step_fixtop // (lua_State *L)
+ | movzx RDd, PC_RD // Need to reload RD.
+ | not RD
+ | jmp <2
+ break;
+
+ case BC_GGET:
+ | ins_AND // RA = dst, RD = str const (~)
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | mov TAB:RB, LFUNC:RB->env
+ | mov STR:RC, [KBASE+RD*8]
+ | jmp ->BC_TGETS_Z
+ break;
+ case BC_GSET:
+ | ins_AND // RA = src, RD = str const (~)
+ | mov LFUNC:RB, [BASE-16]
+ | cleartp LFUNC:RB
+ | mov TAB:RB, LFUNC:RB->env
+ | mov STR:RC, [KBASE+RD*8]
+ | jmp ->BC_TSETS_Z
+ break;
+
+ case BC_TGETV:
+ | ins_ABC // RA = dst, RB = table, RC = key
+ | mov TAB:RB, [BASE+RB*8]
+ | mov RC, [BASE+RC*8]
+ | checktab TAB:RB, ->vmeta_tgetv
+ |
+ | // Integer key?
+ |.if DUALNUM
+ | checkint RC, >5
+ |.else
+ | // Convert number to int and back and compare.
+ | checknum RC, >5
+ | movd xmm0, RC
+ | cvttsd2si RCd, xmm0
+ | cvtsi2sd xmm1, RCd
+ | ucomisd xmm0, xmm1
+ | jne ->vmeta_tgetv // Generic numeric key? Use fallback.
+ |.endif
+ | cmp RCd, TAB:RB->asize // Takes care of unordered, too.
+ | jae ->vmeta_tgetv // Not in array part? Use fallback.
+ | shl RCd, 3
+ | add RC, TAB:RB->array
+ | // Get array slot.
+ | mov ITYPE, [RC]
+ | cmp ITYPE, LJ_TNIL // Avoid overwriting RB in fastpath.
+ | je >2
+ |1:
+ | mov [BASE+RA*8], ITYPE
+ | ins_next
+ |
+ |2: // Check for __index if table value is nil.
+ | mov TAB:TMPR, TAB:RB->metatable
+ | test TAB:TMPR, TAB:TMPR
+ | jz <1
+ | test byte TAB:TMPR->nomm, 1<<MM_index
+ | jz ->vmeta_tgetv // 'no __index' flag NOT set: check.
+ | jmp <1
+ |
+ |5: // String key?
+ | cmp ITYPEd, LJ_TSTR; jne ->vmeta_tgetv
+ | cleartp STR:RC
+ | jmp ->BC_TGETS_Z
+ break;
+ case BC_TGETS:
+ | ins_ABC // RA = dst, RB = table, RC = str const (~)
+ | mov TAB:RB, [BASE+RB*8]
+ | not RC
+ | mov STR:RC, [KBASE+RC*8]
+ | checktab TAB:RB, ->vmeta_tgets
+ |->BC_TGETS_Z: // RB = GCtab *, RC = GCstr *
+ | mov TMPRd, TAB:RB->hmask
+ | and TMPRd, STR:RC->hash
+ | imul TMPRd, #NODE
+ | add NODE:TMPR, TAB:RB->node
+ | settp ITYPE, STR:RC, LJ_TSTR
+ |1:
+ | cmp NODE:TMPR->key, ITYPE
+ | jne >4
+ | // Get node value.
+ | mov ITYPE, NODE:TMPR->val
+ | cmp ITYPE, LJ_TNIL
+ | je >5 // Key found, but nil value?
+ |2:
+ | mov [BASE+RA*8], ITYPE
+ | ins_next
+ |
+ |4: // Follow hash chain.
+ | mov NODE:TMPR, NODE:TMPR->next
+ | test NODE:TMPR, NODE:TMPR
+ | jnz <1
+ | // End of hash chain: key not found, nil result.
+ | mov ITYPE, LJ_TNIL
+ |
+ |5: // Check for __index if table value is nil.
+ | mov TAB:TMPR, TAB:RB->metatable
+ | test TAB:TMPR, TAB:TMPR
+ | jz <2 // No metatable: done.
+ | test byte TAB:TMPR->nomm, 1<<MM_index
+ | jnz <2 // 'no __index' flag set: done.
+ | jmp ->vmeta_tgets // Caveat: preserve STR:RC.
+ break;
+ case BC_TGETB:
+ | ins_ABC // RA = dst, RB = table, RC = byte literal
+ | mov TAB:RB, [BASE+RB*8]
+ | checktab TAB:RB, ->vmeta_tgetb
+ | cmp RCd, TAB:RB->asize
+ | jae ->vmeta_tgetb
+ | shl RCd, 3
+ | add RC, TAB:RB->array
+ | // Get array slot.
+ | mov ITYPE, [RC]
+ | cmp ITYPE, LJ_TNIL
+ | je >2
+ |1:
+ | mov [BASE+RA*8], ITYPE
+ | ins_next
+ |
+ |2: // Check for __index if table value is nil.
+ | mov TAB:TMPR, TAB:RB->metatable
+ | test TAB:TMPR, TAB:TMPR
+ | jz <1
+ | test byte TAB:TMPR->nomm, 1<<MM_index
+ | jz ->vmeta_tgetb // 'no __index' flag NOT set: check.
+ | jmp <1
+ break;
+ case BC_TGETR:
+ | ins_ABC // RA = dst, RB = table, RC = key
+ | mov TAB:RB, [BASE+RB*8]
+ | cleartp TAB:RB
+ |.if DUALNUM
+ | mov RCd, dword [BASE+RC*8]
+ |.else
+ | cvttsd2si RCd, qword [BASE+RC*8]
+ |.endif
+ | cmp RCd, TAB:RB->asize
+ | jae ->vmeta_tgetr // Not in array part? Use fallback.
+ | shl RCd, 3
+ | add RC, TAB:RB->array
+ | // Get array slot.
+ |->BC_TGETR_Z:
+ | mov ITYPE, [RC]
+ |->BC_TGETR2_Z:
+ | mov [BASE+RA*8], ITYPE
+ | ins_next
+ break;
+
+ case BC_TSETV:
+ | ins_ABC // RA = src, RB = table, RC = key
+ | mov TAB:RB, [BASE+RB*8]
+ | mov RC, [BASE+RC*8]
+ | checktab TAB:RB, ->vmeta_tsetv
+ |
+ | // Integer key?
+ |.if DUALNUM
+ | checkint RC, >5
+ |.else
+ | // Convert number to int and back and compare.
+ | checknum RC, >5
+ | movd xmm0, RC
+ | cvttsd2si RCd, xmm0
+ | cvtsi2sd xmm1, RCd
+ | ucomisd xmm0, xmm1
+ | jne ->vmeta_tsetv // Generic numeric key? Use fallback.
+ |.endif
+ | cmp RCd, TAB:RB->asize // Takes care of unordered, too.
+ | jae ->vmeta_tsetv
+ | shl RCd, 3
+ | add RC, TAB:RB->array
+ | cmp aword [RC], LJ_TNIL
+ | je >3 // Previous value is nil?
+ |1:
+ | test byte TAB:RB->marked, LJ_GC_BLACK // isblack(table)
+ | jnz >7
+ |2: // Set array slot.
+ | mov RB, [BASE+RA*8]
+ | mov [RC], RB
+ | ins_next
+ |
+ |3: // Check for __newindex if previous value is nil.
+ | mov TAB:TMPR, TAB:RB->metatable
+ | test TAB:TMPR, TAB:TMPR
+ | jz <1
+ | test byte TAB:TMPR->nomm, 1<<MM_newindex
+ | jz ->vmeta_tsetv // 'no __newindex' flag NOT set: check.
+ | jmp <1
+ |
+ |5: // String key?
+ | cmp ITYPEd, LJ_TSTR; jne ->vmeta_tsetv
+ | cleartp STR:RC
+ | jmp ->BC_TSETS_Z
+ |
+ |7: // Possible table write barrier for the value. Skip valiswhite check.
+ | barrierback TAB:RB, TMPR
+ | jmp <2
+ break;
+ case BC_TSETS:
+ | ins_ABC // RA = src, RB = table, RC = str const (~)
+ | mov TAB:RB, [BASE+RB*8]
+ | not RC
+ | mov STR:RC, [KBASE+RC*8]
+ | checktab TAB:RB, ->vmeta_tsets
+ |->BC_TSETS_Z: // RB = GCtab *, RC = GCstr *
+ | mov TMPRd, TAB:RB->hmask
+ | and TMPRd, STR:RC->hash
+ | imul TMPRd, #NODE
+ | mov byte TAB:RB->nomm, 0 // Clear metamethod cache.
+ | add NODE:TMPR, TAB:RB->node
+ | settp ITYPE, STR:RC, LJ_TSTR
+ |1:
+ | cmp NODE:TMPR->key, ITYPE
+ | jne >5
+ | // Ok, key found. Assumes: offsetof(Node, val) == 0
+ | cmp aword [TMPR], LJ_TNIL
+ | je >4 // Previous value is nil?
+ |2:
+ | test byte TAB:RB->marked, LJ_GC_BLACK // isblack(table)
+ | jnz >7
+ |3: // Set node value.
+ | mov ITYPE, [BASE+RA*8]
+ | mov [TMPR], ITYPE
+ | ins_next
+ |
+ |4: // Check for __newindex if previous value is nil.
+ | mov TAB:ITYPE, TAB:RB->metatable
+ | test TAB:ITYPE, TAB:ITYPE
+ | jz <2
+ | test byte TAB:ITYPE->nomm, 1<<MM_newindex
+ | jz ->vmeta_tsets // 'no __newindex' flag NOT set: check.
+ | jmp <2
+ |
+ |5: // Follow hash chain.
+ | mov NODE:TMPR, NODE:TMPR->next
+ | test NODE:TMPR, NODE:TMPR
+ | jnz <1
+ | // End of hash chain: key not found, add a new one.
+ |
+ | // But check for __newindex first.
+ | mov TAB:TMPR, TAB:RB->metatable
+ | test TAB:TMPR, TAB:TMPR
+ | jz >6 // No metatable: continue.
+ | test byte TAB:TMPR->nomm, 1<<MM_newindex
+ | jz ->vmeta_tsets // 'no __newindex' flag NOT set: check.
+ |6:
+ | mov TMP1, ITYPE
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE
+ | lea CARG3, TMP1
+ | mov CARG2, TAB:RB
+ | mov SAVE_PC, PC
+ | call extern lj_tab_newkey // (lua_State *L, GCtab *t, TValue *k)
+ | // Handles write barrier for the new key. TValue * returned in eax (RC).
+ | mov L:CARG1, SAVE_L
+ | mov BASE, L:CARG1->base
+ | mov TMPR, rax
+ | movzx RAd, PC_RA
+ | jmp <2 // Must check write barrier for value.
+ |
+ |7: // Possible table write barrier for the value. Skip valiswhite check.
+ | barrierback TAB:RB, ITYPE
+ | jmp <3
+ break;
+ case BC_TSETB:
+ | ins_ABC // RA = src, RB = table, RC = byte literal
+ | mov TAB:RB, [BASE+RB*8]
+ | checktab TAB:RB, ->vmeta_tsetb
+ | cmp RCd, TAB:RB->asize
+ | jae ->vmeta_tsetb
+ | shl RCd, 3
+ | add RC, TAB:RB->array
+ | cmp aword [RC], LJ_TNIL
+ | je >3 // Previous value is nil?
+ |1:
+ | test byte TAB:RB->marked, LJ_GC_BLACK // isblack(table)
+ | jnz >7
+ |2: // Set array slot.
+ | mov ITYPE, [BASE+RA*8]
+ | mov [RC], ITYPE
+ | ins_next
+ |
+ |3: // Check for __newindex if previous value is nil.
+ | mov TAB:TMPR, TAB:RB->metatable
+ | test TAB:TMPR, TAB:TMPR
+ | jz <1
+ | test byte TAB:TMPR->nomm, 1<<MM_newindex
+ | jz ->vmeta_tsetb // 'no __newindex' flag NOT set: check.
+ | jmp <1
+ |
+ |7: // Possible table write barrier for the value. Skip valiswhite check.
+ | barrierback TAB:RB, TMPR
+ | jmp <2
+ break;
+ case BC_TSETR:
+ | ins_ABC // RA = src, RB = table, RC = key
+ | mov TAB:RB, [BASE+RB*8]
+ | cleartp TAB:RB
+ |.if DUALNUM
+ | mov RC, [BASE+RC*8]
+ |.else
+ | cvttsd2si RCd, qword [BASE+RC*8]
+ |.endif
+ | test byte TAB:RB->marked, LJ_GC_BLACK // isblack(table)
+ | jnz >7
+ |2:
+ | cmp RCd, TAB:RB->asize
+ | jae ->vmeta_tsetr
+ | shl RCd, 3
+ | add RC, TAB:RB->array
+ | // Set array slot.
+ |->BC_TSETR_Z:
+ | mov ITYPE, [BASE+RA*8]
+ | mov [RC], ITYPE
+ | ins_next
+ |
+ |7: // Possible table write barrier for the value. Skip valiswhite check.
+ | barrierback TAB:RB, TMPR
+ | jmp <2
+ break;
+
+ case BC_TSETM:
+ | ins_AD // RA = base (table at base-1), RD = num const (start index)
+ |1:
+ | mov TMPRd, dword [KBASE+RD*8] // Integer constant is in lo-word.
+ | lea RA, [BASE+RA*8]
+ | mov TAB:RB, [RA-8] // Guaranteed to be a table.
+ | cleartp TAB:RB
+ | test byte TAB:RB->marked, LJ_GC_BLACK // isblack(table)
+ | jnz >7
+ |2:
+ | mov RDd, MULTRES
+ | sub RDd, 1
+ | jz >4 // Nothing to copy?
+ | add RDd, TMPRd // Compute needed size.
+ | cmp RDd, TAB:RB->asize
+ | ja >5 // Doesn't fit into array part?
+ | sub RDd, TMPRd
+ | shl TMPRd, 3
+ | add TMPR, TAB:RB->array
+ |3: // Copy result slots to table.
+ | mov RB, [RA]
+ | add RA, 8
+ | mov [TMPR], RB
+ | add TMPR, 8
+ | sub RDd, 1
+ | jnz <3
+ |4:
+ | ins_next
+ |
+ |5: // Need to resize array part.
+ | mov L:CARG1, SAVE_L
+ | mov L:CARG1->base, BASE // Caveat: CARG2/CARG3 may be BASE.
+ | mov CARG2, TAB:RB
+ | mov CARG3d, RDd
+ | mov L:RB, L:CARG1
+ | mov SAVE_PC, PC
+ | call extern lj_tab_reasize // (lua_State *L, GCtab *t, int nasize)
+ | mov BASE, L:RB->base
+ | movzx RAd, PC_RA // Restore RA.
+ | movzx RDd, PC_RD // Restore RD.
+ | jmp <1 // Retry.
+ |
+ |7: // Possible table write barrier for any value. Skip valiswhite check.
+ | barrierback TAB:RB, RD
+ | jmp <2
+ break;
+
+ /* -- Calls and vararg handling ----------------------------------------- */
+
+ case BC_CALL: case BC_CALLM:
+ | ins_A_C // RA = base, (RB = nresults+1,) RC = nargs+1 | extra_nargs
+ if (op == BC_CALLM) {
+ | add NARGS:RDd, MULTRES
+ }
+ | mov LFUNC:RB, [BASE+RA*8]
+ | checkfunc LFUNC:RB, ->vmeta_call_ra
+ | lea BASE, [BASE+RA*8+16]
+ | ins_call
+ break;
+
+ case BC_CALLMT:
+ | ins_AD // RA = base, RD = extra_nargs
+ | add NARGS:RDd, MULTRES
+ | // Fall through. Assumes BC_CALLT follows and ins_AD is a no-op.
+ break;
+ case BC_CALLT:
+ | ins_AD // RA = base, RD = nargs+1
+ | lea RA, [BASE+RA*8+16]
+ | mov KBASE, BASE // Use KBASE for move + vmeta_call hint.
+ | mov LFUNC:RB, [RA-16]
+ | checktp_nc LFUNC:RB, LJ_TFUNC, ->vmeta_call
+ |->BC_CALLT_Z:
+ | mov PC, [BASE-8]
+ | test PCd, FRAME_TYPE
+ | jnz >7
+ |1:
+ | mov [BASE-16], LFUNC:RB // Copy func+tag down, reloaded below.
+ | mov MULTRES, NARGS:RDd
+ | sub NARGS:RDd, 1
+ | jz >3
+ |2: // Move args down.
+ | mov RB, [RA]
+ | add RA, 8
+ | mov [KBASE], RB
+ | add KBASE, 8
+ | sub NARGS:RDd, 1
+ | jnz <2
+ |
+ | mov LFUNC:RB, [BASE-16]
+ |3:
+ | cleartp LFUNC:RB
+ | mov NARGS:RDd, MULTRES
+ | cmp byte LFUNC:RB->ffid, 1 // (> FF_C) Calling a fast function?
+ | ja >5
+ |4:
+ | ins_callt
+ |
+ |5: // Tailcall to a fast function.
+ | test PCd, FRAME_TYPE // Lua frame below?
+ | jnz <4
+ | movzx RAd, PC_RA
+ | neg RA
+ | mov LFUNC:KBASE, [BASE+RA*8-32] // Need to prepare KBASE.
+ | cleartp LFUNC:KBASE
+ | mov KBASE, LFUNC:KBASE->pc
+ | mov KBASE, [KBASE+PC2PROTO(k)]
+ | jmp <4
+ |
+ |7: // Tailcall from a vararg function.
+ | sub PC, FRAME_VARG
+ | test PCd, FRAME_TYPEP
+ | jnz >8 // Vararg frame below?
+ | sub BASE, PC // Need to relocate BASE/KBASE down.
+ | mov KBASE, BASE
+ | mov PC, [BASE-8]
+ | jmp <1
+ |8:
+ | add PCd, FRAME_VARG
+ | jmp <1
+ break;
+
+ case BC_ITERC:
+ | ins_A // RA = base, (RB = nresults+1,) RC = nargs+1 (2+1)
+ | lea RA, [BASE+RA*8+16] // fb = base+2
+ | mov RB, [RA-32] // Copy state. fb[0] = fb[-4].
+ | mov RC, [RA-24] // Copy control var. fb[1] = fb[-3].
+ | mov [RA], RB
+ | mov [RA+8], RC
+ | mov LFUNC:RB, [RA-40] // Copy callable. fb[-1] = fb[-5]
+ | mov [RA-16], LFUNC:RB
+ | mov NARGS:RDd, 2+1 // Handle like a regular 2-arg call.
+ | checkfunc LFUNC:RB, ->vmeta_call
+ | mov BASE, RA
+ | ins_call
+ break;
+
+ case BC_ITERN:
+ | ins_A // RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+ |.if JIT
+ | // NYI: add hotloop, record BC_ITERN.
+ |.endif
+ | mov TAB:RB, [BASE+RA*8-16]
+ | cleartp TAB:RB
+ | mov RCd, [BASE+RA*8-8] // Get index from control var.
+ | mov TMPRd, TAB:RB->asize
+ | add PC, 4
+ | mov ITYPE, TAB:RB->array
+ |1: // Traverse array part.
+ | cmp RCd, TMPRd; jae >5 // Index points after array part?
+ | cmp aword [ITYPE+RC*8], LJ_TNIL; je >4
+ |.if not DUALNUM
+ | cvtsi2sd xmm0, RCd
+ |.endif
+ | // Copy array slot to returned value.
+ | mov RB, [ITYPE+RC*8]
+ | mov [BASE+RA*8+8], RB
+ | // Return array index as a numeric key.
+ |.if DUALNUM
+ | setint ITYPE, RC
+ | mov [BASE+RA*8], ITYPE
+ |.else
+ | movsd qword [BASE+RA*8], xmm0
+ |.endif
+ | add RCd, 1
+ | mov [BASE+RA*8-8], RCd // Update control var.
+ |2:
+ | movzx RDd, PC_RD // Get target from ITERL.
+ | branchPC RD
+ |3:
+ | ins_next
+ |
+ |4: // Skip holes in array part.
+ | add RCd, 1
+ | jmp <1
+ |
+ |5: // Traverse hash part.
+ | sub RCd, TMPRd
+ |6:
+ | cmp RCd, TAB:RB->hmask; ja <3 // End of iteration? Branch to ITERL+1.
+ | imul ITYPEd, RCd, #NODE
+ | add NODE:ITYPE, TAB:RB->node
+ | cmp aword NODE:ITYPE->val, LJ_TNIL; je >7
+ | lea TMPRd, [RCd+TMPRd+1]
+ | // Copy key and value from hash slot.
+ | mov RB, NODE:ITYPE->key
+ | mov RC, NODE:ITYPE->val
+ | mov [BASE+RA*8], RB
+ | mov [BASE+RA*8+8], RC
+ | mov [BASE+RA*8-8], TMPRd
+ | jmp <2
+ |
+ |7: // Skip holes in hash part.
+ | add RCd, 1
+ | jmp <6
+ break;
+
+ case BC_ISNEXT:
+ | ins_AD // RA = base, RD = target (points to ITERN)
+ | mov CFUNC:RB, [BASE+RA*8-24]
+ | checkfunc CFUNC:RB, >5
+ | checktptp [BASE+RA*8-16], LJ_TTAB, >5
+ | cmp aword [BASE+RA*8-8], LJ_TNIL; jne >5
+ | cmp byte CFUNC:RB->ffid, FF_next_N; jne >5
+ | branchPC RD
+ | mov64 TMPR, U64x(fffe7fff, 00000000)
+ | mov [BASE+RA*8-8], TMPR // Initialize control var.
+ |1:
+ | ins_next
+ |5: // Despecialize bytecode if any of the checks fail.
+ | mov PC_OP, BC_JMP
+ | branchPC RD
+ | mov byte [PC], BC_ITERC
+ | jmp <1
+ break;
+
+ case BC_VARG:
+ | ins_ABC // RA = base, RB = nresults+1, RC = numparams
+ | lea TMPR, [BASE+RC*8+(16+FRAME_VARG)]
+ | lea RA, [BASE+RA*8]
+ | sub TMPR, [BASE-8]
+ | // Note: TMPR may now be even _above_ BASE if nargs was < numparams.
+ | test RB, RB
+ | jz >5 // Copy all varargs?
+ | lea RB, [RA+RB*8-8]
+ | cmp TMPR, BASE // No vararg slots?
+ | jnb >2
+ |1: // Copy vararg slots to destination slots.
+ | mov RC, [TMPR-16]
+ | add TMPR, 8
+ | mov [RA], RC
+ | add RA, 8
+ | cmp RA, RB // All destination slots filled?
+ | jnb >3
+ | cmp TMPR, BASE // No more vararg slots?
+ | jb <1
+ |2: // Fill up remainder with nil.
+ | mov aword [RA], LJ_TNIL
+ | add RA, 8
+ | cmp RA, RB
+ | jb <2
+ |3:
+ | ins_next
+ |
+ |5: // Copy all varargs.
+ | mov MULTRES, 1 // MULTRES = 0+1
+ | mov RC, BASE
+ | sub RC, TMPR
+ | jbe <3 // No vararg slots?
+ | mov RBd, RCd
+ | shr RBd, 3
+ | add RBd, 1
+ | mov MULTRES, RBd // MULTRES = #varargs+1
+ | mov L:RB, SAVE_L
+ | add RC, RA
+ | cmp RC, L:RB->maxstack
+ | ja >7 // Need to grow stack?
+ |6: // Copy all vararg slots.
+ | mov RC, [TMPR-16]
+ | add TMPR, 8
+ | mov [RA], RC
+ | add RA, 8
+ | cmp TMPR, BASE // No more vararg slots?
+ | jb <6
+ | jmp <3
+ |
+ |7: // Grow stack for varargs.
+ | mov L:RB->base, BASE
+ | mov L:RB->top, RA
+ | mov SAVE_PC, PC
+ | sub TMPR, BASE // Need delta, because BASE may change.
+ | mov TMP1hi, TMPRd
+ | mov CARG2d, MULTRES
+ | sub CARG2d, 1
+ | mov CARG1, L:RB
+ | call extern lj_state_growstack // (lua_State *L, int n)
+ | mov BASE, L:RB->base
+ | movsxd TMPR, TMP1hi
+ | mov RA, L:RB->top
+ | add TMPR, BASE
+ | jmp <6
+ break;
+
+ /* -- Returns ----------------------------------------------------------- */
+
+ case BC_RETM:
+ | ins_AD // RA = results, RD = extra_nresults
+ | add RDd, MULTRES // MULTRES >=1, so RD >=1.
+ | // Fall through. Assumes BC_RET follows and ins_AD is a no-op.
+ break;
+
+ case BC_RET: case BC_RET0: case BC_RET1:
+ | ins_AD // RA = results, RD = nresults+1
+ if (op != BC_RET0) {
+ | shl RAd, 3
+ }
+ |1:
+ | mov PC, [BASE-8]
+ | mov MULTRES, RDd // Save nresults+1.
+ | test PCd, FRAME_TYPE // Check frame type marker.
+ | jnz >7 // Not returning to a fixarg Lua func?
+ switch (op) {
+ case BC_RET:
+ |->BC_RET_Z:
+ | mov KBASE, BASE // Use KBASE for result move.
+ | sub RDd, 1
+ | jz >3
+ |2: // Move results down.
+ | mov RB, [KBASE+RA]
+ | mov [KBASE-16], RB
+ | add KBASE, 8
+ | sub RDd, 1
+ | jnz <2
+ |3:
+ | mov RDd, MULTRES // Note: MULTRES may be >255.
+ | movzx RBd, PC_RB // So cannot compare with RDL!
+ |5:
+ | cmp RBd, RDd // More results expected?
+ | ja >6
+ break;
+ case BC_RET1:
+ | mov RB, [BASE+RA]
+ | mov [BASE-16], RB
+ /* fallthrough */
+ case BC_RET0:
+ |5:
+ | cmp PC_RB, RDL // More results expected?
+ | ja >6
+ default:
+ break;
+ }
+ | movzx RAd, PC_RA
+ | neg RA
+ | lea BASE, [BASE+RA*8-16] // base = base - (RA+2)*8
+ | mov LFUNC:KBASE, [BASE-16]
+ | cleartp LFUNC:KBASE
+ | mov KBASE, LFUNC:KBASE->pc
+ | mov KBASE, [KBASE+PC2PROTO(k)]
+ | ins_next
+ |
+ |6: // Fill up results with nil.
+ if (op == BC_RET) {
+ | mov aword [KBASE-16], LJ_TNIL // Note: relies on shifted base.
+ | add KBASE, 8
+ } else {
+ | mov aword [BASE+RD*8-24], LJ_TNIL
+ }
+ | add RD, 1
+ | jmp <5
+ |
+ |7: // Non-standard return case.
+ | lea RB, [PC-FRAME_VARG]
+ | test RBd, FRAME_TYPEP
+ | jnz ->vm_return
+ | // Return from vararg function: relocate BASE down and RA up.
+ | sub BASE, RB
+ if (op != BC_RET0) {
+ | add RA, RB
+ }
+ | jmp <1
+ break;
+
+ /* -- Loops and branches ------------------------------------------------ */
+
+ |.define FOR_IDX, [RA]
+ |.define FOR_STOP, [RA+8]
+ |.define FOR_STEP, [RA+16]
+ |.define FOR_EXT, [RA+24]
+
+ case BC_FORL:
+ |.if JIT
+ | hotloop RBd
+ |.endif
+ | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
+ break;
+
+ case BC_JFORI:
+ case BC_JFORL:
+#if !LJ_HASJIT
+ break;
+#endif
+ case BC_FORI:
+ case BC_IFORL:
+ vk = (op == BC_IFORL || op == BC_JFORL);
+ | ins_AJ // RA = base, RD = target (after end of loop or start of loop)
+ | lea RA, [BASE+RA*8]
+ if (LJ_DUALNUM) {
+ | mov RB, FOR_IDX
+ | checkint RB, >9
+ | mov TMPR, FOR_STOP
+ if (!vk) {
+ | checkint TMPR, ->vmeta_for
+ | mov ITYPE, FOR_STEP
+ | test ITYPEd, ITYPEd; js >5
+ | sar ITYPE, 47;
+ | cmp ITYPEd, LJ_TISNUM; jne ->vmeta_for
+ } else {
+#ifdef LUA_USE_ASSERT
+ | checkinttp FOR_STOP, ->assert_bad_for_arg_type
+ | checkinttp FOR_STEP, ->assert_bad_for_arg_type
+#endif
+ | mov ITYPE, FOR_STEP
+ | test ITYPEd, ITYPEd; js >5
+ | add RBd, ITYPEd; jo >1
+ | setint RB
+ | mov FOR_IDX, RB
+ }
+ | cmp RBd, TMPRd
+ | mov FOR_EXT, RB
+ if (op == BC_FORI) {
+ | jle >7
+ |1:
+ |6:
+ | branchPC RD
+ } else if (op == BC_JFORI) {
+ | branchPC RD
+ | movzx RDd, PC_RD
+ | jle =>BC_JLOOP
+ |1:
+ |6:
+ } else if (op == BC_IFORL) {
+ | jg >7
+ |6:
+ | branchPC RD
+ |1:
+ } else {
+ | jle =>BC_JLOOP
+ |1:
+ |6:
+ }
+ |7:
+ | ins_next
+ |
+ |5: // Invert check for negative step.
+ if (!vk) {
+ | sar ITYPE, 47;
+ | cmp ITYPEd, LJ_TISNUM; jne ->vmeta_for
+ } else {
+ | add RBd, ITYPEd; jo <1
+ | setint RB
+ | mov FOR_IDX, RB
+ }
+ | cmp RBd, TMPRd
+ | mov FOR_EXT, RB
+ if (op == BC_FORI) {
+ | jge <7
+ } else if (op == BC_JFORI) {
+ | branchPC RD
+ | movzx RDd, PC_RD
+ | jge =>BC_JLOOP
+ } else if (op == BC_IFORL) {
+ | jl <7
+ } else {
+ | jge =>BC_JLOOP
+ }
+ | jmp <6
+ |9: // Fallback to FP variant.
+ if (!vk) {
+ | jae ->vmeta_for
+ }
+ } else if (!vk) {
+ | checknumtp FOR_IDX, ->vmeta_for
+ }
+ if (!vk) {
+ | checknumtp FOR_STOP, ->vmeta_for
+ } else {
+#ifdef LUA_USE_ASSERT
+ | checknumtp FOR_STOP, ->assert_bad_for_arg_type
+ | checknumtp FOR_STEP, ->assert_bad_for_arg_type
+#endif
+ }
+ | mov RB, FOR_STEP
+ if (!vk) {
+ | checknum RB, ->vmeta_for
+ }
+ | movsd xmm0, qword FOR_IDX
+ | movsd xmm1, qword FOR_STOP
+ if (vk) {
+ | addsd xmm0, qword FOR_STEP
+ | movsd qword FOR_IDX, xmm0
+ | test RB, RB; js >3
+ } else {
+ | jl >3
+ }
+ | ucomisd xmm1, xmm0
+ |1:
+ | movsd qword FOR_EXT, xmm0
+ if (op == BC_FORI) {
+ |.if DUALNUM
+ | jnb <7
+ |.else
+ | jnb >2
+ | branchPC RD
+ |.endif
+ } else if (op == BC_JFORI) {
+ | branchPC RD
+ | movzx RDd, PC_RD
+ | jnb =>BC_JLOOP
+ } else if (op == BC_IFORL) {
+ |.if DUALNUM
+ | jb <7
+ |.else
+ | jb >2
+ | branchPC RD
+ |.endif
+ } else {
+ | jnb =>BC_JLOOP
+ }
+ |.if DUALNUM
+ | jmp <6
+ |.else
+ |2:
+ | ins_next
+ |.endif
+ |
+ |3: // Invert comparison if step is negative.
+ | ucomisd xmm0, xmm1
+ | jmp <1
+ break;
+
+ case BC_ITERL:
+ |.if JIT
+ | hotloop RBd
+ |.endif
+ | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
+ break;
+
+ case BC_JITERL:
+#if !LJ_HASJIT
+ break;
+#endif
+ case BC_IITERL:
+ | ins_AJ // RA = base, RD = target
+ | lea RA, [BASE+RA*8]
+ | mov RB, [RA]
+ | cmp RB, LJ_TNIL; je >1 // Stop if iterator returned nil.
+ if (op == BC_JITERL) {
+ | mov [RA-8], RB
+ | jmp =>BC_JLOOP
+ } else {
+ | branchPC RD // Otherwise save control var + branch.
+ | mov [RA-8], RB
+ }
+ |1:
+ | ins_next
+ break;
+
+ case BC_LOOP:
+ | ins_A // RA = base, RD = target (loop extent)
+ | // Note: RA/RD are only used by the trace recorder to determine scope/extent.
+ | // This opcode does NOT jump; its only purpose is to detect a hot loop.
+ |.if JIT
+ | hotloop RBd
+ |.endif
+ | // Fall through. Assumes BC_ILOOP follows and ins_A is a no-op.
+ break;
+
+ case BC_ILOOP:
+ | ins_A // RA = base, RD = target (loop extent)
+ | ins_next
+ break;
+
+ case BC_JLOOP:
+ |.if JIT
+ | ins_AD // RA = base (ignored), RD = traceno
+ | mov RA, [DISPATCH+DISPATCH_J(trace)]
+ | mov TRACE:RD, [RA+RD*8]
+ | mov RD, TRACE:RD->mcode
+ | mov L:RB, SAVE_L
+ | mov [DISPATCH+DISPATCH_GL(jit_base)], BASE
+ | mov [DISPATCH+DISPATCH_GL(tmpbuf.L)], L:RB
+ | // Save additional callee-save registers only used in compiled code.
+ |.if X64WIN
+ | mov CSAVE_4, r12
+ | mov CSAVE_3, r13
+ | mov CSAVE_2, r14
+ | mov CSAVE_1, r15
+ | mov RA, rsp
+ | sub rsp, 10*16+4*8
+ | movdqa [RA-1*16], xmm6
+ | movdqa [RA-2*16], xmm7
+ | movdqa [RA-3*16], xmm8
+ | movdqa [RA-4*16], xmm9
+ | movdqa [RA-5*16], xmm10
+ | movdqa [RA-6*16], xmm11
+ | movdqa [RA-7*16], xmm12
+ | movdqa [RA-8*16], xmm13
+ | movdqa [RA-9*16], xmm14
+ | movdqa [RA-10*16], xmm15
+ |.else
+ | sub rsp, 16
+ | mov [rsp+16], r12
+ | mov [rsp+8], r13
+ |.endif
+ | jmp RD
+ |.endif
+ break;
+
+ case BC_JMP:
+ | ins_AJ // RA = unused, RD = target
+ | branchPC RD
+ | ins_next
+ break;
+
+ /* -- Function headers -------------------------------------------------- */
+
+ /*
+ ** Reminder: A function may be called with func/args above L->maxstack,
+ ** i.e. occupying EXTRA_STACK slots. And vmeta_call may add one extra slot,
+ ** too. This means all FUNC* ops (including fast functions) must check
+ ** for stack overflow _before_ adding more slots!
+ */
+
+ case BC_FUNCF:
+ |.if JIT
+ | hotcall RBd
+ |.endif
+ case BC_FUNCV: /* NYI: compiled vararg functions. */
+ | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
+ break;
+
+ case BC_JFUNCF:
+#if !LJ_HASJIT
+ break;
+#endif
+ case BC_IFUNCF:
+ | ins_AD // BASE = new base, RA = framesize, RD = nargs+1
+ | mov KBASE, [PC-4+PC2PROTO(k)]
+ | mov L:RB, SAVE_L
+ | lea RA, [BASE+RA*8] // Top of frame.
+ | cmp RA, L:RB->maxstack
+ | ja ->vm_growstack_f
+ | movzx RAd, byte [PC-4+PC2PROTO(numparams)]
+ | cmp NARGS:RDd, RAd // Check for missing parameters.
+ | jbe >3
+ |2:
+ if (op == BC_JFUNCF) {
+ | movzx RDd, PC_RD
+ | jmp =>BC_JLOOP
+ } else {
+ | ins_next
+ }
+ |
+ |3: // Clear missing parameters.
+ | mov aword [BASE+NARGS:RD*8-8], LJ_TNIL
+ | add NARGS:RDd, 1
+ | cmp NARGS:RDd, RAd
+ | jbe <3
+ | jmp <2
+ break;
+
+ case BC_JFUNCV:
+#if !LJ_HASJIT
+ break;
+#endif
+ | int3 // NYI: compiled vararg functions
+ break; /* NYI: compiled vararg functions. */
+
+ case BC_IFUNCV:
+ | ins_AD // BASE = new base, RA = framesize, RD = nargs+1
+ | lea RBd, [NARGS:RD*8+FRAME_VARG+8]
+ | lea RD, [BASE+NARGS:RD*8+8]
+ | mov LFUNC:KBASE, [BASE-16]
+ | mov [RD-8], RB // Store delta + FRAME_VARG.
+ | mov [RD-16], LFUNC:KBASE // Store copy of LFUNC.
+ | mov L:RB, SAVE_L
+ | lea RA, [RD+RA*8]
+ | cmp RA, L:RB->maxstack
+ | ja ->vm_growstack_v // Need to grow stack.
+ | mov RA, BASE
+ | mov BASE, RD
+ | movzx RBd, byte [PC-4+PC2PROTO(numparams)]
+ | test RBd, RBd
+ | jz >2
+ | add RA, 8
+ |1: // Copy fixarg slots up to new frame.
+ | add RA, 8
+ | cmp RA, BASE
+ | jnb >3 // Less args than parameters?
+ | mov KBASE, [RA-16]
+ | mov [RD], KBASE
+ | add RD, 8
+ | mov aword [RA-16], LJ_TNIL // Clear old fixarg slot (help the GC).
+ | sub RBd, 1
+ | jnz <1
+ |2:
+ if (op == BC_JFUNCV) {
+ | movzx RDd, PC_RD
+ | jmp =>BC_JLOOP
+ } else {
+ | mov KBASE, [PC-4+PC2PROTO(k)]
+ | ins_next
+ }
+ |
+ |3: // Clear missing parameters.
+ | mov aword [RD], LJ_TNIL
+ | add RD, 8
+ | sub RBd, 1
+ | jnz <3
+ | jmp <2
+ break;
+
+ case BC_FUNCC:
+ case BC_FUNCCW:
+ | ins_AD // BASE = new base, RA = ins RA|RD (unused), RD = nargs+1
+ | mov CFUNC:RB, [BASE-16]
+ | cleartp CFUNC:RB
+ | mov KBASE, CFUNC:RB->f
+ | mov L:RB, SAVE_L
+ | lea RD, [BASE+NARGS:RD*8-8]
+ | mov L:RB->base, BASE
+ | lea RA, [RD+8*LUA_MINSTACK]
+ | cmp RA, L:RB->maxstack
+ | mov L:RB->top, RD
+ if (op == BC_FUNCC) {
+ | mov CARG1, L:RB // Caveat: CARG1 may be RA.
+ } else {
+ | mov CARG2, KBASE
+ | mov CARG1, L:RB // Caveat: CARG1 may be RA.
+ }
+ | ja ->vm_growstack_c // Need to grow stack.
+ | set_vmstate C
+ if (op == BC_FUNCC) {
+ | call KBASE // (lua_State *L)
+ } else {
+ | // (lua_State *L, lua_CFunction f)
+ | call aword [DISPATCH+DISPATCH_GL(wrapf)]
+ }
+ | // nresults returned in eax (RD).
+ | mov BASE, L:RB->base
+ | mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+ | set_vmstate INTERP
+ | lea RA, [BASE+RD*8]
+ | neg RA
+ | add RA, L:RB->top // RA = (L->top-(L->base+nresults))*8
+ | mov PC, [BASE-8] // Fetch PC of caller.
+ | jmp ->vm_returnc
+ break;
+
+ /* ---------------------------------------------------------------------- */
+
+ default:
+ fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
+ exit(2);
+ break;
+ }
+}
+
+static int build_backend(BuildCtx *ctx) /* Emits the subroutines, then code for every opcode; returns BC__MAX. */
+{
+ int op;
+ dasm_growpc(Dst, BC__MAX); /* Grow the DynASM PC label table to BC__MAX entries (one per opcode). */
+ build_subroutines(ctx); /* Shared subroutines are emitted before the per-opcode code. */
+ |.code_op
+ for (op = 0; op < BC__MAX; op++) /* Visit opcodes in numeric order. */
+ build_ins(ctx, (BCOp)op, op); /* The opcode number is also passed as the third argument. */
+ return BC__MAX;
+}
+
+/* Emit pseudo frame-info for all assembler functions (ELF asm, and Mach-O asm unless LJ_NO_UNWIND; other modes emit nothing). */
+static void emit_asm_debug(BuildCtx *ctx)
+{
+ int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code); /* Byte offset of vm_ffi_call from the start of the code. */
+ switch (ctx->mode) {
+ case BUILD_elfasm:
+ fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
+ fprintf(ctx->fp,
+ ".Lframe0:\n"
+ "\t.long .LECIE0-.LSCIE0\n"
+ ".LSCIE0:\n"
+ "\t.long 0xffffffff\n"
+ "\t.byte 0x1\n"
+ "\t.string \"\"\n"
+ "\t.uleb128 0x1\n"
+ "\t.sleb128 -8\n"
+ "\t.byte 0x10\n"
+ "\t.byte 0xc\n\t.uleb128 0x7\n\t.uleb128 8\n"
+ "\t.byte 0x80+0x10\n\t.uleb128 0x1\n"
+ "\t.align 8\n"
+ ".LECIE0:\n\n");
+ fprintf(ctx->fp,
+ ".LSFDE0:\n"
+ "\t.long .LEFDE0-.LASFDE0\n"
+ ".LASFDE0:\n"
+ "\t.long .Lframe0\n"
+ "\t.quad .Lbegin\n"
+ "\t.quad %d\n" /* fcofs: range from .Lbegin up to lj_vm_ffi_call */
+ "\t.byte 0xe\n\t.uleb128 %d\n" /* def_cfa_offset */
+ "\t.byte 0x86\n\t.uleb128 0x2\n" /* offset rbp */
+ "\t.byte 0x83\n\t.uleb128 0x3\n" /* offset rbx */
+ "\t.byte 0x8f\n\t.uleb128 0x4\n" /* offset r15 */
+ "\t.byte 0x8e\n\t.uleb128 0x5\n" /* offset r14 */
+#if LJ_NO_UNWIND
+ "\t.byte 0x8d\n\t.uleb128 0x6\n" /* offset r13 */
+ "\t.byte 0x8c\n\t.uleb128 0x7\n" /* offset r12 */
+#endif
+ "\t.align 8\n"
+ ".LEFDE0:\n\n", fcofs, CFRAME_SIZE);
+#if LJ_HASFFI
+ fprintf(ctx->fp,
+ ".LSFDE1:\n"
+ "\t.long .LEFDE1-.LASFDE1\n"
+ ".LASFDE1:\n"
+ "\t.long .Lframe0\n"
+ "\t.quad lj_vm_ffi_call\n"
+ "\t.quad %d\n" /* codesz - fcofs: from lj_vm_ffi_call to the end of the code */
+ "\t.byte 0xe\n\t.uleb128 16\n" /* def_cfa_offset */
+ "\t.byte 0x86\n\t.uleb128 0x2\n" /* offset rbp */
+ "\t.byte 0xd\n\t.uleb128 0x6\n" /* def_cfa_register rbp */
+ "\t.byte 0x83\n\t.uleb128 0x3\n" /* offset rbx */
+ "\t.align 8\n"
+ ".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
+#endif
+#if !LJ_NO_UNWIND
+#if (defined(__sun__) && defined(__svr4__))
+ fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@unwind\n");
+#else
+ fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@progbits\n");
+#endif
+ fprintf(ctx->fp,
+ ".Lframe1:\n"
+ "\t.long .LECIE1-.LSCIE1\n"
+ ".LSCIE1:\n"
+ "\t.long 0\n"
+ "\t.byte 0x1\n"
+ "\t.string \"zPR\"\n"
+ "\t.uleb128 0x1\n"
+ "\t.sleb128 -8\n"
+ "\t.byte 0x10\n"
+ "\t.uleb128 6\n" /* augmentation length */
+ "\t.byte 0x1b\n" /* pcrel|sdata4 */
+ "\t.long lj_err_unwind_dwarf-.\n"
+ "\t.byte 0x1b\n" /* pcrel|sdata4 */
+ "\t.byte 0xc\n\t.uleb128 0x7\n\t.uleb128 8\n"
+ "\t.byte 0x80+0x10\n\t.uleb128 0x1\n"
+ "\t.align 8\n"
+ ".LECIE1:\n\n");
+ fprintf(ctx->fp,
+ ".LSFDE2:\n"
+ "\t.long .LEFDE2-.LASFDE2\n"
+ ".LASFDE2:\n"
+ "\t.long .LASFDE2-.Lframe1\n"
+ "\t.long .Lbegin-.\n"
+ "\t.long %d\n" /* fcofs: range from .Lbegin up to lj_vm_ffi_call */
+ "\t.uleb128 0\n" /* augmentation length */
+ "\t.byte 0xe\n\t.uleb128 %d\n" /* def_cfa_offset */
+ "\t.byte 0x86\n\t.uleb128 0x2\n" /* offset rbp */
+ "\t.byte 0x83\n\t.uleb128 0x3\n" /* offset rbx */
+ "\t.byte 0x8f\n\t.uleb128 0x4\n" /* offset r15 */
+ "\t.byte 0x8e\n\t.uleb128 0x5\n" /* offset r14 */
+ "\t.align 8\n"
+ ".LEFDE2:\n\n", fcofs, CFRAME_SIZE);
+#if LJ_HASFFI
+ fprintf(ctx->fp,
+ ".Lframe2:\n"
+ "\t.long .LECIE2-.LSCIE2\n"
+ ".LSCIE2:\n"
+ "\t.long 0\n"
+ "\t.byte 0x1\n"
+ "\t.string \"zR\"\n"
+ "\t.uleb128 0x1\n"
+ "\t.sleb128 -8\n"
+ "\t.byte 0x10\n"
+ "\t.uleb128 1\n" /* augmentation length */
+ "\t.byte 0x1b\n" /* pcrel|sdata4 */
+ "\t.byte 0xc\n\t.uleb128 0x7\n\t.uleb128 8\n"
+ "\t.byte 0x80+0x10\n\t.uleb128 0x1\n"
+ "\t.align 8\n"
+ ".LECIE2:\n\n");
+ fprintf(ctx->fp,
+ ".LSFDE3:\n"
+ "\t.long .LEFDE3-.LASFDE3\n"
+ ".LASFDE3:\n"
+ "\t.long .LASFDE3-.Lframe2\n"
+ "\t.long lj_vm_ffi_call-.\n"
+ "\t.long %d\n" /* codesz - fcofs: from lj_vm_ffi_call to the end of the code */
+ "\t.uleb128 0\n" /* augmentation length */
+ "\t.byte 0xe\n\t.uleb128 16\n" /* def_cfa_offset */
+ "\t.byte 0x86\n\t.uleb128 0x2\n" /* offset rbp */
+ "\t.byte 0xd\n\t.uleb128 0x6\n" /* def_cfa_register rbp */
+ "\t.byte 0x83\n\t.uleb128 0x3\n" /* offset rbx */
+ "\t.align 8\n"
+ ".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
+#endif
+#endif
+ break;
+#if !LJ_NO_UNWIND
+ /* Mental note: never let Apple design an assembler.
+ ** Or a linker. Or a plastic case. But I digress.
+ */
+ case BUILD_machasm: {
+#if LJ_HASFFI
+ int fcsize = 0; /* Size of _lj_vm_ffi_call; stays 0 if that symbol is not seen in the loop. */
+#endif
+ int i;
+ fprintf(ctx->fp, "\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support\n");
+ fprintf(ctx->fp,
+ "EH_frame1:\n"
+ "\t.set L$set$x,LECIEX-LSCIEX\n"
+ "\t.long L$set$x\n"
+ "LSCIEX:\n"
+ "\t.long 0\n"
+ "\t.byte 0x1\n"
+ "\t.ascii \"zPR\\0\"\n"
+ "\t.byte 0x1\n"
+ "\t.byte 128-8\n"
+ "\t.byte 0x10\n"
+ "\t.byte 6\n" /* augmentation length */
+ "\t.byte 0x9b\n" /* indirect|pcrel|sdata4 */
+ "\t.long _lj_err_unwind_dwarf+4@GOTPCREL\n"
+ "\t.byte 0x1b\n" /* pcrel|sdata4 */
+ "\t.byte 0xc\n\t.byte 0x7\n\t.byte 8\n"
+ "\t.byte 0x80+0x10\n\t.byte 0x1\n"
+ "\t.align 3\n"
+ "LECIEX:\n\n");
+ for (i = 0; i < ctx->nsym; i++) { /* One FDE per symbol, numbered by its index i. */
+ const char *name = ctx->sym[i].name;
+ int32_t size = ctx->sym[i+1].ofs - ctx->sym[i].ofs; /* Distance to next symbol; reads sym[nsym] on the last pass -- assumes a trailing entry exists, TODO confirm. */
+ if (size == 0) continue; /* No FDE for zero-sized symbols. */
+#if LJ_HASFFI
+ if (!strcmp(name, "_lj_vm_ffi_call")) { fcsize = size; continue; } /* Gets its own CIE/FDE after the loop. */
+#endif
+ fprintf(ctx->fp,
+ "%s.eh:\n"
+ "LSFDE%d:\n"
+ "\t.set L$set$%d,LEFDE%d-LASFDE%d\n"
+ "\t.long L$set$%d\n"
+ "LASFDE%d:\n"
+ "\t.long LASFDE%d-EH_frame1\n"
+ "\t.long %s-.\n"
+ "\t.long %d\n" /* size of this symbol */
+ "\t.byte 0\n" /* augmentation length */
+ "\t.byte 0xe\n\t.byte %d\n" /* def_cfa_offset */
+ "\t.byte 0x86\n\t.byte 0x2\n" /* offset rbp */
+ "\t.byte 0x83\n\t.byte 0x3\n" /* offset rbx */
+ "\t.byte 0x8f\n\t.byte 0x4\n" /* offset r15 */
+ "\t.byte 0x8e\n\t.byte 0x5\n" /* offset r14 */
+ "\t.align 3\n"
+ "LEFDE%d:\n\n",
+ name, i, i, i, i, i, i, i, name, size, CFRAME_SIZE, i);
+ }
+#if LJ_HASFFI
+ if (fcsize) { /* Only when _lj_vm_ffi_call was found with a non-zero size. */
+ fprintf(ctx->fp,
+ "EH_frame2:\n"
+ "\t.set L$set$y,LECIEY-LSCIEY\n"
+ "\t.long L$set$y\n"
+ "LSCIEY:\n"
+ "\t.long 0\n"
+ "\t.byte 0x1\n"
+ "\t.ascii \"zR\\0\"\n"
+ "\t.byte 0x1\n"
+ "\t.byte 128-8\n"
+ "\t.byte 0x10\n"
+ "\t.byte 1\n" /* augmentation length */
+ "\t.byte 0x1b\n" /* pcrel|sdata4 */
+ "\t.byte 0xc\n\t.byte 0x7\n\t.byte 8\n"
+ "\t.byte 0x80+0x10\n\t.byte 0x1\n"
+ "\t.align 3\n"
+ "LECIEY:\n\n");
+ fprintf(ctx->fp,
+ "_lj_vm_ffi_call.eh:\n"
+ "LSFDEY:\n"
+ "\t.set L$set$yy,LEFDEY-LASFDEY\n"
+ "\t.long L$set$yy\n"
+ "LASFDEY:\n"
+ "\t.long LASFDEY-EH_frame2\n"
+ "\t.long _lj_vm_ffi_call-.\n"
+ "\t.long %d\n" /* fcsize recorded in the loop above */
+ "\t.byte 0\n" /* augmentation length */
+ "\t.byte 0xe\n\t.byte 16\n" /* def_cfa_offset */
+ "\t.byte 0x86\n\t.byte 0x2\n" /* offset rbp */
+ "\t.byte 0xd\n\t.byte 0x6\n" /* def_cfa_register rbp */
+ "\t.byte 0x83\n\t.byte 0x3\n" /* offset rbx */
+ "\t.align 3\n"
+ "LEFDEY:\n\n", fcsize);
+ }
+#endif
+ fprintf(ctx->fp, ".subsections_via_symbols\n");
+ }
+ break;
+#endif
+ default: /* Difficult for other modes. */
+ break;
+ }
+}
+
diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
index ea0415e..96ac1da 100644
--- a/src/vm_x86.dasc
+++ b/src/vm_x86.dasc
@@ -222,6 +222,9 @@
|.define CFRAME_SPACE, aword*5 // Delta for rsp (see <--).
|.macro saveregs_
| push rbx; push r15; push r14
+|.if NO_UNWIND
+| push r13; push r12
+|.endif
| sub rsp, CFRAME_SPACE
|.endmacro
|.macro saveregs
@@ -229,15 +232,28 @@
|.endmacro
|.macro restoreregs
| add rsp, CFRAME_SPACE
+|.if NO_UNWIND
+| pop r12; pop r13
+|.endif
| pop r14; pop r15; pop rbx; pop rbp
|.endmacro
|
|//----- 16 byte aligned,
+|.if NO_UNWIND
+|.define SAVE_RET, aword [rsp+aword*11] //<-- rsp entering interpreter.
+|.define SAVE_R4, aword [rsp+aword*10]
+|.define SAVE_R3, aword [rsp+aword*9]
+|.define SAVE_R2, aword [rsp+aword*8]
+|.define SAVE_R1, aword [rsp+aword*7]
+|.define SAVE_RU2, aword [rsp+aword*6]
+|.define SAVE_RU1, aword [rsp+aword*5] //<-- rsp after register saves.
+|.else
|.define SAVE_RET, aword [rsp+aword*9] //<-- rsp entering interpreter.
|.define SAVE_R4, aword [rsp+aword*8]
|.define SAVE_R3, aword [rsp+aword*7]
|.define SAVE_R2, aword [rsp+aword*6]
|.define SAVE_R1, aword [rsp+aword*5] //<-- rsp after register saves.
+|.endif
|.define SAVE_CFRAME, aword [rsp+aword*4]
|.define SAVE_PC, dword [rsp+dword*7]
|.define SAVE_L, dword [rsp+dword*6]
@@ -2348,10 +2364,8 @@ static void build_subroutines(BuildCtx *ctx)
|
|//-- Bit library --------------------------------------------------------
|
- |.define TOBIT_BIAS, 0x59c00000 // 2^52 + 2^51 (float, not double!).
- |
- |.macro .ffunc_bit, name, kind
- | .ffunc_1 name
+ |.macro .ffunc_bit, name, kind, fdef
+ | fdef name
|.if kind == 2
| sseconst_tobit xmm1, RBa
|.endif
@@ -2378,6 +2392,10 @@ static void build_subroutines(BuildCtx *ctx)
|2:
|.endmacro
|
+ |.macro .ffunc_bit, name, kind
+ | .ffunc_bit name, kind, .ffunc_1
+ |.endmacro
+ |
|.ffunc_bit bit_tobit, 0
| jmp ->fff_resbit
|
@@ -2431,7 +2449,7 @@ static void build_subroutines(BuildCtx *ctx)
|
|.macro .ffunc_bit_sh, name, ins
|.if DUALNUM
- | .ffunc_bit name, 1
+ | .ffunc_bit name, 1, .ffunc_2
| // Note: no inline conversion from number for 2nd argument!
| cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
| mov RA, dword [BASE+8]
@@ -2649,8 +2667,8 @@ static void build_subroutines(BuildCtx *ctx)
|->cont_stitch: // Trace stitching.
|.if JIT
| // BASE = base, RC = result, RB = mbase
- | mov RA, [RB-24] // Save previous trace number.
- | mov TMP1, RA
+ | mov TRACE:RA, [RB-24] // Save previous trace.
+ | mov TMP1, TRACE:RA
| mov TMP3, DISPATCH // Need one more register.
| mov DISPATCH, MULTRES
| movzx RA, PC_RA
@@ -2681,11 +2699,8 @@ static void build_subroutines(BuildCtx *ctx)
| ja >9 // More results wanted?
|
| mov DISPATCH, TMP3
- | mov RB, TMP1 // Get previous trace number.
- | mov RA, [DISPATCH+DISPATCH_J(trace)]
- | mov TRACE:RD, [RA+RB*4]
- | test TRACE:RD, TRACE:RD
- | jz ->cont_nop
+ | mov TRACE:RD, TMP1 // Get previous trace.
+ | movzx RB, word TRACE:RD->traceno
| movzx RD, word TRACE:RD->link
| cmp RD, RB
| je ->cont_nop // Blacklisted.
@@ -5420,6 +5435,10 @@ static void emit_asm_debug(BuildCtx *ctx)
"\t.byte 0x83\n\t.uleb128 0x3\n" /* offset rbx */
"\t.byte 0x8f\n\t.uleb128 0x4\n" /* offset r15 */
"\t.byte 0x8e\n\t.uleb128 0x5\n" /* offset r14 */
+#if LJ_NO_UNWIND
+ "\t.byte 0x8d\n\t.uleb128 0x6\n" /* offset r13 */
+ "\t.byte 0x8c\n\t.uleb128 0x7\n" /* offset r12 */
+#endif
#else
"\t.long .Lbegin\n"
"\t.long %d\n"
@@ -5455,6 +5474,7 @@ static void emit_asm_debug(BuildCtx *ctx)
"\t.align " SZPTR "\n"
".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
#endif
+#if !LJ_NO_UNWIND
#if (defined(__sun__) && defined(__svr4__))
#if LJ_64
fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@unwind\n");
@@ -5543,7 +5563,9 @@ static void emit_asm_debug(BuildCtx *ctx)
"\t.align " SZPTR "\n"
".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
#endif
+#endif
break;
+#if !LJ_NO_UNWIND
/* Mental note: never let Apple design an assembler.
** Or a linker. Or a plastic case. But I digress.
*/
@@ -5677,6 +5699,7 @@ static void emit_asm_debug(BuildCtx *ctx)
fprintf(ctx->fp, ".subsections_via_symbols\n");
}
break;
+#endif
default: /* Difficult for other modes. */
break;
}
diff --git a/src/xb1build.bat b/src/xb1build.bat
new file mode 100644
index 0000000..847e84a
--- /dev/null
+++ b/src/xb1build.bat
@@ -0,0 +1,101 @@
+@rem Script to build LuaJIT with the Xbox One SDK.
+@rem Donated to the public domain.
+@rem
+@rem Open a "Visual Studio .NET Command Prompt" (64 bit host compiler)
+@rem Then cd to this directory and run this script.
+
+@if not defined INCLUDE goto :FAIL
+@if not defined DurangoXDK goto :FAIL
+
+@setlocal
+@echo ---- Host compiler ----
+@set LJCOMPILE=cl /nologo /c /MD /O2 /W3 /D_CRT_SECURE_NO_DEPRECATE /DLUAJIT_ENABLE_GC64
+@set LJLINK=link /nologo
+@set LJMT=mt /nologo
+@set DASMDIR=..\dynasm
+@set DASM=%DASMDIR%\dynasm.lua
+@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c
+
+%LJCOMPILE% host\minilua.c
+@if errorlevel 1 goto :BAD
+%LJLINK% /out:minilua.exe minilua.obj
+@if errorlevel 1 goto :BAD
+if exist minilua.exe.manifest^
+ %LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe
+
+@rem Check for 64 bit host compiler (fail unless minilua exits with code 8 or higher).
+@minilua
+@if not errorlevel 8 goto :FAIL
+
+@set DASMFLAGS=-D WIN -D FFI -D P64
+minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x64.dasc
+@if errorlevel 1 goto :BAD
+
+%LJCOMPILE% /I "." /I %DASMDIR% /D_DURANGO host\buildvm*.c
+@if errorlevel 1 goto :BAD
+%LJLINK% /out:buildvm.exe buildvm*.obj
+@if errorlevel 1 goto :BAD
+if exist buildvm.exe.manifest^
+ %LJMT% -manifest buildvm.exe.manifest -outputresource:buildvm.exe
+
+buildvm -m peobj -o lj_vm.obj
+@if errorlevel 1 goto :BAD
+buildvm -m bcdef -o lj_bcdef.h %ALL_LIB%
+@if errorlevel 1 goto :BAD
+buildvm -m ffdef -o lj_ffdef.h %ALL_LIB%
+@if errorlevel 1 goto :BAD
+buildvm -m libdef -o lj_libdef.h %ALL_LIB%
+@if errorlevel 1 goto :BAD
+buildvm -m recdef -o lj_recdef.h %ALL_LIB%
+@if errorlevel 1 goto :BAD
+buildvm -m vmdef -o jit\vmdef.lua %ALL_LIB%
+@if errorlevel 1 goto :BAD
+buildvm -m folddef -o lj_folddef.h lj_opt_fold.c
+@if errorlevel 1 goto :BAD
+
+@echo ---- Cross compiler ----
+
+@set CWD=%cd%
+@call "%DurangoXDK%\xdk\DurangoVars.cmd" XDK
+@cd /D "%CWD%"
+@shift
+
+@set LJCOMPILE="cl" /nologo /c /W3 /GF /Gm- /GR- /GS- /Gy /openmp- /D_CRT_SECURE_NO_DEPRECATE /D_LIB /D_UNICODE /D_DURANGO
+@set LJLIB="lib" /nologo
+
+@if "%1"=="debug" (
+ @shift
+ @set LJCOMPILE=%LJCOMPILE% /Zi /MDd /Od
+ @set LJLINK=%LJLINK% /debug
+) else (
+ @set LJCOMPILE=%LJCOMPILE% /MD /O2 /DNDEBUG
+)
+
+@if "%1"=="amalg" goto :AMALG
+%LJCOMPILE% /DLUA_BUILD_AS_DLL lj_*.c lib_*.c
+@if errorlevel 1 goto :BAD
+%LJLIB% /OUT:luajit.lib lj_*.obj lib_*.obj
+@if errorlevel 1 goto :BAD
+@goto :NOAMALG
+:AMALG
+%LJCOMPILE% /DLUA_BUILD_AS_DLL ljamalg.c
+@if errorlevel 1 goto :BAD
+%LJLIB% /OUT:luajit.lib ljamalg.obj lj_vm.obj
+@if errorlevel 1 goto :BAD
+:NOAMALG
+
+@del *.obj *.manifest minilua.exe buildvm.exe
+@echo.
+@echo === Successfully built LuaJIT for Xbox One ===
+
+@goto :END
+:BAD
+@echo.
+@echo *******************************************************
+@echo *** Build FAILED -- Please check the error messages ***
+@echo *******************************************************
+@goto :END
+:FAIL
+@echo To run this script you must open a "Visual Studio .NET Command Prompt"
+@echo (64 bit host compiler). The Xbox One SDK must be installed, too.
+:END