Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/torch/luajit-rocks.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRonan Collobert <locronan@fb.com>2017-04-18 20:41:57 +0300
committerRonan Collobert <locronan@fb.com>2017-04-18 20:41:57 +0300
commitabd5cbe0a68af28f4194b82c2b038c87b4ff81cd (patch)
tree2f2335647e47368965d4080729a82ecebd68ac36 /src/lj_emit_arm64.h
parent320a2b2b39e8b01a632d2bf5411eee0c0b3aef4f (diff)
Squashed 'luajit-2.1/' changes from 3a0b2a90..dc5eb65c
dc5eb65c Merge branch 'master' into v2.1 02b4b1e5 Add workaround for MSVC 2015 stdio changes. ed54eace MIPS64: Fix stores of MULTRES. 58aaac3c MIPS64: Fix write barrier in BC_USETV. 18efb331 ARM64: Fix stores to vmstate. cced1786 Document added C API extensions from Lua 5.2/5.3. de26f76e From Lua 5.2: Add lua_tonumberx() and lua_tointegerx(). 2b8de8cf From Lua 5.2: Add luaL_setmetatable(). cde968f9 From Lua 5.2: Add luaL_testudata(). f2e2a3f7 From Lua 5.3: Add lua_isyieldable(). ef23b70e From Lua 5.2: Add lua_copy(). c67a0982 From Lua 5.2: Add lua_version(). 9c685f70 Refactor with LUA_OK. 22dfa632 Allow building on Haiku OS. 7c7843e8 Merge branch 'master' into v2.1 247b3624 OSX: Fix build with recent XCode. de97b9d5 Add some more changes and extensions from Lua 5.2. dc320ca7 Remove old Lua 5.0 compatibility defines. c29afcb1 Merge branch 'master' into v2.1 e205ad0c FFI: Fix FOLD rules for int64_t comparisons. 3143b218 ARM64: Add big-endian support. 78f5f1ce x64/LJ_GC64: Fix emit_loadk64(). 024ade79 LJ_GC64: Fix BC_CALLM snapshot handling. fa126c5f x64/LJ_GC64: Fix assembly of CNEWI with 64 bit constant pointer. 779a1eb8 ARM64: Fix Nintendo Switch build. 0cf78854 ARM64: Fix XLOAD/XSTORE with FP operand. de5568e0 Remove Lua 5.0 compatibility defines. d3e36e79 Merge branch 'master' into v2.1 f50bf758 Remove unnecessary mcode alloc pointer check. d62459fc Limit mcode alloc probing, depending on the available pool size. 4e308361 Fix overly restrictive range calculation in mcode allocation. ff648369 Fix out-of-scope goto handling in parser. 3ab9f5a1 Remove internal __mode = "K" and replace with safe check. eef77a6d Fix annoying warning, due to deterministic binutils configuration. a9740d9e DynASM: Fix warning. a25c0b99 MIPS64, part 2: Add MIPS64 hard-float JIT compiler backend. 4416e885 Fix FOLD rules for math.abs() and FP negation. 019fb9d1 Fix soft-float math.abs() and negation. 130d1dc5 x64/LJ_GC64: Fix warning for DUALNUM build. 
f640ec71 x64/LJ_GC64: Fix (currently unused) integer stores in asm_tvptr(). 0a46ef1a ARM64: Cleanup and de-cargo-cult TValue store generation. d0759e41 Merge branch 'master' into v2.1 892d370e MIPS: Don't use RID_GP as a scratch register. ee33a1f9 MIPS: Fix emitted code for U32 to float conversion. 1abd7799 MIPS: Backport workaround for compact unwind tables. 6bf3e4d6 Make checkptrGC() actually work. 5aa02013 ARM64: Fix AREF/HREF/UREF fusion. bd7e42e5 Fix extension docs about package.searchers. 71ff7ef8 Merge branch 'master' into v2.1 b93a1dd0 Bump copyright date to 2017. c94b921f LJ_GC64: Add build options and install instructions. c1981676 Add some more extensions from Lua 5.2/5.3. ed4ce98a Merge branch 'master' into v2.1 a1e13fa6 Fix HTML formatting. 12c0df41 Merge branch 'master' into v2.1 a2013dd3 Fix cross-endian jit.bcsave for MIPS target. 8e5d7bec ARM64: Remove unused variables in disassembler. ebec2530 ARM64: Fuse BOR/BXOR and BNOT into ORN/EON. 3cfa9cb2 Merge branch 'master' into v2.1 fb61f7cb Add "proto" field to jit.util.funcinfo(). 19738074 Add "proto" field to jit.util.funcinfo(). 4ccd876a ARM64: Use the correct FUSE check. 44b99ff1 ARM64: Fuse BOR(BSHL, BSHR) into EXTR/ROR. ec2756ba Add missing FOLD rule for 64 bit shift+BAND simplification. 986854cb ARM64: Fix code generation for S19 offsets. 3975b6c9 ARM64: Fuse various BAND/BSHL/BSHR/BSAR combinations. 2772cbc3 ARM64: Fuse FP multiply-add/sub. bfeb1167 ARM64: Fuse XLOAD/XSTORE with STRREF/ADD/BSHL/CONV. 2ac2cd46 ARM64: Reorganize operand extension definitions. 48b00297 ARM64: Add missing ldrb/strb instructions to disassembler. 22511fbe ARM64: Fix pc-relative loads of consts. Cleanup branch codegen. 3ad2bbf5 ARM64: Make use of tbz/tbnz and cbz/cbnz. 6538c8a1 Document 47 bit limit for lightuserdata. d7243e1d Eliminate use of lightuserdata derived from static data pointers. 81259898 ARM64: Emit more efficient trace exits. 1131fa22 Merge branch 'master' into v2.1 c3cae041 Update contact info. 
a5665446 Generalize deferred constant handling in backend to 64 bit. 2b77da35 ARM64: Reject special case in emit_isk13(). 7a0c3a11 ARM64: Allow full VA range for mcode allocation. 04b60707 ARM64: Add JIT compiler backend. 13642b75 Whitespace. 202713a6 Fix amalgamated build. e577db52 Increase range of GG_State loads via IR_FLOAD with REF_NIL. 5400c1e4 MIPS: Fix TSETR barrier. 7a58a8fb Report parent of stitched trace. 716f2dae LJ_GC64: Various followup fixes. bdcaf4bf LJ_GC64: Fix HREF for pointers. 6a25014c LJ_FR2: Fix slot 1 handling. 3f43f094 Merge branch 'master' into v2.1 a68c4118 Fix GC step size calculation. 54b78e7c LJ_GC64: Various fixes. 63465fe7 LJ_GC64: Fix jit.on/off. cf80edbb Fix -jp=a mode for builtins. f27b2509 Merge branch 'master' into v2.1 fcc82448 ARM: Fix BLX encoding for Thumb interworking calls. 4ca39095 Merge branch 'master' into v2.1 8ada57eb Looks like COLORTERM has gone out of fashion. 1a9a2643 Merge branch 'master' into v2.1 9910deda Initialize uv->immutable for upvalues of loaded chunks. 03b03ef6 Windows/x86: Add MSVC flags for debug build with exception interop. f6f838f8 Merge branch 'master' into v2.1 02b9b559 Revert "OSX: Switch to Clang as the default compiler." 972a1a4c Fix exit status for 'luajit -b'. c98660c8 Must preserve J->fold.ins (fins) around call to lj_ir_ksimd(). 22e8e079 Merge branch 'master' into v2.1 d41469c1 Emit bytecode in .c/.h files with unsigned char type. 92d9ff21 Set arg table before evaluating LUA_INIT and -e chunks. 6be5ffdf Adjust comment with defines. 73740462 Merge branch 'master' into v2.1 b74ddaf1 Fix for cdata vs. non-cdata arithmetics/comparisons. 37b377de Merge branch 'master' into v2.1 1914de71 Fix unused vars etc. in internal Lua files. 01e47549 Properly clean up state before restart of trace assembly. ce30766b Merge branch 'master' into v2.1 aef4eddd Drop leftover regs in 'for' iterator assignment, too. 287a5347 MIPS: Support MIPS16 interlinking. 
f5983437 x64/LJ_GC64: Fix code generation for IR_KNULL call argument. a88dc579 Merge branch 'master' into v2.1 a7bec69a Fix PHI remarking in SINK pass. ce1ad870 LJ_GC64: Set correct nil value when clearing a cdata finalizer. cc05e791 LJ_GC64: Ensure all IR slot fields are initialized. 58ca1657 LJ_GC64: Allow optional use of the system memory allocator. 7d434023 Merge branch 'master' into v2.1 32063075 Fix Valgrind suppressions. 1c0454c6 Merge branch 'master' into v2.1 a5f8a481 Don't try to record outermost pcall() return to lower frame. 384ce2f9 MIPS: Fix build failures and warnings. 56fe899a Proper fix for LJ_GC64 changes to asm_href(). d9986fba MIPS64, part 1: Add MIPS64 support to interpreter. e3c4c9af DynASM/MIPS: Add missing MIPS64 instructions. 5e2b609b Fix compiler warning. e77638f9 x64/LJ_GC64: Fix __call metamethod for tailcall. 6360f6e1 Fix collateral damage from LJ_GC64 changes to asm_href(). 9fa843ae Cleanup install docs. 3f1031c3 Use MAP_TRYFIXED for the probing memory allocator, if available. bfe2a353 Merge branch 'master' into v2.1 747feb6e x86: Don't spill an explicit REF_BASE in the IR. 2868715d x64/LJ_GC64: Add missing backend support and enable JIT compilation. 6c8258d7 LJ_FR2: Add support for trace recording and snapshots. 8f868a9d LJ_GC64: Update IR type sizes. f26679c7 LJ_GC64: Add support for 64 bit GCobj constants in the IR. 9e99ccc3 Strip out old infrastructure for 64 bit constants. 7fb75ccc Embed 64 bit constants directly in the IR, using two slots. a4067978 Always walk IR constants in ascending order. 3152ed98 Simplify GCtrace * reference embedding for trace stitching. a657fa01 Make the IR immovable after assembly. 51358765 Add ra_addrename(). ccae3338 Load SIMD constants with IR_FLOAD from GG_State. 786dbb2e Add IR_FLOAD with REF_NIL for field loads from GG_State. cfa188f1 Move common 32/64 bit in-memory FP constants to jit_State. 1931b38d LJ_GC64: Introduce IRT_PGC. 
475a6ae3 Merge branch 'master' into v2.1 37e1e703 Add guard for obscure aliasing between open upvalues and SSA slots. d4f3b113 Workaround for MinGW headers lacking some exception definitions. 4fe400cf Merge branch 'master' into v2.1 5837c2a2 Remove assumption that lj_math_random_step() doesn't clobber FPRs. 573daa9c Fix dependencies. 35b09e69 Windows/x86: Add full exception interoperability. 6a997320 Merge branch 'master' into v2.1 f05280e4 x86/x64: Fix instruction length decoder. 221268b1 Use the GDB JIT API in a thread-safe manner. ac42037d Constrain value range of lj_ir_kptr() to unsigned 32 bit pointers. d8ac6230 Merge branch 'master' into v2.1 7b26e9c9 Fix GCC 6 -Wmisleading-indentation warnings. 344fe5f0 Merge branch 'master' into v2.1 2f0001fa Fix handling of non-numeric strings in arithmetic coercions. 4c6498d2 Merge branch 'master' into v2.1 cc4f5d05 Whitespace. d13d4209 Merge branch 'master' into v2.1 73680a5f x86/x64: Search for exit jumps with instruction length decoder. 0c6fdc10 Rewrite memory block allocator. 101115dd Merge branch 'master' into v2.1 e5b5e079 MIPS: Fix BC_ISNEXT fallback path. 096a7cf4 x64/LJ_GC64: Fix BC_UCLO check for fast-path. ac9193cf x86: Improve disassembly of BMI2 instructions. d150fbf4 Merge branch 'master' into v2.1 1c6fd13d Fix recording of select(n, ...) with off-trace varargs 25b37794 Merge branch 'master' into v2.1 4ab6367b Cygwin: Allow cross-builds to non-Cygwin targets. 296f0ca8 Windows/x64/LJ_GC64: Fix math.frexp() and math.modf() (again). 6e623b99 Merge branch 'master' into v2.1 62af1015 MIPS: Fix use of ffgccheck delay slots in interpreter. 892887e5 x86: Generate BMI2 shifts and rotates, if available. 6801e716 x86: Detect BMI2 instruction support. c24c8e53 x64/LJ_GC64: Fix JIT glue code in interpreter. d7145616 Merge branch 'master' into v2.1 9531eb23 Windows: Remove intermediate files at end of build. e03e5979 Fix compiler warnings. 
df7bb5bb Merge branch 'master' into v2.1 e23fc108 Fix display of NULL (light)userdata in -jdump. c7305408 Fix formatting of some small denormals at low precision. 713e3405 Merge branch 'master' into v2.1 7e05355a Fix install for cross-builds. cf3e01e1 Merge branch 'master' into v2.1 ddadbe80 Fix compiler warnings. be255929 ARM: Fix build problem with external frame unwinding. 64c6da6b MIPS soft-float: Fix code generation for HREF. 75d046db DynASM/x86: Add ADX instructions. 3e4a1967 RELEASE LuaJIT-2.1.0-beta2 82900761 Update changelog. f4231949 Merge branch 'master' into v2.1 db1b399a Bump copyright date to 2016. 6f3eed9f Fix Android/x86 build. 1c45c6a6 Fix build with JIT disabled. 18f6aa97 Use internal implementation for converting FP numbers to strings. 339a1fd6 Windows/x64/LJ_GC64: Fix math.frexp() and math.modf(). 6cb38f78 Merge branch 'master' into v2.1 a4438896 Don't allocate unused 2nd result register in JIT compiler backend. f547a142 MIPS: Add soft-float support to JIT compiler backend. 825dcdc4 Generalize LJ_SOFTFP dependencies in lj_asm.c. bfbcd72e PS4: Switch default build to amalgamated and LJ_GC64 mode. 60de2f3d MIPS: Switch to dual-number mode. Fix soft-float interpreter. 2f6b2967 Add proper DynASM dependency. 89982b78 FFI: Parse #line NN and #NN. 39eddd3b Always merge snapshots without instructions inbetween. 04d28068 Rollback due to HREFK + load fwd must restore guardemit state. 22e7b00d DynASM/x64: Fix for full VREG support. 52ebe02c x86: Disassemble AES instructions. 5ce6399e x86: Improve disassembly of AVX shift instructions. cfae3846 DynASM/x86: Add AVX AES instructions. f61148c4 ARM: Add external frame unwinding. a687a60e DynASM/x64: Add full VREG support. 20f4141b FFI: Properly unsink non-standard cdata allocations. 0345f361 MIPS soft-float, part 2: Add soft-float FFI support. 3f5c7242 MIPS soft-float, part 1: Add soft-float support to interpreter. 126e55d4 Merge branch 'master' into v2.1 3d4c9f96 FFI: Fix SPLIT pass for CONV i64.u64. 
git-subtree-dir: luajit-2.1 git-subtree-split: dc5eb65ccf8e0e944531c7407d4fff65247b784b
Diffstat (limited to 'src/lj_emit_arm64.h')
-rw-r--r--src/lj_emit_arm64.h419
1 files changed, 419 insertions, 0 deletions
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
new file mode 100644
index 0000000..6da4c7d
--- /dev/null
+++ b/src/lj_emit_arm64.h
@@ -0,0 +1,419 @@
+/*
+** ARM64 instruction emitter.
+** Copyright (C) 2005-2017 Mike Pall. See Copyright Notice in luajit.h
+**
+** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+** Sponsored by Cisco Systems, Inc.
+*/
+
+/* -- Constant encoding --------------------------------------------------- */
+
+/* Return the 64 bit value of a constant IR instruction.
+** KINT64 reads the raw 64 bit payload; KGC and KPTR/KKPTR return the
+** GC object / pointer address. Anything else must be KINT or KNULL,
+** whose 32 bit payload is returned sign-extended.
+** Must only be called on constant (IR_K*) instructions.
+*/
+static uint64_t get_k64val(IRIns *ir)
+{
+ if (ir->o == IR_KINT64) {
+ return ir_kint64(ir)->u64;
+ } else if (ir->o == IR_KGC) {
+ return (uint64_t)ir_kgc(ir);
+ } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
+ return (uint64_t)ir_kptr(ir);
+ } else {
+ lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL);
+ return ir->i; /* Sign-extended. */
+ }
+}
+
+/* Encode constant in K12 format for data processing instructions. */
+/* Encode constant in K12 format for data processing instructions.
+** Returns the encoded 12 bit immediate (optionally shifted left by 12,
+** with the ADD/SUB-select bit flipped for negative constants), or 0 if
+** the constant is not encodable.
+*/
+static uint32_t emit_isk12(int64_t n)
+{
+  /* Negate in the unsigned domain: -n would be undefined behavior
+  ** (signed overflow) for n == INT64_MIN. Two's complement negation of
+  ** the unsigned value yields the same magnitude for all other inputs.
+  */
+  uint64_t k = (n < 0) ? ~(uint64_t)n + 1u : (uint64_t)n;
+  uint32_t m = (n < 0) ? 0x40000000 : 0;  /* Select SUB instead of ADD. */
+  if (k < 0x1000) {  /* Plain 12 bit unsigned immediate. */
+    return A64I_K12|m|A64F_U12(k);
+  } else if ((k & 0xfff000) == k) {  /* 12 bit immediate, LSL #12. */
+    return A64I_K12|m|0x400000|A64F_U12(k>>12);
+  }
+  return 0;  /* Not encodable (includes INT64_MIN). */
+}
+
+/* Count leading/trailing zeroes of a 64 bit value.
+** NOTE: the GCC builtins are undefined for a zero argument; all call
+** sites below guarantee n != 0 before using them.
+*/
+#define emit_clz64(n) __builtin_clzll(n)
+#define emit_ctz64(n) __builtin_ctzll(n)
+
+/* Encode constant in K13 format for logical data processing instructions. */
+static uint32_t emit_isk13(uint64_t n, int is64)
+{
+ int inv = 0, w = 128, lz, tz;
+ if (n & 1) { n = ~n; w = 64; inv = 1; } /* Avoid wrap-around of ones. */
+ if (!n) return 0; /* Neither all-zero nor all-ones are allowed. */
+ do { /* Find the repeat width. */
+ if (is64 && (uint32_t)(n^(n>>32))) break;
+ n = (uint32_t)n;
+ if (!n) return 0; /* Ditto when passing n=0xffffffff and is64=0. */
+ w = 32; if ((n^(n>>16)) & 0xffff) break;
+ n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
+ n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
+ n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
+ n = n & 0x3; w = 2;
+ } while (0);
+ /* n != 0 here, so clz/ctz are well-defined. */
+ lz = emit_clz64(n);
+ tz = emit_ctz64(n);
+ if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
+ /* Emit the N:immr:imms fields of the bitmask immediate. The two arms
+ ** handle the inverted (run of ones was complemented above) and direct
+ ** encodings of a rotated contiguous run of ones of element width w.
+ */
+ if (inv)
+ return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
+ else
+ return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
+}
+
+/* Encode the bit pattern of a double as an 8 bit FP immediate (for
+** FMOV with constant operand), or return ~0u if not representable.
+** Representable doubles have at most 4 significant mantissa bits
+** ((n << 16) == 0, i.e. the low 48 bits are zero) and a small exponent
+** (top 9 exponent-region bits are 0x100 or 0x0ff).
+*/
+static uint32_t emit_isfpk64(uint64_t n)
+{
+ uint64_t etop9 = ((n >> 54) & 0x1ff);
+ if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
+ return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
+ }
+ return ~0u;
+}
+
+/* -- Emit basic instructions --------------------------------------------- */
+
+/* Emit an instruction with D, N, M and A register fields. */
+static void emit_dnma(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm, Reg ra)
+{
+  MCode ins = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm) | A64F_A(ra);
+  *--as->mcp = ins;
+}
+
+/* Emit an instruction with D, N and M register fields. */
+static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
+{
+  MCode ins = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm);
+  *--as->mcp = ins;
+}
+
+/* Emit an instruction with D and M register fields. */
+static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm)
+{
+  MCode *p = as->mcp - 1;
+  *p = ai | A64F_D(rd) | A64F_M(rm);
+  as->mcp = p;
+}
+
+/* Emit an instruction with D and N register fields. */
+static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn)
+{
+  MCode *p = as->mcp - 1;
+  *p = ai | A64F_D(rd) | A64F_N(rn);
+  as->mcp = p;
+}
+
+/* Emit an instruction with N and M register fields (no destination). */
+static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm)
+{
+  MCode ins = ai | A64F_N(rn) | A64F_M(rm);
+  *--as->mcp = ins;
+}
+
+/* Emit an instruction with only a D (destination) register field. */
+static void emit_d(ASMState *as, A64Ins ai, Reg rd)
+{
+  MCode *p = as->mcp - 1;
+  *p = ai | A64F_D(rd);
+  as->mcp = p;
+}
+
+/* Emit an instruction with only an N (first source) register field. */
+static void emit_n(ASMState *as, A64Ins ai, Reg rn)
+{
+  MCode *p = as->mcp - 1;
+  *p = ai | A64F_N(rn);
+  as->mcp = p;
+}
+
+/* Classify a load/store offset for the given instruction:
+** 1 = fits the scaled unsigned 12 bit field, -1 = fits the unscaled
+** signed 9 bit field, 0 = not encodable. The scale is taken from the
+** instruction's size bits (ai >> 30).
+*/
+static int emit_checkofs(A64Ins ai, int64_t ofs)
+{
+  int scale = (ai >> 30) & 3;
+  if (ofs >= 0 && !(ofs & ((1<<scale)-1)))  /* Non-negative and aligned? */
+    return (ofs < (4096<<scale)) ? 1 : 0;
+  return (ofs >= -256 && ofs <= 255) ? -1 : 0;
+}
+
+/* Emit a load/store with base register rn and offset ofs, fusing
+** adjacent LDR/STR pairs into LDP/STP where possible. Peeks at the
+** most recently emitted instruction (*as->mcp); code is emitted
+** backwards, so "previous" here means the instruction that will
+** execute immediately after this one.
+*/
+static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
+{
+ int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
+ lua_assert(ot);
+ /* Combine LDR/STR pairs to LDP/STP. */
+ if ((sc == 2 || sc == 3) &&
+ (!(ai & 0x400000) || rd != rn) &&
+ as->mcp != as->mcloop) {
+ /* Only 32/64 bit accesses; never pair a load that clobbers its own
+ ** base register; never fuse across a loop boundary.
+ */
+ uint32_t prev = *as->mcp & ~A64F_D(31);
+ int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
+ A64Ins aip;
+ /* Does the previous access hit the slot just below or just above? */
+ if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
+ prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
+ aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
+ } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
+ prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
+ aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
+ ofsm = ofs;
+ } else {
+ goto nopair;
+ }
+ /* LDP/STP takes a scaled signed 7 bit offset: -64..63 elements. */
+ if (ofsm >= (int)((unsigned int)-64<<sc) && ofsm <= (63<<sc)) {
+ /* Rewrite the previous instruction in place as the paired form. */
+ *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
+ (ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
+ return;
+ }
+ }
+nopair:
+ if (ot == 1)
+ *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc);
+ else
+ *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff);
+}
+
+/* -- Emit loads/stores --------------------------------------------------- */
+
+/* Prefer rematerialization of BASE/L from global_State over spills. */
+#define emit_canremat(ref) ((ref) <= ASMREF_L)
+
+/* Try to find an N-step delta relative to other consts with N < lim. */
+/* Walks the currently allocated GPRs holding IR constants and tries to
+** derive k from one of them with a single MOV, ADD or SUB. Returns 1
+** and emits the instruction on success, 0 if no cheap delta exists.
+*/
+static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
+{
+ RegSet work = ~as->freeset & RSET_GPR;
+ if (lim <= 1) return 0; /* Can't beat that. */
+ while (work) {
+ Reg r = rset_picktop(work);
+ IRRef ref = regcost_ref(as->cost[r]);
+ lua_assert(r != rd);
+ if (ref < REF_TRUE) { /* Register holds a constant reference. */
+ uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
+ get_k64val(IR(ref));
+ int64_t delta = (int64_t)(k - kx);
+ if (delta == 0) { /* Same constant: plain register move. */
+ emit_dm(as, A64I_MOVx, rd, r);
+ return 1;
+ } else {
+ uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta);
+ if (k12) { /* Delta fits an ADD/SUB immediate. */
+ emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
+ return 1;
+ }
+ /* Do other ops or multi-step deltas pay off? Probably not.
+ ** E.g. XOR rarely helps with pointer consts.
+ */
+ }
+ }
+ rset_clear(work, r);
+ }
+ return 0; /* Failed. */
+}
+
+/* Load a constant into a GPR. Picks the cheapest of: ORR with a
+** bitmask immediate, a delta from an already-loaded constant, or a
+** MOVZ/MOVN seed plus as few MOVK patches as needed. is64 is 0 for a
+** 32 bit load or A64I_X for a 64 bit load.
+** Note: instructions are emitted backwards, so the MOVK loop runs
+** from the highest fragment down and the seeding MOVZ/MOVN (which
+** executes first) is emitted last.
+*/
+static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
+{
+ uint32_t k13 = emit_isk13(u64, is64);
+ if (k13) { /* Can the constant be represented as a bitmask immediate? */
+ emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
+ } else {
+ int i, zeros = 0, ones = 0, neg;
+ if (!is64) u64 = (int64_t)(int32_t)u64; /* Sign-extend. */
+ /* Count homogeneous 16 bit fragments. */
+ for (i = 0; i < 4; i++) {
+ uint64_t frag = (u64 >> i*16) & 0xffff;
+ zeros += (frag == 0);
+ ones += (frag == 0xffff);
+ }
+ neg = ones > zeros; /* Use MOVN if it pays off. */
+ /* A delta only wins if it needs fewer instructions than MOVZ/MOVN
+ ** plus one MOVK per non-trivial fragment.
+ */
+ if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
+ int shift = 0, lshift = 0;
+ uint64_t n64 = neg ? ~u64 : u64;
+ if (n64 != 0) {
+ /* Find first/last fragment to be filled. */
+ shift = (63-emit_clz64(n64)) & ~15;
+ lshift = emit_ctz64(n64) & ~15;
+ }
+ /* MOVK requires the original value (u64). */
+ while (shift > lshift) {
+ uint32_t u16 = (u64 >> shift) & 0xffff;
+ /* Skip fragments that are correctly filled by MOVN/MOVZ. */
+ if (u16 != (neg ? 0xffff : 0))
+ emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
+ shift -= 16;
+ }
+ /* But MOVN needs an inverted value (n64). */
+ emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
+ A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
+ }
+ }
+}
+
+/* Load a 32 bit constant into a GPR. */
+#define emit_loadi(as, rd, i) emit_loadk(as, rd, i, 0)
+
+/* Load a 64 bit constant into a GPR. */
+#define emit_loadu64(as, rd, i) emit_loadk(as, rd, i, A64I_X)
+
+#define emit_loada(as, r, addr) emit_loadu64(as, (r), (uintptr_t)(addr))
+
+/* Offset of address k relative to the global_State (GL register base). */
+#define glofs(as, k) \
+ ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
+/* Offset of address k relative to the next instruction to be emitted
+** (code is emitted backwards, so that is as->mcp - 1).
+*/
+#define mcpofs(as, k) \
+ ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
+/* Check whether the pc-relative word offset of k fits a signed 19 bit
+** field (range of LDR literal and conditional branches).
+*/
+#define checkmcpofs(as, k) \
+ ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0)
+
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
+
+/* Get/set from constant pointer. */
+/* Emit a load/store of register r from/to the absolute address p.
+** Tries, in order: a pc-relative literal load (loads only), a
+** GL-relative offset, and finally an allocated base register holding
+** the page-aligned address plus a 15 bit offset.
+*/
+static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
+{
+ /* First, check if ip + offset is in range. */
+ if ((ai & 0x00400000) && checkmcpofs(as, p)) {
+ /* Only loads (bit 22 set) can use the LDR literal form. */
+ emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
+ } else {
+ Reg base = RID_GL; /* Next, try GL + offset. */
+ int64_t ofs = glofs(as, p);
+ if (!emit_checkofs(ai, ofs)) { /* Else split up into base reg + offset. */
+ int64_t i64 = i64ptr(p);
+ base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
+ ofs = i64 & 0x7fffull;
+ }
+ emit_lso(as, ai, r, base, ofs);
+ }
+}
+
+/* Load 64 bit IR constant into register. */
+/* r may be a GPR or an FPR. FPRs first try the 8 bit FMOV immediate,
+** then (like GPRs) a GL-relative load of the interned constant, then a
+** pc-relative literal load, and finally a full materialization via
+** emit_loadu64 -- routed through RID_TMP for FPRs.
+*/
+static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
+{
+ const uint64_t *k = &ir_k64(ir)->u64;
+ int64_t ofs;
+ if (r >= RID_MAX_GPR) { /* FPR: try the FMOV immediate form first. */
+ uint32_t fpk = emit_isfpk64(*k);
+ if (fpk != ~0u) {
+ emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31));
+ return;
+ }
+ }
+ ofs = glofs(as, k);
+ if (emit_checkofs(A64I_LDRx, ofs)) { /* Constant reachable from GL? */
+ emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx,
+ (r & 31), RID_GL, ofs);
+ } else {
+ if (r >= RID_MAX_GPR) { /* Materialize in RID_TMP, then move to FPR. */
+ emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP);
+ r = RID_TMP;
+ }
+ if (checkmcpofs(as, k)) /* Literal pool load, if in range. */
+ emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r);
+ else
+ emit_loadu64(as, r, *k);
+ }
+}
+
+/* Get/set global_State fields. */
+#define emit_getgl(as, r, field) \
+ emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
+#define emit_setgl(as, r, field) \
+ emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)
+
+/* Trace number is determined from pc of exit instruction. */
+#define emit_setvmstate(as, i) UNUSED(i)
+
+/* -- Emit control-flow instructions -------------------------------------- */
+
+/* Label for internal jumps. */
+typedef MCode *MCLabel;
+
+/* Return label pointing to current PC. */
+#define emit_label(as) ((as)->mcp)
+
+/* Emit a conditional branch (B.cc) to target.
+** The word delta must fit the signed 19 bit branch field.
+*/
+static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
+{
+  MCode *p = as->mcp - 1;
+  ptrdiff_t delta = target - p;
+  lua_assert(((delta + 0x40000) >> 19) == 0);
+  *p = A64I_BCC | A64F_S19(delta) | cond;
+  as->mcp = p;
+}
+
+/* Emit an unconditional branch (B or BL) to target.
+** The word delta must fit the signed 26 bit branch field.
+*/
+static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
+{
+  MCode *p = as->mcp - 1;
+  ptrdiff_t delta = target - p;
+  lua_assert(((delta + 0x02000000) >> 26) == 0);
+  *p = ai | ((uint32_t)delta & 0x03ffffffu);
+  as->mcp = p;
+}
+
+/* Emit a test-bit-and-branch (TBZ/TBNZ) on bit number 'bit' of r.
+** Bits above 31 require the 64 bit (X) form; bit 63 is excluded by the
+** assertion below. The word delta must fit the signed 14 bit field.
+*/
+static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target)
+{
+ MCode *p = --as->mcp;
+ ptrdiff_t delta = target - p;
+ lua_assert(bit < 63 && ((delta + 0x2000) >> 14) == 0);
+ if (bit > 31) ai |= A64I_X; /* High bit number selects the X variant. */
+ *p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r;
+}
+
+/* Emit a compare-and-branch (CBZ/CBNZ) on register r to target.
+** The word delta must fit the signed 19 bit branch field.
+*/
+static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target)
+{
+  MCode *p = as->mcp - 1;
+  ptrdiff_t delta = target - p;
+  lua_assert(((delta + 0x40000) >> 19) == 0);
+  *p = ai | A64F_S19(delta) | r;
+  as->mcp = p;
+}
+
+#define emit_jmp(as, target) emit_branch(as, A64I_B, (target))
+
+/* Emit a call to target. Uses a direct BL if the byte delta fits the
+** +-128MB branch range, otherwise materializes the address in a
+** register (excluding argument registers R0-R7) and emits BLR.
+*/
+static void emit_call(ASMState *as, void *target)
+{
+  MCode *p = as->mcp - 1;
+  ptrdiff_t delta = (char *)target - (char *)p;
+  as->mcp = p;  /* Reserve the slot first; ra_allock may emit code above. */
+  if ((((delta>>2) + 0x02000000) >> 26) == 0) {  /* Word delta in BL range? */
+    *p = A64I_BL | ((uint32_t)(delta>>2) & 0x03ffffffu);
+  } else {  /* Target out of range: need indirect call. But don't use R0-R7. */
+    Reg r = ra_allock(as, i64ptr(target),
+                      RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+    *p = A64I_BLR | A64F_N(r);
+  }
+}
+
+/* -- Emit generic operations --------------------------------------------- */
+
+/* Generic move between two regs. */
+/* For GPRs, additionally patches the most recently emitted load/store
+** (which executes *after* this move) to use the renamed register, so a
+** redundant move next to a load/store can be elided by the pass that
+** follows. NOTE(review): the masks below appear to match the A64
+** 32/64 bit LDR/STR (unsigned immediate) encodings -- confirm against
+** the A64 encoding tables before touching them.
+*/
+static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
+{
+ if (dst >= RID_MAX_GPR) { /* FPR move: FMOV double or single. */
+ emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S,
+ (dst & 31), (src & 31));
+ return;
+ }
+ if (as->mcp != as->mcloop) { /* Swap early registers for loads/stores. */
+ MCode ins = *as->mcp, swp = (src^dst);
+ if ((ins & 0xbf800000) == 0xb9000000) {
+ if (!((ins ^ (dst << 5)) & 0x000003e0))
+ *as->mcp = ins ^ (swp << 5); /* Swap N in load/store. */
+ if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f))
+ *as->mcp = ins ^ swp; /* Swap D in store. */
+ }
+ }
+ emit_dm(as, A64I_MOVx, dst, src);
+}
+
+/* Generic load of register with base and (small) offset address.
+** Picks the load width from the IR type: double/single for FPRs,
+** 64/32 bit for GPRs.
+*/
+static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  A64Ins ai;
+  if (r >= RID_MAX_GPR) {  /* FPR load. */
+    ai = irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs;
+    emit_lso(as, ai, (r & 31), base, ofs);
+  } else {  /* GPR load. */
+    ai = irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw;
+    emit_lso(as, ai, r, base, ofs);
+  }
+}
+
+/* Generic store of register with base and (small) offset address.
+** Picks the store width from the IR type: double/single for FPRs,
+** 64/32 bit for GPRs.
+*/
+static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  A64Ins ai;
+  if (r >= RID_MAX_GPR) {  /* FPR store. */
+    ai = irt_isnum(ir->t) ? A64I_STRd : A64I_STRs;
+    emit_lso(as, ai, (r & 31), base, ofs);
+  } else {  /* GPR store. */
+    ai = irt_is64(ir->t) ? A64I_STRx : A64I_STRw;
+    emit_lso(as, ai, r, base, ofs);
+  }
+}
+
+/* Emit an arithmetic operation with a constant operand. Uses the K12
+** immediate form when the constant fits, else materializes it in a
+** register from 'allow'.
+*/
+static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src,
+                     int32_t i, RegSet allow)
+{
+  uint32_t k12 = emit_isk12(i);
+  if (k12 != 0) {
+    emit_dn(as, ai^k12, dest, src);
+  } else {
+    Reg rk = ra_allock(as, i, allow);
+    emit_dnm(as, ai, dest, src, rk);
+  }
+}
+
+/* Add a (possibly negative) offset to a pointer register in place.
+** A zero offset emits nothing.
+*/
+static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
+{
+  if (ofs == 0)
+    return;
+  if (ofs < 0)
+    emit_opk(as, A64I_SUBx, r, r, -ofs, rset_exclude(RSET_GPR, r));
+  else
+    emit_opk(as, A64I_ADDx, r, r, ofs, rset_exclude(RSET_GPR, r));
+}
+
+#define emit_spsub(as, ofs) emit_addptr(as, RID_SP, -(ofs))
+