/* vim: ts=4 sw=4 sts=4 et tw=78
 * Portions copyright (c) 2015-present, Facebook, Inc. All rights reserved.
 * Portions copyright (c) 2011 James R. McKaskill.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree. An additional grant
 * of patent rights can be found in the PATENTS file in the same directory.
 */
|.if X64
|.arch x64
|.else
|.arch x86
|.endif

|.actionlist build_actionlist
|.globalnames globnames
|.externnames extnames

|.if not X64
|.define RET_H, edx // for int64_t returns
|.define RET_L, eax
|.endif

|.if X64
|.define L_ARG, r12
|.define TOP, r13
|.else
|.define L_ARG, rdi
|.define TOP, rsi
|.endif

|.if X64WIN
|
|.macro call_rrrp, func, arg0, arg1, arg2, arg3
| mov64 r9, arg3
| mov r8, arg2
| mov rdx, arg1
| mov rcx, arg0
| call func
|.endmacro
|.macro call_rrrr, func, arg0, arg1, arg2, arg3
| mov r9, arg3
| mov r8, arg2
| mov rdx, arg1
| mov rcx, arg0
| call func
|.endmacro
|
|.macro call_rrp, func, arg0, arg1, arg2
| mov64 r8, arg2
| mov rdx, arg1
| mov rcx, arg0
| call func
|.endmacro
|.macro call_rrr, func, arg0, arg1, arg2
| mov r8, arg2
| mov rdx, arg1
| mov rcx, arg0
| call func
|.endmacro
|
|.macro call_rp, func, arg0, arg1
| mov64 rdx, arg1
| mov rcx, arg0
| call func
|.endmacro
|.macro call_rr, func, arg0, arg1
| mov rdx, arg1
| mov rcx, arg0
| call func
|.endmacro
|
|.macro call_r, func, arg0
| mov rcx, arg0
| call func
|.endmacro
|
|.elif X64
|
| // the 5 and 6 arg forms are only used on posix x64
|.macro call_rrrrrr, func, arg0, arg1, arg2, arg3, arg4, arg5
| mov r9, arg5
| mov r8, arg4
| mov rcx, arg3
| mov rdx, arg2
| mov rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|.macro call_rrrrr, func, arg0, arg1, arg2, arg3, arg4
| mov r8, arg4
| mov rcx, arg3
| mov rdx, arg2
| mov rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|
|.macro call_rrrp, func, arg0, arg1, arg2, arg3
| mov64 rcx, arg3
| mov rdx, arg2
| mov rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|.macro call_rrrr, func, arg0, arg1, arg2, arg3
| mov rcx, arg3
| mov rdx, arg2
| mov rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|
|.macro call_rrp, func, arg0, arg1, arg2
| mov64 rdx, arg2
| mov rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|.macro call_rrr, func, arg0, arg1, arg2
| mov rdx, arg2
| mov rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|
|.macro call_rp, func, arg0, arg1
| mov64 rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|.macro call_rr, func, arg0, arg1
| mov rsi, arg1
| mov rdi, arg0
| call func
|.endmacro
|
|.macro call_r, func, arg0
| mov rdi, arg0
| call func
|.endmacro
|
|.else
| // define the 64bit registers to the 32 bit counterparts, so the common
| // code can use r*x for all pointers
|.define rax, eax
|.define rcx, ecx
|.define rdx, edx
|.define rsp, esp
|.define rbp, ebp
|.define rdi, edi
|.define rsi, esi
|.define mov64, mov
|
|.macro call_rrrr, func, arg0, arg1, arg2, arg3
| mov dword [rsp+12], arg3
| mov dword [rsp+8], arg2
| mov dword [rsp+4], arg1
| mov dword [rsp], arg0
| call func
|.endmacro
|.macro call_rrr, func, arg0, arg1, arg2
| mov dword [rsp+8], arg2
| mov dword [rsp+4], arg1
| mov dword [rsp], arg0
| call func
|.endmacro
|.macro call_rr, func, arg0, arg1
| mov dword [rsp+4], arg1
| mov dword [rsp], arg0
| call func
|.endmacro
|.macro call_r, func, arg0
| mov dword [rsp], arg0
| call func
|.endmacro
|
|.define call_rrrp, call_rrrr
|.define call_rrp, call_rrr
|.define call_rp, call_rr
|
|.endif
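/* A rough key to the call_* helpers above (illustrative only, not dynasm
 * output): each letter after call_ is one argument, 'r' meaning a register
 * sized value and 'p' a pointer/constant that may need the full 64 bit mov64
 * form. For example
 *
 *     | call_rrp extern push_cdata, L_ARG, 0, mt
 *
 * becomes roughly "mov64 r8, mt; mov rdx, 0; mov rcx, L_ARG; call push_cdata"
 * on win64, "mov64 rdx, mt; mov rsi, 0; mov rdi, L_ARG; call push_cdata" on
 * posix x64, and three dword stores to [esp], [esp+4] and [esp+8] followed by
 * the call on x86. */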
|.macro epilog
|.if X64
| mov TOP, [rbp-16]
| mov L_ARG, [rbp-8]
|.else
| mov TOP, [rbp-8]
| mov L_ARG, [rbp-4]
|.endif
| mov rsp, rbp
| pop rbp
| ret
|.endmacro

|.macro get_errno // note trashes registers
| call extern GetLastError
| mov64 rcx, perr
| mov dword [rcx], eax
|.endmacro

|.macro too_few_arguments
| mov ax, 0
| call_rp extern luaL_error, L_ARG, &"too few arguments"
|.endmacro

|.macro too_many_arguments
| mov ax, 0
| call_rp extern luaL_error, L_ARG, &"too many arguments"
|.endmacro

|.macro lua_return_arg
| mov eax, 1
| epilog
|.endmacro

|.macro lua_return_void
| get_errno
| mov eax, 0
| epilog
|.endmacro

|.macro lua_return_double
|.if X64
| movq qword [rsp+32], xmm0
|.else
| fstp qword [rsp+4] // note get_errno doesn't require any stack on x86
|.endif
|
| get_errno
|
|.if X64WIN
| movq xmm1, qword [rsp+32]
| mov rcx, L_ARG
|.elif X64
| movq xmm0, qword [rsp+32]
| mov rdi, L_ARG
|.else
| mov [rsp], L_ARG
|.endif
| call extern lua_pushnumber
| lua_return_arg
|.endmacro

|.macro lua_return_bool
| movzx eax, al
| mov [rsp+32], eax
| get_errno
| mov eax, [rsp+32]
| call_rr extern lua_pushboolean, L_ARG, rax
| lua_return_arg
|.endmacro

|.macro lua_return_int
| mov [rsp+32], eax
| get_errno
| mov eax, [rsp+32]
| call_rr extern push_int, L_ARG, rax
| lua_return_arg
|.endmacro

|.macro lua_return_uint
| mov [rsp+32], eax
| get_errno
| mov eax, [rsp+32]
| call_rr extern push_uint, L_ARG, rax
| lua_return_arg
|.endmacro

|.macro lua_return_long
| mov [rsp+32], rax
| get_errno
| mov rax, [rsp+32]
| call_rr extern lua_pushinteger, L_ARG, rax
| lua_return_arg
|.endmacro

|.macro lua_return_ulong
| mov [rsp+32], rax
| get_errno
| mov rax, [rsp+32]
| call_rr extern lua_pushinteger, L_ARG, rax
| lua_return_arg
|.endmacro

|.macro save_registers
| // use rbp relative so we store values in the outer stack frame
|.if X64WIN
| // use the provided shadow space for int registers above prev rbp and
| // return address
| mov [rbp+16], rcx
| mov [rbp+24], rdx
| mov [rbp+32], r8
| mov [rbp+40], r9
| // use the extra space we added for float registers
| // -16 to store underneath previous value of L_ARG
| movq qword [rbp-16], xmm0
| movq qword [rbp-24], xmm1
| movq qword [rbp-32], xmm2
| movq qword [rbp-40], xmm3
|.elif X64
| movq qword [rbp-16], xmm0
| movq qword [rbp-24], xmm1
| movq qword [rbp-32], xmm2
| movq qword [rbp-40], xmm3
| movq qword [rbp-48], xmm4
| movq qword [rbp-56], xmm5
| movq qword [rbp-64], xmm6
| movq qword [rbp-72], xmm7
| mov [rbp-80], rdi
| mov [rbp-88], rsi
| mov [rbp-96], rdx
| mov [rbp-104], rcx
| mov [rbp-112], r8
| mov [rbp-120], r9
|.else
| // fastcall, -8 to store underneath previous value of L_ARG
| mov [rbp-8], ecx
| mov [rbp-12], edx
|.endif
|.endmacro

#if defined _WIN64 || defined __amd64__
#define JUMP_SIZE 14
#else
#define JUMP_SIZE 4
#endif

#define MIN_BRANCH INT32_MIN
#define MAX_BRANCH INT32_MAX
#define BRANCH_OFF 4
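/* For orientation (see compile_extern_jump below): JUMP_SIZE is the space
 * reserved for each extern jump stub - on 64 bit an 8 byte function pointer
 * followed by a 6 byte RIP-relative jmp (8 + 6 = 14), on 32 bit just the 4
 * byte pointer since a direct rel32 branch can always reach the target.
 * MIN_BRANCH/MAX_BRANCH bound the signed 32 bit displacement a near jmp/call
 * can encode, so anything outside that window has to go through a stub. */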
static void compile_extern_jump(struct jit* jit, lua_State* L, cfunction func, uint8_t* code)
{
    /* The jump code is the function pointer followed by a stub to call the
     * function pointer. The stub exists in 64 bit so we can jump to functions
     * with an offset greater than 2 GB.
     *
     * Note we have to manually set this up since there are commands buffered
     * in the jit state and dynasm doesn't support rip relative addressing.
     *
     * eg on 64 bit:
     * 0-8: function ptr
     * 8-14: jmp aword [rip-14]
     *
     * for 32 bit we only set the function ptr as it can always fit in a 32
     * bit displacement
     */
#if defined _WIN64 || defined __amd64__
    *(cfunction*) code = func;
    code[8] = 0xFF; /* FF /4 operand for jmp */
    code[9] = 0x25; /* RIP displacement */
    *(int32_t*) &code[10] = -14;
#else
    *(cfunction*) code = func;
#endif
}

void compile_globals(struct jit* jit, lua_State* L)
{
    struct jit* Dst = jit;
    int* perr = &jit->last_errno;
    dasm_setup(Dst, build_actionlist);

    /* Note: since the return code uses EBP to reset the stack pointer, we
     * don't have to track the amount of stack space used. It also means we
     * can handle stdcall and cdecl with the same code.
     */

    /* Note the various call_* functions want 32 bytes of 16 byte aligned
     * stack
     */

    compile(Dst, L, NULL, LUA_NOREF);
}

int x86_return_size(lua_State* L, int usr, const struct ctype* ct)
{
    int ret = 0;
    const struct ctype* mt;

    if (ct->calling_convention != C_CALL) {
        size_t i;
        size_t argn = lua_rawlen(L, usr);
        for (i = 1; i <= argn; i++) {
            lua_rawgeti(L, usr, (int) i);
            mt = (const struct ctype*) lua_touserdata(L, -1);

            if (mt->pointers || mt->is_reference) {
                ret += sizeof(void*);
            } else {
                switch (mt->type) {
                case DOUBLE_TYPE:
                case COMPLEX_FLOAT_TYPE:
                case INT64_TYPE:
                    ret += 8;
                    break;
                case COMPLEX_DOUBLE_TYPE:
                    ret += 16;
                    break;
                case INTPTR_TYPE:
                    ret += sizeof(intptr_t);
                    break;
                case FUNCTION_PTR_TYPE:
                    ret += sizeof(cfunction);
                    break;
                case BOOL_TYPE:
                case FLOAT_TYPE:
                case INT8_TYPE:
                case INT16_TYPE:
                case INT32_TYPE:
                case ENUM_TYPE:
                    ret += 4;
                    break;
                default:
                    return luaL_error(L, "NYI - argument type");
                }
            }

            lua_pop(L, 1);
        }
    }

#if !defined _WIN64 && !defined __amd64__
    lua_rawgeti(L, usr, 0);
    mt = (const struct ctype*) lua_touserdata(L, -1);
    if (!mt->pointers && !mt->is_reference && mt->type == COMPLEX_DOUBLE_TYPE) {
        ret += sizeof(void*);
    }
    lua_pop(L, 1);
#endif

    return ret;
}

#ifdef _WIN64
#define MAX_REGISTERS(ct) 4 /* rcx, rdx, r8, r9 */
#elif defined __amd64__
#define MAX_INT_REGISTERS(ct) 6 /* rdi, rsi, rdx, rcx, r8, r9 */
#define MAX_FLOAT_REGISTERS(ct) 8 /* xmm0-7 */
#else
#define MAX_INT_REGISTERS(ct) ((ct)->calling_convention == FAST_CALL ? 2 /* ecx, edx */ : 0)
#define MAX_FLOAT_REGISTERS(ct) 0
#endif

struct reg_alloc {
#ifdef _WIN64
    int regs;
    int is_float[4];
    int is_int[4];
#else
    int floats;
    int ints;
#endif
    int off;
};

#ifdef _WIN64
#define REGISTER_STACK_SPACE(ct) (4*8)
#elif defined __amd64__
#define REGISTER_STACK_SPACE(ct) (14*8)
#else
#define REGISTER_STACK_SPACE(ct) ALIGN_UP(((ct)->calling_convention == FAST_CALL ? 2*4 : 0), 15)
#endif
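/* As a rough summary of what REGISTER_STACK_SPACE reserves: 4*8 = 32 bytes on
 * win64 (mirroring the rcx/rdx/r8/r9 shadow space), 14*8 = 112 bytes on posix
 * x64 (6 integer + 8 sse registers), and on x86 just enough for the two
 * fastcall registers (aligned up to 16 bytes), or 0 for cdecl/stdcall. These
 * sizes have to agree with the [rsp + 32 + 8*n] (or 4*n on x86) style offsets
 * that add_int and add_float use below when staging register arguments. */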
/* Fastcall:
 * Uses ecx, edx as first two int registers
 * Everything else on stack (including 64 bit ints)
 * No overflow stack space
 * Pops the stack before returning
 * Returns int in eax, float in ST0
 * We use the same register allocation logic as posix x64 with 2 int regs and 0 float regs
 */

static void get_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64)
{
    /* grab the register from the shadow space */
#ifdef _WIN64
    if (reg->regs < MAX_REGISTERS(ct)) {
        | mov rcx, [rbp + 16 + 8*reg->regs]
        reg->regs++;
    }
#elif __amd64__
    if (reg->ints < MAX_INT_REGISTERS(ct)) {
        | mov rcx, [rbp - 80 - 8*reg->ints]
        reg->ints++;
    }
#else
    if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) {
        | mov ecx, [rbp - 8 - 4*reg->ints]
        reg->ints++;
    }
#endif
    else if (is_int64) {
        |.if X64
        | mov rcx, [rbp + reg->off]
        |.else
        | mov rcx, [rbp + reg->off]
        | mov rdx, [rbp + reg->off + 4]
        |.endif
        reg->off += 8;
    } else {
        | mov ecx, [rbp + reg->off]
#if defined __amd64__ || defined _WIN64
        /* The parameters to a function on stack are always 8 byte aligned. */
        reg->off += 8;
#else
        reg->off += 4;
#endif
    }
}

static void add_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64)
{
#ifdef _WIN64
    if (reg->regs < MAX_REGISTERS(ct)) {
        | mov [rsp + 32 + 8*(reg->regs)], rax
        reg->is_int[reg->regs++] = 1;
    }
#elif __amd64__
    if (reg->ints < MAX_INT_REGISTERS(ct)) {
        | mov [rsp + 32 + 8*reg->ints], rax
        reg->ints++;
    }
#else
    if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) {
        | mov [rsp + 32 + 4*reg->ints], rax
        reg->ints++;
    }
#endif
    else {
#if defined _WIN64 || defined __amd64__
        if (reg->off % 8 != 0) {
            reg->off += 8 - (reg->off % 8);
        }
#endif
        if (is_int64) {
            |.if X64
            | mov [rsp + reg->off], rax
            |.else
            | mov [rsp + reg->off], RET_L
            | mov [rsp + reg->off + 4], RET_H
            |.endif
            reg->off += 8;
        } else {
            | mov [rsp+reg->off], eax
            reg->off += 4;
        }
    }
}
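/* A worked example of how the get_* helpers pair up with save_registers in
 * the callback path on posix x64: save_registers stored rdi at [rbp-80], rsi
 * at [rbp-88] and so on, so get_int reloads the n-th integer argument from
 * [rbp - 80 - 8*n]; once the six integer registers are exhausted the
 * remaining arguments are read from the caller's stack at [rbp + reg->off] in
 * 8 byte slots. The add_* helpers below are the mirror image, used by
 * compile_function to stage outgoing arguments before a call. */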
static void get_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double)
{
#if !defined _WIN64 && !defined __amd64__
    assert(MAX_FLOAT_REGISTERS(ct) == 0);
    if (is_double) {
        | fld qword [rbp + reg->off]
        reg->off += 8;
    } else {
        | fld dword [rbp + reg->off]
        reg->off += 4;
    }
#else
    int off;

#ifdef _WIN64
    if (reg->regs < MAX_REGISTERS(ct)) {
        off = -16 - 8*reg->regs;
        reg->regs++;
    }
#else
    if (reg->floats < MAX_FLOAT_REGISTERS(ct)) {
        off = -16 - 8*reg->floats;
        reg->floats++;
    }
#endif
    else {
        off = reg->off;
        reg->off += is_double ? 8 : 4;
    }

    if (is_double) {
        | movq xmm0, qword [rbp + off]
    } else {
        | cvtss2sd xmm0, dword [rbp + off]
    }
#endif
}

static void add_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double)
{
#if !defined _WIN64 && !defined __amd64__
    assert(MAX_FLOAT_REGISTERS(ct) == 0);
    if (is_double) {
        | fstp qword [rsp + reg->off]
        reg->off += 8;
    } else {
        | fstp dword [rsp + reg->off]
        reg->off += 4;
    }
#else

#ifdef _WIN64
    if (reg->regs < MAX_REGISTERS(ct)) {
        if (is_double) {
            | movq qword [rsp + 32 + 8*(reg->regs)], xmm0
        } else {
            | cvtsd2ss xmm0, xmm0
            | movq qword [rsp + 32 + 8*(reg->regs)], xmm0
        }
        reg->is_float[reg->regs++] = 1;
    }
#else
    if (reg->floats < MAX_FLOAT_REGISTERS(ct)) {
        if (is_double) {
            | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0
        } else {
            | cvtsd2ss xmm0, xmm0
            | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0
        }
        reg->floats++;
    }
#endif
    else if (is_double) {
        | movq qword [rsp + reg->off], xmm0
        reg->off += 8;
    } else {
        | cvtsd2ss xmm0, xmm0
        | movd dword [rsp + reg->off], xmm0
        reg->off += 4;
    }
#endif
}

#if defined _WIN64 || defined __amd64__
#define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 1)
#define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 1)
#else
#define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 0)
#define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 0)
#endif

cfunction compile_callback(lua_State* L, int fidx, int ct_usr, const struct ctype* ct)
{
    int i, nargs;
    cfunction* pf;
    struct ctype ct2 = *ct;
    const struct ctype* mt;
    struct reg_alloc reg;
    int num_upvals = 0;
    int top = lua_gettop(L);
    struct jit* Dst = get_jit(L);
    int ref;
    int hidden_arg_off = 0;

    ct_usr = lua_absindex(L, ct_usr);
    fidx = lua_absindex(L, fidx);

    assert(lua_isnil(L, fidx) || lua_isfunction(L, fidx));

    memset(&reg, 0, sizeof(reg));
#ifdef _WIN64
    reg.off = 16 + REGISTER_STACK_SPACE(ct); /* stack registers are above the shadow space */
#elif __amd64__
    reg.off = 16;
#else
    reg.off = 8;
#endif

    dasm_setup(Dst, build_actionlist);

    // add a table to store ctype and function upvalues
    // callback_set assumes the first value is the lua function
    nargs = (int) lua_rawlen(L, ct_usr);
    lua_newtable(L);
    lua_pushvalue(L, -1);
    ref = luaL_ref(L, LUA_REGISTRYINDEX);

    if (ct->has_var_arg) {
        luaL_error(L, "can't create callbacks with varargs");
    }

    // setup a stack frame to hold args for the call into lua_call
    | push rbp
    | mov rbp, rsp
    | push L_ARG
    | // stack is 4 or 8 (mod 16) (L_ARG, rbp, rip)
    |.if X64
    | // 8 to realign, 16 for return vars, 32 for local calls, rest to save registers
    | sub rsp, 8 + 16 + 32 + REGISTER_STACK_SPACE(ct)
    | save_registers
    |.else
    | // 4 to realign, 16 for return vars, 32 for local calls, rest to save registers
    | sub rsp, 4 + 16 + 32 + REGISTER_STACK_SPACE(ct)
    if (ct->calling_convention == FAST_CALL) {
        | save_registers
    }
    |.endif

    // hardcode the lua_State* value into the assembly
    | mov64 L_ARG, L

    /* get the upval table */
    | call_rrr extern lua_rawgeti, L_ARG, LUA_REGISTRYINDEX, ref

    /* get the lua function */
    lua_pushvalue(L, fidx);
    lua_rawseti(L, -2, ++num_upvals);
    assert(num_upvals == CALLBACK_FUNC_USR_IDX);
    | call_rrr extern lua_rawgeti, L_ARG, -1, num_upvals

#if !defined _WIN64 && !defined __amd64__
    lua_rawgeti(L, ct_usr, 0);
    mt = (const struct ctype*) lua_touserdata(L, -1);
    if (!mt->pointers && !mt->is_reference && mt->type == COMPLEX_DOUBLE_TYPE) {
        hidden_arg_off = reg.off;
        reg.off += sizeof(void*);
    }
    lua_pop(L, 1);
#else
    (void) hidden_arg_off;
#endif
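    /* The loop below walks the callback's parameter list and emits code that
     * turns each incoming C argument into a Lua value ahead of the lua_callk
     * further down. Roughly speaking, for a callback typed int (*)(int, double)
     * the generated stub behaves like
     *
     *     push_int(L, arg0);
     *     lua_pushnumber(L, arg1);
     *
     * with pointers, 64 bit integers and complex values boxed via push_cdata
     * instead. */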
    for (i = 1; i <= nargs; i++) {
        lua_rawgeti(L, ct_usr, i);
        mt = (const struct ctype*) lua_touserdata(L, -1);

        if (mt->pointers || mt->is_reference) {
            lua_getuservalue(L, -1);
            lua_rawseti(L, -3, ++num_upvals); /* usr value */
            lua_rawseti(L, -2, ++num_upvals); /* mt */
            /* on the lua stack in the callback:
             * upval tbl, lua func, i-1 args
             */
            | call_rrr extern lua_rawgeti, L_ARG, -i-1, num_upvals-1
            | call_rrp extern push_cdata, L_ARG, -1, mt
            get_pointer(Dst, ct, &reg);
            | mov [rax], rcx
            | call_rr, extern lua_remove, L_ARG, -2
        } else {
            switch (mt->type) {
            case INT64_TYPE:
                lua_getuservalue(L, -1);
                lua_rawseti(L, -3, ++num_upvals); /* mt */
                lua_pop(L, 1);
                | call_rrp extern push_cdata, L_ARG, 0, mt
                get_int(Dst, ct, &reg, 1);
                |.if X64
                | mov [rax], rcx
                |.else
                | mov [rax], ecx
                | mov [rax+4], edx
                |.endif
                break;

            case INTPTR_TYPE:
                lua_getuservalue(L, -1);
                lua_rawseti(L, -3, ++num_upvals); /* mt */
                lua_pop(L, 1);
                | call_rrp extern push_cdata, L_ARG, 0, mt
                get_pointer(Dst, ct, &reg);
                | mov [rax], rcx
                break;

            case COMPLEX_FLOAT_TYPE:
                lua_pop(L, 1);
#if defined _WIN64 || defined __amd64__
                /* complex floats are two floats packed into a double */
                | call_rrp extern push_cdata, L_ARG, 0, mt
                get_float(Dst, ct, &reg, 1);
                | movq qword [rax], xmm0
#else
                /* complex floats are real followed by imag on the stack */
                | call_rrp extern push_cdata, L_ARG, 0, mt
                get_float(Dst, ct, &reg, 0);
                | fstp dword [rax]
                get_float(Dst, ct, &reg, 0);
                | fstp dword [rax+4]
#endif
                break;

            case COMPLEX_DOUBLE_TYPE:
                lua_pop(L, 1);
                | call_rrp extern push_cdata, L_ARG, 0, mt
                /* real */
                get_float(Dst, ct, &reg, 1);
                |.if X64
                | movq qword [rax], xmm0
                |.else
                | fstp qword [rax]
                |.endif
                /* imag */
                get_float(Dst, ct, &reg, 1);
                |.if X64
                | movq qword [rax+8], xmm0
                |.else
                | fstp qword [rax+8]
                |.endif
                break;

            case FLOAT_TYPE:
            case DOUBLE_TYPE:
                lua_pop(L, 1);
                get_float(Dst, ct, &reg, mt->type == DOUBLE_TYPE);
                |.if X64WIN
                | movq xmm1, xmm0
                | mov rcx, L_ARG
                |.elif X64
                | // for 64bit xmm0 is already set
                | mov rdi, L_ARG
                |.else
                | fstp qword [rsp+4]
                | mov [rsp], L_ARG
                |.endif
                | call extern lua_pushnumber
                break;

            case BOOL_TYPE:
                lua_pop(L, 1);
                get_int(Dst, ct, &reg, 0);
                | movzx ecx, cl
                | call_rr extern lua_pushboolean, L_ARG, rcx
                break;

            case INT8_TYPE:
                lua_pop(L, 1);
                get_int(Dst, ct, &reg, 0);
                if (mt->is_unsigned) {
                    | movzx ecx, cl
                } else {
                    | movsx ecx, cl
                }
                | call_rr extern push_int, L_ARG, rcx
                break;

            case INT16_TYPE:
                lua_pop(L, 1);
                get_int(Dst, ct, &reg, 0);
                if (mt->is_unsigned) {
                    | movzx ecx, cx
                } else {
                    | movsx ecx, cx
                }
                | call_rr extern push_int, L_ARG, rcx
                break;

            case ENUM_TYPE:
            case INT32_TYPE:
                lua_pop(L, 1);
                get_int(Dst, ct, &reg, 0);
                if (mt->is_unsigned) {
                    | call_rr extern push_uint, L_ARG, rcx
                } else {
                    | call_rr extern push_int, L_ARG, rcx
                }
                break;

            default:
                luaL_error(L, "NYI: callback arg type");
            }
        }
    }

    lua_rawgeti(L, ct_usr, 0);
    mt = (const struct ctype*) lua_touserdata(L, -1);

    | call_rrrp extern lua_callk, L_ARG, nargs, (mt->pointers || mt->is_reference || mt->type != VOID_TYPE) ? 1 : 0, 0
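    /* The third argument to lua_callk above is nresults: one result is
     * requested whenever the declared return type is anything other than a
     * plain void, zero results otherwise. */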
    // Unpack the return argument if not "void", also clean-up the lua stack
    // to remove the return argument and bind table. Use lua_settop rather
    // than lua_pop as lua_pop is implemented as a macro.
    if (mt->pointers || mt->is_reference) {
        lua_getuservalue(L, -1);
        lua_rawseti(L, -3, ++num_upvals); /* usr value */
        lua_rawseti(L, -2, ++num_upvals); /* mt */
        | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1
        | call_rrrp extern check_typed_pointer, L_ARG, -2, -1, mt
        | mov [rsp+32], rax
        | call_rr extern lua_settop, L_ARG, -4
        | mov rax, [rsp+32]

    } else {
        switch (mt->type) {
        case ENUM_TYPE:
            lua_getuservalue(L, -1);
            lua_rawseti(L, -3, ++num_upvals); /* usr value */
            lua_rawseti(L, -2, ++num_upvals); /* mt */
            | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1
            | call_rrrp, extern check_enum, L_ARG, -2, -1, mt
            | mov [rsp+32], eax
            | call_rr extern lua_settop, L_ARG, -4
            | mov eax, [rsp+32]
            break;

        case VOID_TYPE:
            lua_pop(L, 1);
            | call_rr extern lua_settop, L_ARG, -2
            break;

        case BOOL_TYPE:
        case INT8_TYPE:
        case INT16_TYPE:
        case INT32_TYPE:
            lua_pop(L, 1);
            if (mt->is_unsigned) {
                | call_rr extern check_uint32, L_ARG, -1
            } else {
                | call_rr extern check_int32, L_ARG, -1
            }
            | mov [rsp+32], eax
            | call_rr extern lua_settop, L_ARG, -3
            | mov eax, [rsp+32]
            break;

        case INT64_TYPE:
            lua_pop(L, 1);
            if (mt->is_unsigned) {
                | call_rr extern check_uint64, L_ARG, -1
            } else {
                | call_rr extern check_int64, L_ARG, -1
            }
            |.if X64
            | mov [rsp+32], rax
            |.else
            | mov [rsp+32], RET_L
            | mov [rsp+36], RET_H
            |.endif
            | call_rr extern lua_settop, L_ARG, -3
            |.if X64
            | mov rax, [rsp+32]
            |.else
            | mov RET_L, [rsp+32]
            | mov RET_H, [rsp+36]
            |.endif
            break;

        case INTPTR_TYPE:
            lua_pop(L, 1);
            | call_rr extern check_uintptr, L_ARG, -1
            | mov [rsp+32], rax
            | call_rr extern lua_settop, L_ARG, -3
            | mov rax, [rsp+32]
            break;

        case FLOAT_TYPE:
        case DOUBLE_TYPE:
            lua_pop(L, 1);
            | call_rr extern check_double, L_ARG, -1
            |.if X64
            | movq qword [rsp+32], xmm0
            | call_rr extern lua_settop, L_ARG, -3
            if (mt->type == FLOAT_TYPE) {
                | cvtsd2ss xmm0, qword [rsp+32]
            } else {
                | movq xmm0, qword [rsp+32]
            }
            |.else
            | fstp qword [rsp+32]
            | call_rr extern lua_settop, L_ARG, -3
            | fld qword [rsp+32]
            |.endif
            break;

        case COMPLEX_FLOAT_TYPE:
            lua_pop(L, 1);
#if !defined HAVE_COMPLEX
            luaL_error(L, "ffi lib compiled without complex number support");
#endif
            /* on 64 bit complex floats are two floats packed into a double,
             * on 32 bit returned complex floats use eax and edx */
            | call_rr extern check_complex_float, L_ARG, -1
            |
            |.if X64
            | movq qword [rsp+32], xmm0
            |.else
            | mov [rsp+32], eax
            | mov [rsp+36], edx
            |.endif
            |
            | call_rr extern lua_settop, L_ARG, -3
            |
            |.if X64
            | movq xmm0, qword [rsp+32]
            |.else
            | mov eax, [rsp+32]
            | mov edx, [rsp+36]
            |.endif
            break;

        case COMPLEX_DOUBLE_TYPE:
            lua_pop(L, 1);
#if !defined HAVE_COMPLEX
            luaL_error(L, "ffi lib compiled without complex number support");
#endif
            /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit
             * there is a hidden first parameter that points to 16 bytes where
             * the returned arg is stored which is popped by the called
             * function */
#if defined _WIN64 || defined __amd64__
            | call_rr extern check_complex_double, L_ARG, -1
            | movq qword [rsp+32], xmm0
            | movq qword [rsp+40], xmm1
            | call_rr extern lua_settop, L_ARG, -3
            | movq xmm0, qword [rsp+32]
            | movq xmm1, qword [rsp+40]
#else
            | mov rcx, [rbp + hidden_arg_off]
            | call_rrr extern check_complex_double, rcx, L_ARG, -1
            | sub rsp, 4 // to realign from popped hidden arg
            | call_rr extern lua_settop, L_ARG, -3
#endif
            break;

        default:
            luaL_error(L, "NYI: callback return type");
        }
    }

    |.if X64
    | mov L_ARG, [rbp-8]
    |.else
    | mov L_ARG, [rbp-4]
    |.endif
    | mov rsp, rbp
    | pop rbp
    | ret x86_return_size(L, ct_usr, ct)
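    /* For stdcall/fastcall the callee pops its own arguments, so the ret
     * above carries the byte count computed by x86_return_size; for a plain
     * cdecl callback that count is normally 0, the one exception being the
     * hidden complex double return pointer on x86 which the callee also
     * pops. */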
    lua_pop(L, 1); /* upval table - already in registry */
    assert(lua_gettop(L) == top);

    ct2.is_jitted = 1;
    pf = (cfunction*) push_cdata(L, ct_usr, &ct2);
    *pf = compile(Dst, L, NULL, ref);

    assert(lua_gettop(L) == top + 1);

    return *pf;
}

void compile_function(lua_State* L, cfunction func, int ct_usr, const struct ctype* ct)
{
    size_t i, nargs;
    int num_upvals;
    const struct ctype* mbr_ct;
    struct jit* Dst = get_jit(L);
    struct reg_alloc reg;
    void* p;
    int top = lua_gettop(L);
    int* perr = &Dst->last_errno;

    ct_usr = lua_absindex(L, ct_usr);

    memset(&reg, 0, sizeof(reg));
    reg.off = 32 + REGISTER_STACK_SPACE(ct);

    dasm_setup(Dst, build_actionlist);

    p = push_cdata(L, ct_usr, ct);
    *(cfunction*) p = func;
    num_upvals = 1;

    nargs = lua_rawlen(L, ct_usr);

    if (ct->calling_convention != C_CALL && ct->has_var_arg) {
        luaL_error(L, "vararg is only allowed with the c calling convention");
    }

    | push rbp
    | mov rbp, rsp
    | push L_ARG
    | push TOP
    | // stack is 0 (mod 16) (TOP, L_ARG, rbp, rip)
    |
    | // Get L from our arguments and allocate some stack for lua_gettop
    |.if X64WIN
    | mov L_ARG, rcx
    | sub rsp, 32 // shadow space
    |.elif X64
    | mov L_ARG, rdi
    |.else
    | mov L_ARG, [rbp + 8]
    | sub rsp, 16
    |.endif
    |
    | call_r extern lua_gettop, L_ARG
    | mov TOP, rax // no need for movzxd rax, eax - high word guaranteed to be zero by x86-64
    | cmp rax, nargs

    if (!ct->has_var_arg) {
        | jge >2
        | too_few_arguments
        | 2:
        | jle >1
        | too_many_arguments
    } else {
        | jge >1
        | too_few_arguments
    }

    | 1:

    /* no need to zero extend eax returned by lua_gettop to rax as x86-64
     * guarantees that the upper 32 bits will be zero */
    | shl rax, 4 // reserve 16 bytes per argument - this maintains the alignment mod 16
    | sub rsp, rax
    | sub rsp, 32 + REGISTER_STACK_SPACE(ct) // reserve an extra 32 to call local functions
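    /* A rough budget of the space just reserved, using a call that receives 3
     * Lua arguments on posix x64 as an example: 3*16 = 48 bytes of
     * per-argument scratch (16 bytes each keeps the frame aligned mod 16),
     * plus 32 bytes so the helper functions called below have scratch of
     * their own, plus REGISTER_STACK_SPACE(ct) = 112 bytes where add_int and
     * add_float stage the values destined for registers. */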
#if !defined _WIN64 && !defined __amd64__
    /* Returned complex doubles require a hidden first parameter where the
     * data is stored, which is popped by the called function. */
    lua_rawgeti(L, ct_usr, 0);
    mbr_ct = (const struct ctype*) lua_touserdata(L, -1);
    if (!mbr_ct->pointers && !mbr_ct->is_reference && mbr_ct->type == COMPLEX_DOUBLE_TYPE) {
        /* we can allocate more space for arguments as long as no add_*
         * function has been called yet, mbr_ct will be added as an upvalue in
         * the return processing later */
        | call_rrp extern push_cdata, L_ARG, 0, mbr_ct
        | sub rsp, 16
        add_pointer(Dst, ct, &reg);
    }
    lua_pop(L, 1);
#endif

    for (i = 1; i <= nargs; i++) {
        lua_rawgeti(L, ct_usr, (int) i);
        mbr_ct = (const struct ctype*) lua_touserdata(L, -1);

        if (mbr_ct->pointers || mbr_ct->is_reference) {
            lua_getuservalue(L, -1);
            num_upvals += 2;
            | call_rrrp extern check_typed_pointer, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct
            add_pointer(Dst, ct, &reg);
        } else {
            switch (mbr_ct->type) {
            case FUNCTION_PTR_TYPE:
                lua_getuservalue(L, -1);
                num_upvals += 2;
                | call_rrrp extern check_typed_cfunction, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct
                add_pointer(Dst, ct, &reg);
                break;

            case ENUM_TYPE:
                lua_getuservalue(L, -1);
                num_upvals += 2;
                | call_rrrp, extern check_enum, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct
                add_int(Dst, ct, &reg, 0);
                break;

            case INT8_TYPE:
                | call_rr extern check_int32, L_ARG, i
                if (mbr_ct->is_unsigned) {
                    | movzx eax, al
                } else {
                    | movsx eax, al
                }
                add_int(Dst, ct, &reg, 0);
                lua_pop(L, 1);
                break;

            case INT16_TYPE:
                | call_rr extern check_int32, L_ARG, i
                if (mbr_ct->is_unsigned) {
                    | movzx eax, ax
                } else {
                    | movsx eax, ax
                }
                add_int(Dst, ct, &reg, 0);
                lua_pop(L, 1);
                break;

            case BOOL_TYPE:
                | call_rr extern check_int32, L_ARG, i
                | cmp eax, 0
                | setne al
                | movzx eax, al
                add_int(Dst, ct, &reg, 0);
                lua_pop(L, 1);
                break;

            case INT32_TYPE:
                if (mbr_ct->is_unsigned) {
                    | call_rr extern check_uint32, L_ARG, i
                } else {
                    | call_rr extern check_int32, L_ARG, i
                }
                add_int(Dst, ct, &reg, 0);
                lua_pop(L, 1);
                break;

            case INTPTR_TYPE:
                | call_rr extern check_uintptr, L_ARG, i
                add_pointer(Dst, ct, &reg);
                lua_pop(L, 1);
                break;

            case INT64_TYPE:
                if (mbr_ct->is_unsigned) {
                    | call_rr extern check_uint64, L_ARG, i
                } else {
                    | call_rr extern check_int64, L_ARG, i
                }
                add_int(Dst, ct, &reg, 1);
                lua_pop(L, 1);
                break;

            case DOUBLE_TYPE:
                | call_rr extern check_double, L_ARG, i
                add_float(Dst, ct, &reg, 1);
                lua_pop(L, 1);
                break;

            case COMPLEX_DOUBLE_TYPE:
                /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit
                 * there is a hidden first parameter that points to 16 bytes where
                 * the returned arg is stored (this is popped by the called
                 * function) */
#if defined _WIN64 || defined __amd64__
                | call_rr extern check_complex_double, L_ARG, i
                add_float(Dst, ct, &reg, 1);
                | movq xmm0, xmm1
                add_float(Dst, ct, &reg, 1);
#else
                | lea rax, [rsp+reg.off]
                | sub rsp, 4
                | call_rrr extern check_complex_double, rax, L_ARG, i
                reg.off += 16;
#endif
                lua_pop(L, 1);
                break;

            case FLOAT_TYPE:
                | call_rr extern check_double, L_ARG, i
                add_float(Dst, ct, &reg, 0);
                lua_pop(L, 1);
                break;

            case COMPLEX_FLOAT_TYPE:
#if defined _WIN64 || defined __amd64__
                | call_rr extern check_complex_float, L_ARG, i
                /* complex floats are two floats packed into a double */
                add_float(Dst, ct, &reg, 1);
#else
                /* returned complex floats use eax and edx */
                | call_rr extern check_complex_float, L_ARG, i
                | mov [rsp], eax
                | fld dword [rsp]
                add_float(Dst, ct, &reg, 0);
                | mov [rsp], edx
                | fld dword [rsp]
                add_float(Dst, ct, &reg, 0);
#endif
                lua_pop(L, 1);
                break;

            default:
                luaL_error(L, "NYI: call arg type");
            }
        }
    }
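    /* Any Lua arguments beyond the declared parameters are dealt with below:
     * for a vararg call the unpack_varargs_* helpers copy Lua stack slots
     * nargs+1 through TOP into whichever argument registers are still unused
     * and then onto the stack, mirroring the fixed-argument layout built up
     * above. */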
    if (ct->has_var_arg) {
#ifdef _WIN64
        |.if X64WIN
        if (reg.regs < MAX_REGISTERS(ct)) {
            assert(reg.regs == nargs);
            | cmp TOP, MAX_REGISTERS(ct)
            | jle >1
            | // unpack onto stack
            | mov rax, rsp
            | add rax, 32 + 8*MAX_REGISTERS(ct)
            | call_rrrr extern unpack_varargs_stack, L_ARG, MAX_REGISTERS(ct)+1, TOP, rax
            | // unpack to registers
            | mov rax, rsp
            | add rax, 32 + 8*(reg.regs)
            | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, MAX_REGISTERS(ct), rax
            | jmp >2
            |1:
            | // unpack just to registers
            | mov rax, rsp
            | add rax, 32 + 8*(reg.regs)
            | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, TOP, rax
            |2:
        } else {
            | // unpack just to stack
            | mov rax, rsp
            | add rax, reg.off
            | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax
        }

        for (i = nargs; i < MAX_REGISTERS(ct); i++) {
            reg.is_int[i] = reg.is_float[i] = 1;
        }
        reg.regs = MAX_REGISTERS(ct);
#elif defined __amd64__
        |.elif X64
        if (reg.floats < MAX_FLOAT_REGISTERS(ct)) {
            | mov rax, rsp
            | add rax, 32 + 8*(MAX_INT_REGISTERS(ct) + reg.floats)
            | call_rrrrr extern unpack_varargs_float, L_ARG, nargs+1, TOP, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax
        }

        if (reg.ints < MAX_INT_REGISTERS(ct)) {
            | mov rax, rsp
            | add rax, 32 + 8*(reg.ints)
            | call_rrrrr extern unpack_varargs_int, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, rax
        }

        | mov rax, rsp
        | add rax, reg.off
        | call_rrrrrr extern unpack_varargs_stack_skip, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax

        reg.floats = MAX_FLOAT_REGISTERS(ct);
        reg.ints = MAX_INT_REGISTERS(ct);
#else
        |.else
        | mov rax, rsp
        | add rax, reg.off
        | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax
        |.endif
#endif
    }

    | mov64 rcx, perr
    | mov eax, dword [rcx]
    | call_r extern SetLastError, rax

    /* remove the stack space to call local functions */
    |.if X32WIN
    | add rsp, 28 // SetLastError will have already popped 4
    |.else
    | add rsp, 32
    |.endif

#ifdef _WIN64
    |.if X64WIN
    switch (reg.regs) {
    case 4:
        if (reg.is_float[3]) {
            | movq xmm3, qword [rsp + 8*3]
        }
        if (reg.is_int[3]) {
            | mov r9, [rsp + 8*3]
        }
    case 3:
        if (reg.is_float[2]) {
            | movq xmm2, qword [rsp + 8*2]
        }
        if (reg.is_int[2]) {
            | mov r8, [rsp + 8*2]
        }
    case 2:
        if (reg.is_float[1]) {
            | movq xmm1, qword [rsp + 8*1]
        }
        if (reg.is_int[1]) {
            | mov rdx, [rsp + 8*1]
        }
    case 1:
        if (reg.is_float[0]) {
            | movq xmm0, qword [rsp]
        }
        if (reg.is_int[0]) {
            | mov rcx, [rsp]
        }
    case 0:
        break;
    }

    /* don't remove the space for the registers as we need 32 bytes of register overflow space */
    assert(REGISTER_STACK_SPACE(ct) == 32);

#elif defined __amd64__
    |.elif X64
    switch (reg.floats) {
    case 8:
        | movq xmm7, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+7)]
    case 7:
        | movq xmm6, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+6)]
    case 6:
        | movq xmm5, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+5)]
    case 5:
        | movq xmm4, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+4)]
    case 4:
        | movq xmm3, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+3)]
    case 3:
        | movq xmm2, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+2)]
    case 2:
        | movq xmm1, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+1)]
    case 1:
        | movq xmm0, qword [rsp + 8*(MAX_INT_REGISTERS(ct))]
    case 0:
        break;
    }

    switch (reg.ints) {
    case 6:
        | mov r9, [rsp + 8*5]
    case 5:
        | mov r8, [rsp + 8*4]
    case 4:
        | mov rcx, [rsp + 8*3]
    case 3:
        | mov rdx, [rsp + 8*2]
    case 2:
        | mov rsi, [rsp + 8*1]
    case 1:
        | mov rdi, [rsp]
    case 0:
        break;
    }

    | add rsp, REGISTER_STACK_SPACE(ct)
#else
    |.else
    if (ct->calling_convention == FAST_CALL) {
        switch (reg.ints) {
        case 2:
            | mov edx, [rsp + 4]
        case 1:
            | mov ecx, [rsp]
        case 0:
            break;
        }

        | add rsp, REGISTER_STACK_SPACE(ct)
    }
    |.endif
#endif
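    /* Note the switches above fall through on purpose: starting from the
     * highest register index in use, each case reloads one argument register
     * from the staging area, so only the registers this particular call needs
     * are touched. With that done the stack and registers are laid out for
     * the target calling convention and the call itself follows. */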
#ifdef __amd64__
    if (ct->has_var_arg) {
        /* al stores an upper limit on the number of float registers, note that
         * it's allowed to be more than the actual number of float registers used
         * as long as it's 0-8 */
        |.if X64 and not X64WIN
        | mov al, 8
        |.endif
    }
#endif

    | call extern FUNCTION

    | sub rsp, 48 // 32 to be able to call local functions, 16 so we can store some local variables

    /* note on windows X86 the stack may be only aligned to 4 (stdcall will
     * have popped a multiple of 4 bytes), but we don't need 16 byte alignment on
     * that platform */

    lua_rawgeti(L, ct_usr, 0);
    mbr_ct = (const struct ctype*) lua_touserdata(L, -1);

    if (mbr_ct->pointers || mbr_ct->is_reference || mbr_ct->type == INTPTR_TYPE) {
        lua_getuservalue(L, -1);
        num_upvals += 2;
        | mov [rsp+32], rax // save the pointer
        | get_errno
        | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct
        | mov rcx, [rsp+32]
        | mov [rax], rcx // *(void**) cdata = val
        | lua_return_arg
    } else {
        switch (mbr_ct->type) {
        case FUNCTION_PTR_TYPE:
            lua_getuservalue(L, -1);
            num_upvals += 2;
            | mov [rsp+32], rax // save the function pointer
            | get_errno
            | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct
            | mov rcx, [rsp+32]
            | mov [rax], rcx // *(cfunction**) cdata = val
            | lua_return_arg
            break;

        case INT64_TYPE:
#if LUA_VERSION_NUM == 503
            lua_pop(L, 1);
            if (mbr_ct->is_unsigned) {
                | lua_return_ulong
            } else {
                | lua_return_long
            }
#else
            num_upvals++;
            | // save the return value
            |.if X64
            | mov [rsp+32], rax
            |.else
            | mov [rsp+36], edx // high
            | mov [rsp+32], eax // low
            |.endif
            |
            | get_errno
            | call_rrp extern push_cdata, L_ARG, 0, mbr_ct
            |
            | // *(int64_t*) cdata = val
            |.if X64
            | mov rcx, [rsp+32]
            | mov [rax], rcx
            |.else
            | mov rcx, [rsp+36]
            | mov rdx, [rsp+32]
            | mov [rax+4], rcx
            | mov [rax], rdx
            |.endif
            |
            | lua_return_arg
#endif
            break;

        case COMPLEX_FLOAT_TYPE:
            lua_getuservalue(L, -1);
            num_upvals += 2;
            |.if X64
            | // complex floats are returned as two floats packed into xmm0
            | movq qword [rsp+32], xmm0
            |.else
            | // complex floats are returned as floats in eax and edx
            | mov [rsp+32], eax
            | mov [rsp+36], edx
            |.endif
            |
            | get_errno
            | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct
            |
            | // ((complex_float*) cdata) = val
            |.if X64
            | mov rcx, [rsp+32]
            | mov [rax], rcx
            |.else
            | mov ecx, [rsp+32]
            | mov [rax], ecx
            | mov ecx, [rsp+36]
            | mov [rax+4], ecx
            |.endif
            |
            | lua_return_arg
            break;

        case COMPLEX_DOUBLE_TYPE:
            lua_getuservalue(L, -1);
            num_upvals += 2;
            |.if X64
            | // complex doubles are returned as xmm0 and xmm1
            | movq qword [rsp+40], xmm1
            | movq qword [rsp+32], xmm0
            |
            | get_errno
            | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct
            |
            | // ((complex_double*) cdata)->real = val0
            | // ((complex_double*) cdata)->imag = val1
            | mov rcx, [rsp+40]
            | mov [rax+8], rcx
            | mov rcx, [rsp+32]
            | mov [rax], rcx
            |
            |.else
            | // On 32 bit we have already handled this by pushing a new cdata
            | // and handing the cdata ptr in as the hidden first param, but
            | // still need to add mbr_ct as an upval as it's used earlier.
            | // Hidden param was popped by called function, we need to realign.
            | sub rsp, 4
            | get_errno
            |.endif
            |
            | lua_return_arg
            break;

        case VOID_TYPE:
            lua_pop(L, 1);
            | lua_return_void
            break;

        case BOOL_TYPE:
            lua_pop(L, 1);
            | lua_return_bool
            break;

        case INT8_TYPE:
            lua_pop(L, 1);
            if (mbr_ct->is_unsigned) {
                | movzx eax, al
            } else {
                | movsx eax, al
            }
            | lua_return_int
            break;

        case INT16_TYPE:
            lua_pop(L, 1);
            if (mbr_ct->is_unsigned) {
                | movzx eax, ax
            } else {
                | movsx eax, ax
            }
            | lua_return_int
            break;

        case INT32_TYPE:
        case ENUM_TYPE:
            lua_pop(L, 1);
            if (mbr_ct->is_unsigned) {
                | lua_return_uint
            } else {
                | lua_return_int
            }
            break;

        case FLOAT_TYPE:
            lua_pop(L, 1);
            |.if X64
            | cvtss2sd xmm0, xmm0
            |.endif
            | lua_return_double
            break;

        case DOUBLE_TYPE:
            lua_pop(L, 1);
            | lua_return_double
            break;

        default:
            luaL_error(L, "NYI: call return type");
        }
    }

    assert(lua_gettop(L) == top + num_upvals);

    {
        cfunction f = compile(Dst, L, func, LUA_NOREF);
        /* add a callback as an upval so that the jitted code gets cleaned up when
         * the function gets gc'd */
        push_callback(L, f, func);
        lua_pushcclosure(L, (lua_CFunction) f, num_upvals+1);
    }
}