diff options
-rw-r--r-- | orc/orcarm.h | 2 | ||||
-rw-r--r-- | orc/orcprogram-neon.c | 61 | ||||
-rw-r--r-- | orc/orcrules-neon.c | 121 |
3 files changed, 176 insertions, 8 deletions
diff --git a/orc/orcarm.h b/orc/orcarm.h
index a04bcd8..84b18ca 100644
--- a/orc/orcarm.h
+++ b/orc/orcarm.h
@@ -440,6 +440,8 @@ ORC_API void orc_arm64_emit_ret (OrcCompiler *p, int Rn);
   orc_arm64_emit_am(p,bits,ORC_ARM64_DP_ADD,ORC_ARM64_TYPE_REG,0,Rd,Rn,Rm,0)
 #define orc_arm64_emit_add_lsl(p,bits,Rd,Rn,Rm,val) \
   orc_arm64_emit_am(p,bits,ORC_ARM64_DP_ADD,ORC_ARM64_TYPE_REG,ORC_ARM_LSL,Rd,Rn,Rm,val)
+#define orc_arm64_emit_add_lsr(p,bits,Rd,Rn,Rm,val) \
+  orc_arm64_emit_am(p,bits,ORC_ARM64_DP_ADD,ORC_ARM64_TYPE_REG,ORC_ARM_LSR,Rd,Rn,Rm,val) /* register ADD using the LSR shift type; same shape as the LSL/ASR/ROR variants around it */
 #define orc_arm64_emit_add_asr(p,bits,Rd,Rn,Rm,val) \
   orc_arm64_emit_am(p,bits,ORC_ARM64_DP_ADD,ORC_ARM64_TYPE_REG,ORC_ARM_ASR,Rd,Rn,Rm,val)
 #define orc_arm64_emit_add_ror(p,bits,Rd,Rn,Rm,val) \
diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c
index 8d55411..1e0ab2c 100644
--- a/orc/orcprogram-neon.c
+++ b/orc/orcprogram-neon.c
@@ -311,6 +311,17 @@ orc_compiler_neon_init (OrcCompiler *compiler)
     compiler->unroll_shift = 0;
   }
 
+  if (compiler->is_64bit) { /* The loadupdb is aarch64 only so far */
+    for(i=0;i<compiler->n_insns;i++){
+      OrcInstruction *insn = compiler->insns + i;
+      OrcStaticOpcode *opcode = insn->opcode;
+
+      if (strcmp (opcode->name, "loadupdb") == 0) {
+        compiler->vars[insn->src_args[0]].need_offset_reg = TRUE; /* flag the source variable of every loadupdb as needing an offset register */
+      }
+    }
+  }
+
   if (0) {
     compiler->need_mask_regs = TRUE;
   }
@@ -343,6 +354,34 @@ orc_neon_load_constants_outer (OrcCompiler *compiler)
   }
 
   orc_compiler_emit_invariants (compiler);
+
+  if (compiler->is_64bit) { /* The loadupdb is aarch64 only so far */
+    for(i=0;i<compiler->n_insns;i++){
+      OrcInstruction *insn = compiler->insns + i;
+      OrcStaticOpcode *opcode = insn->opcode;
+
+      if (strcmp (opcode->name, "loadupdb") == 0) {
+        if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
+          orc_arm64_emit_load_reg (compiler, 64, /* NOTE(review): 64-bit load from params[] - confirm the element width declared in OrcExecutor */
+              compiler->vars[insn->src_args[0]].ptr_offset,
+              compiler->exec_reg,
+              ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]));
+        } else {
+          if (!compiler->vars[insn->src_args[0]].ptr_offset)
+            continue;
+          if (!compiler->vars[insn->src_args[1]].value.i)
+            orc_arm64_emit_eor(compiler, 64, /* eor of the register with itself: clears ptr_offset when the constant is 0 */
+                compiler->vars[insn->src_args[0]].ptr_offset,
+                compiler->vars[insn->src_args[0]].ptr_offset,
+                compiler->vars[insn->src_args[0]].ptr_offset);
+          else
+            orc_arm64_emit_load_imm(compiler, 64,
+                compiler->vars[insn->src_args[0]].ptr_offset,
+                compiler->vars[insn->src_args[1]].value.i);
+        }
+      }
+    }
+  }
 }
 
 static void
@@ -363,6 +402,11 @@ orc_neon_load_constants_inner (OrcCompiler *compiler)
             orc_arm64_emit_load_reg (compiler, 64,
                 compiler->vars[i].ptr_register,
                 compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]));
+            if (compiler->vars[i].ptr_offset)
+              orc_arm64_emit_eor(compiler, 64, /* clear the offset register right after the array pointer is loaded */
+                  compiler->vars[i].ptr_offset,
+                  compiler->vars[i].ptr_offset,
+                  compiler->vars[i].ptr_offset);
           } else {
             orc_arm_emit_load_reg (compiler,
                 compiler->vars[i].ptr_register,
@@ -1125,22 +1169,23 @@ orc_neon_emit_loop (OrcCompiler *compiler, int unroll_index)
     if (compiler->vars[k].name == NULL) continue;
     if (compiler->vars[k].vartype == ORC_VAR_TYPE_SRC ||
         compiler->vars[k].vartype == ORC_VAR_TYPE_DEST) {
-      if (compiler->vars[k].ptr_register) {
-        if (compiler->is_64bit)
+      if (compiler->is_64bit) {
+        if (compiler->vars[k].ptr_offset) { /* a variable with an offset register advances that register, not ptr_register */
+          orc_arm64_emit_add_imm (compiler, 64,
+              compiler->vars[k].ptr_offset,
+              compiler->vars[k].ptr_offset,
+              compiler->vars[k].size << compiler->loop_shift);
+        } else if (compiler->vars[k].ptr_register) {
           orc_arm64_emit_add_imm (compiler, 64,
               compiler->vars[k].ptr_register,
               compiler->vars[k].ptr_register,
               compiler->vars[k].size << compiler->loop_shift);
-        else
+        }
+      } else { /* NOTE(review): the 32-bit path no longer tests ptr_register before emitting the add - confirm it is always set here */
           orc_arm_emit_add_imm (compiler,
               compiler->vars[k].ptr_register,
               compiler->vars[k].ptr_register,
               compiler->vars[k].size << compiler->loop_shift);
-      } else {
-        /* arm_emit_add_imm_memoffset (compiler, arm_ptr_size, */
-        /* compiler->vars[k].size << compiler->loop_shift, */
-        /* (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]), */
-        /* p->exec_reg); */
       }
     }
   }
diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c
index 7e8e3db..2764e9e 100644
--- a/orc/orcrules-neon.c
+++ b/orc/orcrules-neon.c
@@ -1077,6 +1077,126 @@ orc_neon_storeq (OrcCompiler *compiler, int dest, int update, int src1, int is_a
 #endif
 
 static void
+neon_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn)
+{ /* Rule for "loadupdb" (aarch64 only): emits one ld1 into compiler->tmpreg, then a zip1 of tmpreg with itself into the destination variable. */
+  OrcVariable *src = compiler->vars + insn->src_args[0];
+  unsigned int code = 0;
+  int size = src->size << compiler->insn_shift;
+  ORC_ASSERT(src->ptr_register); /* can ptr_register be 0 ? */
+  int ptr_reg;
+
+  if (!compiler->is_64bit) {
+    ORC_COMPILER_ERROR(compiler, "loadupdb is implemented only on aarch64");
+    return;
+  }
+
+  /* FIXME this should be fixed at a higher level */
+  if (src->vartype != ORC_VAR_TYPE_SRC && src->vartype != ORC_VAR_TYPE_DEST) {
+    ORC_COMPILER_ERROR(compiler, "loadX used with non src/dest"); /* NOTE(review): message says loadX although this is the loadupdb rule */
+    return;
+  }
+
+  if (src->ptr_offset) {
+    ptr_reg = compiler->gp_tmpreg;
+    orc_arm64_emit_add_lsr(compiler, 64, ptr_reg, src->ptr_register, src->ptr_offset, 1); /* presumably ptr_reg = ptr_register + (ptr_offset >> 1) - confirm against orc_arm64_emit_am */
+  } else {
+    ptr_reg = src->ptr_register;
+  }
+
+  int opcode, flag;
+
+  if (size >= 16) { /* was "> 16", which made the size == 16 arm below unreachable and sent size 16 to the "bad unaligned load size" error */
+    /** load multiple single-element structures to one, two, three, or four registers */
+    char vt_str[64];
+
+    memset(vt_str, '\x00', 64);
+
+    if (size == 64) {
+      snprintf(vt_str, 64, "%s, %s, %s, %s",
+          orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1),
+          orc_neon64_reg_name_vector (compiler->tmpreg + 1, 1, 1),
+          orc_neon64_reg_name_vector (compiler->tmpreg + 2, 1, 1),
+          orc_neon64_reg_name_vector (compiler->tmpreg + 3, 1, 1));
+      opcode = 0x2;
+    } else if (size == 32) {
+      snprintf(vt_str, 64, "%s, %s",
+          orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1),
+          orc_neon64_reg_name_vector (compiler->tmpreg + 1, 1, 1));
+      opcode = 0xa;
+    } else if (size == 16) {
+      snprintf(vt_str, 64, "%s",
+          orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1));
+      opcode = 0x7;
+    } else {
+      ORC_COMPILER_ERROR(compiler,"bad aligned load size %d",
+          src->size << compiler->insn_shift);
+      return;
+    }
+    flag = 0; /* Bytes */
+
+    ORC_ASM_CODE(compiler," ld1 { %s }, [%s]\n",
+        vt_str, orc_arm64_reg_name (ptr_reg, 64));
+    code = 0x0c400000;
+    code |= 0 << 30; /* Q-bit */
+    code |= (flag&0x3) << 10;
+    code |= (opcode&0xf) << 12;
+  } else {
+    /** load one single-element structure to one lane of one register */
+    flag = 0;
+    if (size == 8) {
+      opcode = 4;
+      flag = 1; /* size==01 */
+    } else if (size == 4) {
+      opcode = 4;
+    } else if (size == 2) {
+      opcode = 2;
+    } else if (size == 1) {
+      opcode = 0;
+    } else {
+      ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d",
+          src->size << compiler->insn_shift);
+      return;
+    }
+    ORC_ASM_CODE(compiler," ld1 { %s }[0], [%s]\n",
+        orc_neon64_reg_name_vector_single (compiler->tmpreg, size),
+        orc_arm64_reg_name (ptr_reg, 64));
+    code = 0x0d400000;
+    code |= (opcode&0x7) << 13;
+    code |= (flag&0x3) << 10;
+  }
+
+  code |= (ptr_reg&0x1f) << 5; /* base register field */
+  code |= (compiler->tmpreg&0x1f); /* target vector register field */
+
+  orc_arm_emit (compiler, code);
+
+  OrcVariable tmpreg = { .alloc = compiler->tmpreg, .size = compiler->vars[insn->src_args[0]].size };
+
+  switch (src->size) { /* NOTE(review): no default case - for any other src->size no zip1 is emitted; confirm that cannot occur */
+    case 1:
+      orc_neon64_emit_binary (compiler, "zip1", 0x0e003800,
+          compiler->vars[insn->dest_args[0]],
+          tmpreg,
+          tmpreg, compiler->insn_shift - 1);
+      break;
+    case 2:
+      orc_neon64_emit_binary (compiler, "zip1", 0x0e403800,
+          compiler->vars[insn->dest_args[0]],
+          tmpreg,
+          tmpreg, compiler->insn_shift - 1);
+      break;
+    case 4:
+      orc_neon64_emit_binary (compiler, "zip1", 0x0e803800,
+          compiler->vars[insn->dest_args[0]],
+          tmpreg,
+          tmpreg, compiler->insn_shift - 1);
+      break;
+  }
+
+  src->update_type = 1;
+}
+
+static void
 neon_rule_loadpX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
 {
   OrcVariable *src = compiler->vars + insn->src_args[0];
@@ -4388,6 +4508,7 @@ orc_compiler_neon_register_rules (OrcTarget *target)
   orc_rule_register (rule_set, "loadpw", neon_rule_loadpX, (void *)2);
   orc_rule_register (rule_set, "loadpl", neon_rule_loadpX, (void *)4);
   orc_rule_register (rule_set, "loadpq", neon_rule_loadpX, (void *)8);
+  orc_rule_register (rule_set, "loadupdb", neon_rule_loadupdb, (void *)0);
   orc_rule_register (rule_set, "loadb", neon_rule_loadX, (void *)0);
   orc_rule_register (rule_set, "loadw", neon_rule_loadX, (void *)0);
   orc_rule_register (rule_set, "loadl", neon_rule_loadX, (void *)0);