diff options
author | Dongju Chae <dongju.chae@samsung.com> | 2019-10-21 14:21:47 +0300 |
---|---|---|
committer | Marek Vasut <marex@denx.de> | 2020-09-16 15:27:27 +0300 |
commit | 6e6cd76110788c9271d7384350c8a848905c396a (patch) | |
tree | 9b3860f871f03689ff2b3fe1873cf83529cf4e3c | |
parent | 1f36d3a371ecf2dc1166e0434a9b83b074806a2a (diff) |
aarch64: orcprogram-neon porting to aarch64
This PR ports orcprogram-neon.c for AArch64 support.
It makes the orc compiler generate AArch64 assembly code.
Currently, example1 successfully runs on AArch64 (tested on an Android device).
Signed-off-by: Dongju Chae <dongju.chae@samsung.com>
-rw-r--r-- | orc/orcprogram-neon.c | 665 |
1 files changed, 472 insertions, 193 deletions
diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index cf87249..6793622 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -34,12 +34,15 @@ orc_neon_emit_prologue (OrcCompiler *compiler) { unsigned int regs = 0; orc_uint32 vregs = 0; + int num_gregs; int i; orc_compiler_append_code(compiler,".global %s\n", compiler->program->name); orc_compiler_append_code(compiler,"%s:\n", compiler->program->name); - for(i=0;i<16;i++){ + num_gregs = compiler->is_64bit ? 32 : 16; + + for(i=0;i<num_gregs;i++){ if (compiler->used_regs[ORC_GP_REG_BASE + i] && compiler->save_regs[ORC_GP_REG_BASE + i]) { regs |= (1<<i); @@ -82,10 +85,13 @@ static void orc_neon_emit_epilogue (OrcCompiler *compiler) { int i; + int num_gregs; unsigned int regs = 0; orc_uint32 vregs = 0; - for(i=0;i<16;i++){ + num_gregs = compiler->is_64bit ? 32 : 16; + + for(i=0;i<num_gregs;i++){ if (compiler->used_regs[ORC_GP_REG_BASE + i] && compiler->save_regs[ORC_GP_REG_BASE + i]) { regs |= (1<<i); @@ -157,36 +163,78 @@ orc_compiler_neon_init (OrcCompiler *compiler) int i; int loop_shift; - for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){ - compiler->valid_regs[i] = 1; + if (compiler->target_flags & ORC_TARGET_NEON_64BIT) { + compiler->is_64bit = TRUE; } - for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i+=2){ - compiler->valid_regs[i] = 1; - } - /* compiler->valid_regs[ORC_ARM_SB] = 0; */ - compiler->valid_regs[ORC_ARM_IP] = 0; - compiler->valid_regs[ORC_ARM_SP] = 0; - compiler->valid_regs[ORC_ARM_LR] = 0; - compiler->valid_regs[ORC_ARM_PC] = 0; - for(i=4;i<12;i++) { - compiler->save_regs[ORC_GP_REG_BASE+i] = 1; + + if (compiler->is_64bit) { + /** AArch64 + * 31 64-bit generic-purpose registers (R0-R30) and SP + * 32 128-bit vector registers (do not overlap multiple registers in a narrower view) + * Note that PC is not a generic-purpose register in AArch64 + */ + for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+32;i++){ + compiler->valid_regs[i] = 1; + } + 
for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i++){ + compiler->valid_regs[i] = 1; + } + + compiler->valid_regs[ORC_ARM64_IP0] = 0; + compiler->valid_regs[ORC_ARM64_IP1] = 0; + + compiler->valid_regs[ORC_ARM64_FP] = 0; + compiler->valid_regs[ORC_ARM64_LR] = 0; + compiler->valid_regs[ORC_ARM64_SP] = 0; + + /** r19 to r29 are callee-saved */ + for(i=19;i<29;i++) { + compiler->save_regs[ORC_GP_REG_BASE+i] = 1; + } + } else { + /** AArch32 + * 16 32-bit generic-purpose registers (R0-R15) + * 32 64-bit vector registers (smaller registers are packed into larger ones) + */ + for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){ + compiler->valid_regs[i] = 1; + } + for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i+=2){ + compiler->valid_regs[i] = 1; + } + /* compiler->valid_regs[ORC_ARM_SB] = 0; */ + compiler->valid_regs[ORC_ARM_IP] = 0; + compiler->valid_regs[ORC_ARM_SP] = 0; + compiler->valid_regs[ORC_ARM_LR] = 0; + compiler->valid_regs[ORC_ARM_PC] = 0; + + for(i=4;i<12;i++) { + compiler->save_regs[ORC_GP_REG_BASE+i] = 1; + } } + + /** Both architectures have 8 callee-saved SIMD registers (v8-v15) */ for(i=8;i<16;i++) { compiler->save_regs[ORC_VEC_REG_BASE+i] = 1; } - + for(i=0;i<ORC_N_REGS;i++){ compiler->alloc_regs[i] = 0; compiler->used_regs[i] = 0; } compiler->exec_reg = ORC_ARM_A1; - compiler->valid_regs[compiler->exec_reg] = 0; compiler->gp_tmpreg = ORC_ARM_A2; + if (compiler->is_64bit) { + compiler->tmpreg = ORC_VEC_REG_BASE + 0; + compiler->tmpreg2 = ORC_VEC_REG_BASE + 1; + } else { + compiler->tmpreg = ORC_VEC_REG_BASE + 0; + compiler->tmpreg2 = ORC_VEC_REG_BASE + 2; + } + compiler->valid_regs[compiler->exec_reg] = 0; compiler->valid_regs[compiler->gp_tmpreg] = 0; - compiler->tmpreg = ORC_VEC_REG_BASE + 0; compiler->valid_regs[compiler->tmpreg] = 0; - compiler->tmpreg2 = ORC_VEC_REG_BASE + 2; compiler->valid_regs[compiler->tmpreg2] = 0; loop_shift = 0; @@ -311,9 +359,15 @@ orc_neon_load_constants_inner (OrcCompiler *compiler) break; case ORC_VAR_TYPE_SRC: case 
ORC_VAR_TYPE_DEST: - orc_arm_emit_load_reg (compiler, - compiler->vars[i].ptr_register, - compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i])); + if (compiler->is_64bit) { + orc_arm64_emit_load_reg (compiler, 32, + compiler->vars[i].ptr_register, + compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i])); + } else { + orc_arm_emit_load_reg (compiler, + compiler->vars[i].ptr_register, + compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i])); + } break; case ORC_VAR_TYPE_ACCUMULATOR: break; @@ -476,194 +530,160 @@ enum { LABEL_L1L2_AFTER, }; +#define ORC_NEON_ALIGNED_DEST_CUTOFF 64 + static void -orc_compiler_neon_assemble (OrcCompiler *compiler) +orc_neon64_loop_shift (OrcCompiler *compiler) { - int align_var; - int align_shift; - int var_size_shift; - int i; - int set_fpscr = FALSE; - - align_var = get_align_var (compiler); - if (compiler->error) return; - - var_size_shift = get_shift (compiler->vars[align_var].size); - align_shift = 4; - - compiler->vars[align_var].is_aligned = FALSE; - - orc_neon_emit_prologue (compiler); - - if (orc_program_has_float (compiler)) { - set_fpscr = TRUE; - ORC_ASM_CODE (compiler," vmrs %s, fpscr\n", orc_arm_reg_name (compiler->gp_tmpreg)); - orc_arm_emit (compiler, 0xeef10a10 | ((compiler->gp_tmpreg&0xf)<<12)); - ORC_ASM_CODE (compiler," push %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); - orc_arm_emit (compiler, 0xe52d0004 | ((compiler->gp_tmpreg&0xf)<<12)); - - orc_arm_emit_load_imm (compiler, compiler->gp_tmpreg, 1<<24); - ORC_ASM_CODE (compiler," vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); - orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12)); - } - - orc_neon_load_constants_outer (compiler); - - if (compiler->program->is_2d) { - if (compiler->program->constant_m > 0) { - orc_arm_emit_load_imm (compiler, ORC_ARM_A3, compiler->program->constant_m); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - 
(int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2])); - } else { - orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1])); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2])); - } - - orc_arm_emit_label (compiler, LABEL_OUTER_LOOP); - } + int align_var = get_align_var (compiler); + int var_size_shift = get_shift (compiler->vars[align_var].size); + int align_shift = 4; -#define ORC_NEON_ALIGNED_DEST_CUTOFF 64 - - if (compiler->loop_shift > 0 && compiler->n_insns < 5) { - orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + if (compiler->n_insns < 5) { + /** Get the number of loops (N) from OrcExecutor */ + orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); - orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF); + + /** if N > ORC_NEON_ALIGNED_DEST_CUTOFF, go to LABEL_REGION0_SKIP */ + orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_R2, ORC_NEON_ALIGNED_DEST_CUTOFF); orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP); - orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3, + /** counter2 = N >> loop shift */ + orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_R1, ORC_ARM64_R2, compiler->loop_shift); - orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg, + orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R1, compiler->exec_reg, (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); - orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, + /** counter3 = N & loop shift */ + orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R2, (1<<compiler->loop_shift)-1); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + /** load function arguments */ 
orc_neon_load_constants_inner (compiler); - orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + + /** if counter2 == zero, go to LABEL_REGION2_SKIP */ + orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg, (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); - orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); + orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0); orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP); + /** vector calculation loop */ compiler->size_region = 0; orc_arm_emit_label (compiler, LABEL_REGION0_LOOP); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1); + + /** vector instructions: @todo port to aarch64 */ orc_neon_emit_loop (compiler, -1); - orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP); - + /** if counter2 != zero, repeat loop */ + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP); + /** else go to LABEL_REGION2_SKIP */ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); - orc_arm_emit_label (compiler, LABEL_REGION0_SKIP); } - if (compiler->loop_shift > 0) { - orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<<align_shift); - - orc_arm_emit_load_reg (compiler, ORC_ARM_A2, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var])); - orc_arm_emit_sub (compiler, ORC_ARM_IP, ORC_ARM_IP, ORC_ARM_A2); - orc_arm_emit_and_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, - (1<<align_shift)-1); - if (var_size_shift > 0) { - orc_arm_emit_asr_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, var_size_shift); - } - - orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); - orc_arm_emit_cmp (compiler, ORC_ARM_A3, ORC_ARM_IP); - orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION); - - orc_arm_emit_store_reg (compiler, ORC_ARM_IP, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); - 
orc_arm_emit_sub (compiler, ORC_ARM_A2, ORC_ARM_A3, ORC_ARM_IP); - - orc_arm_emit_asr_imm (compiler, ORC_ARM_A3, ORC_ARM_A2, - compiler->loop_shift + compiler->unroll_shift); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); - - orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A2, - (1<<(compiler->loop_shift + compiler->unroll_shift))-1); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); - - orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER); - orc_arm_emit_label (compiler, LABEL_ONE_REGION); - - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); - - orc_arm_emit_load_imm (compiler, ORC_ARM_A3, 0); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); - - orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER); + /** IP0 = 1 << align_shift */ + orc_arm64_emit_mov_imm (compiler, 32, ORC_ARM64_IP0, 1<<align_shift); + + /** r1 == ORC_VAR_D1 */ + orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R1, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var])); + /** IP0 = IP0 - r1 */ + orc_arm64_emit_sub (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, ORC_ARM64_R1); + /** IP0 = IP0 & ((1 << aligned_shift) -1) */ + orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, + (1<<align_shift)-1); + if (var_size_shift > 0) { + /** IP0 = IP0 >> var_size_shift */ + orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, var_size_shift); } - orc_neon_load_constants_inner (compiler); - - if (compiler->loop_shift > 0) { - int save_loop_shift = compiler->loop_shift; - compiler->loop_shift = 0; - - orc_arm_emit_load_reg (compiler, ORC_ARM_IP, 
compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); - - orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); - orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP); - - orc_arm_emit_label (compiler, LABEL_REGION1_LOOP); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); - orc_neon_emit_loop (compiler, -1); - orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP); - orc_arm_emit_label (compiler, LABEL_REGION1_SKIP); - - compiler->loop_shift = save_loop_shift; - compiler->vars[align_var].is_aligned = TRUE; - } + /** r2 = N */ + orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); + /** N <= IP0, go to LABEL_ONE_REGION */ + orc_arm64_emit_cmp (compiler, 32, ORC_ARM64_R2, ORC_ARM64_IP0); + orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION); + + /** counter1 = IP0 */ + orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); + /** r1 = r2 - IP0 */ + orc_arm64_emit_sub (compiler, 32, ORC_ARM64_R1, ORC_ARM64_R2, ORC_ARM64_IP0); + + /** r2 = r1 >> (loop_shift + unroll_shift) */ + orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R1, + compiler->loop_shift + compiler->unroll_shift); + /** counter2 = r2 */ + orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + + /** r2 = r1 & ((1<<(loop_shift + unroll_shift))-1) */ + orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R1, + (1<<(compiler->loop_shift + compiler->unroll_shift))-1); + /** counter3 = r2 */ + orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + + /** go to LABEL_ONE_REGION_AFTER */ + orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER); + orc_arm_emit_label (compiler, LABEL_ONE_REGION); + + /** counter1 = r2 */ + 
orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); + /** counter2 = counter3 = 0 */ + orc_arm64_emit_mov_uimm (compiler, 32, ORC_ARM64_R2, 0); + orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + + orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER); +} - if (compiler->loop_shift > 0) { - orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); - } else { - orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); - } +static void +orc_neon64_loop_caches (OrcCompiler *compiler) +{ + int align_var = get_align_var (compiler); + int var_size_shift = get_shift (compiler->vars[align_var].size); + int i; - orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); + /** if IP0 == 0, go to LABEL_REGION2_SKIP */ + orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0); orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP); - orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP, + /** r1 = IP0 >> (17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift) */ + orc_arm64_emit_asr_imm (compiler, 32, compiler->gp_tmpreg, ORC_ARM64_IP0, 17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift); - orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0); + + /** if r1 == 0, go to LABEL_REGION2_MEDIUM */ + orc_arm64_emit_cmp_imm (compiler, 32, compiler->gp_tmpreg, 0); orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_MEDIUM); - /* N is larger than L2 cache size */ + /** N is larger than L2 cache size */ compiler->size_region = 3; orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + 
orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1); for(i=0;i<(1<<compiler->unroll_shift);i++){ orc_neon_emit_loop (compiler, i); } orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE); + /** DONE, let's finish */ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); orc_arm_emit_label (compiler, LABEL_REGION2_MEDIUM); - orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP, + orc_arm64_emit_asr_imm (compiler, 32, compiler->gp_tmpreg, ORC_ARM64_IP0, 13 + var_size_shift - compiler->loop_shift - compiler->unroll_shift); - orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0); + orc_arm64_emit_cmp_imm (compiler, 32, compiler->gp_tmpreg, 0); orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SMALL); /* N is smaller than L2 cache size */ compiler->size_region = 2; orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1); for(i=0;i<(1<<compiler->unroll_shift);i++){ orc_neon_emit_loop (compiler, i); } @@ -671,48 +691,299 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); orc_arm_emit_label (compiler, LABEL_REGION2_SMALL); - /* N is smaller than L2 cache size */ + /* N is smaller than L1 cache size */ compiler->size_region = 1; orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1); for(i=0;i<(1<<compiler->unroll_shift);i++){ orc_neon_emit_loop (compiler, i); } orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL); orc_arm_emit_label (compiler, LABEL_REGION2_SKIP); +} + +#define orc_neon64_loop_shift_remainder(compiler,counter,label_loop,label_skip) \ +{ \ + int save_loop_shift = compiler->loop_shift; \ + 
compiler->loop_shift = 0; \ + orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg, \ + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter)); \ + orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0); \ + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, label_skip); \ + orc_arm_emit_label (compiler, label_loop); \ + orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1); \ + orc_neon_emit_loop (compiler, -1); \ + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, label_loop); \ + orc_arm_emit_label (compiler, label_skip); \ + compiler->loop_shift = save_loop_shift; \ +} - if (compiler->loop_shift > 0) { - int save_loop_shift = compiler->loop_shift; +static void +orc_compiler_neon_assemble (OrcCompiler *compiler) +{ + int align_var; + int align_shift; + int var_size_shift; + int i; + int set_fpscr = FALSE; - compiler->loop_shift = 0; + align_var = get_align_var (compiler); + if (compiler->error) return; - compiler->vars[align_var].is_aligned = FALSE; + var_size_shift = get_shift (compiler->vars[align_var].size); + align_shift = 4; - orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + compiler->vars[align_var].is_aligned = FALSE; + + orc_neon_emit_prologue (compiler); + + if (!compiler->is_64bit && orc_program_has_float (compiler)) { + set_fpscr = TRUE; + ORC_ASM_CODE (compiler," vmrs %s, fpscr\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xeef10a10 | ((compiler->gp_tmpreg&0xf)<<12)); + ORC_ASM_CODE (compiler," push %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xe52d0004 | ((compiler->gp_tmpreg&0xf)<<12)); + + orc_arm_emit_load_imm (compiler, compiler->gp_tmpreg, 1<<24); + ORC_ASM_CODE (compiler," vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg)); + orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12)); + } + + orc_neon_load_constants_outer (compiler); + + if (compiler->is_64bit) { + /** @todo not 
supported yet */ + if (compiler->program->is_2d) return; + + if (compiler->loop_shift > 0) { + orc_neon64_loop_shift (compiler); + + orc_neon_load_constants_inner (compiler); + + orc_neon64_loop_shift_remainder (compiler, counter1, + LABEL_REGION1_LOOP, LABEL_REGION1_SKIP); + compiler->vars[align_var].is_aligned = TRUE; + + orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + + orc_neon64_loop_caches (compiler); + + compiler->vars[align_var].is_aligned = FALSE; + orc_neon64_loop_shift_remainder (compiler, counter3, + LABEL_REGION3_LOOP, LABEL_REGION3_SKIP); + } else { + orc_neon_load_constants_inner (compiler); + + orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); + + orc_neon64_loop_caches (compiler); + } + } else { + if (compiler->program->is_2d) { + if (compiler->program->constant_m > 0) { + orc_arm_emit_load_imm (compiler, ORC_ARM_A3, compiler->program->constant_m); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2])); + } else { + orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1])); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2])); + } + + orc_arm_emit_label (compiler, LABEL_OUTER_LOOP); + } + + if (compiler->loop_shift > 0 && compiler->n_insns < 5) { + orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); + + orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF); + orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP); + + orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3, + compiler->loop_shift); + orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + + 
orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, + (1<<compiler->loop_shift)-1); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + + orc_neon_load_constants_inner (compiler); + orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP); + + compiler->size_region = 0; + orc_arm_emit_label (compiler, LABEL_REGION0_LOOP); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + + orc_neon_emit_loop (compiler, -1); + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP); + orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); + orc_arm_emit_label (compiler, LABEL_REGION0_SKIP); + } + + if (compiler->loop_shift > 0) { + orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<<align_shift); + + orc_arm_emit_load_reg (compiler, ORC_ARM_A2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var])); + orc_arm_emit_sub (compiler, ORC_ARM_IP, ORC_ARM_IP, ORC_ARM_A2); + orc_arm_emit_and_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, + (1<<align_shift)-1); + if (var_size_shift > 0) { + orc_arm_emit_asr_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, var_size_shift); + } + + orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); + orc_arm_emit_cmp (compiler, ORC_ARM_A3, ORC_ARM_IP); + orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION); + + orc_arm_emit_store_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); + orc_arm_emit_sub (compiler, ORC_ARM_A2, ORC_ARM_A3, ORC_ARM_IP); + + orc_arm_emit_asr_imm (compiler, ORC_ARM_A3, ORC_ARM_A2, + compiler->loop_shift + compiler->unroll_shift); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + 
(int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + + orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A2, + (1<<(compiler->loop_shift + compiler->unroll_shift))-1); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + + orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER); + orc_arm_emit_label (compiler, LABEL_ONE_REGION); + + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); + + orc_arm_emit_load_imm (compiler, ORC_ARM_A3, 0); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + + orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER); + } + + orc_neon_load_constants_inner (compiler); + + if (compiler->loop_shift > 0) { + int save_loop_shift = compiler->loop_shift; + compiler->loop_shift = 0; + + orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1)); + + orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP); + + orc_arm_emit_label (compiler, LABEL_REGION1_LOOP); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + orc_neon_emit_loop (compiler, -1); + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP); + orc_arm_emit_label (compiler, LABEL_REGION1_SKIP); + + compiler->loop_shift = save_loop_shift; + compiler->vars[align_var].is_aligned = TRUE; + } + + if (compiler->loop_shift > 0) { + orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + } else { + orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); + } orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); - 
orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP); + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP); + + orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP, + 17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift); + orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0); + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_MEDIUM); - orc_arm_emit_label (compiler, LABEL_REGION3_LOOP); + /* N is larger than L2 cache size */ + compiler->size_region = 3; + orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE); orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); - orc_neon_emit_loop (compiler, -1); - orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP); - orc_arm_emit_label (compiler, LABEL_REGION3_SKIP); + for(i=0;i<(1<<compiler->unroll_shift);i++){ + orc_neon_emit_loop (compiler, i); + } + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE); + orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); - compiler->loop_shift = save_loop_shift; - } + orc_arm_emit_label (compiler, LABEL_REGION2_MEDIUM); + orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP, + 13 + var_size_shift - compiler->loop_shift - compiler->unroll_shift); + orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0); + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SMALL); + + /* N is smaller than L2 cache size */ + compiler->size_region = 2; + orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + for(i=0;i<(1<<compiler->unroll_shift);i++){ + orc_neon_emit_loop (compiler, i); + } + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_MEDIUM); + orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); + + orc_arm_emit_label (compiler, LABEL_REGION2_SMALL); + /* N is smaller than L2 cache size */ + compiler->size_region = 1; + 
orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + for(i=0;i<(1<<compiler->unroll_shift);i++){ + orc_neon_emit_loop (compiler, i); + } + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL); + + orc_arm_emit_label (compiler, LABEL_REGION2_SKIP); - if (compiler->program->is_2d) { - neon_add_strides (compiler); + if (compiler->loop_shift > 0) { + int save_loop_shift = compiler->loop_shift; - orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2])); - orc_arm_emit_sub_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, 1, TRUE); - orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, - (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2])); - orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_OUTER_LOOP); + compiler->loop_shift = 0; + + compiler->vars[align_var].is_aligned = FALSE; + + orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + + orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP); + + orc_arm_emit_label (compiler, LABEL_REGION3_LOOP); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + orc_neon_emit_loop (compiler, -1); + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP); + orc_arm_emit_label (compiler, LABEL_REGION3_SKIP); + + compiler->loop_shift = save_loop_shift; + } + + if (compiler->program->is_2d) { + neon_add_strides (compiler); + + orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2])); + orc_arm_emit_sub_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, 1, TRUE); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2])); + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_OUTER_LOOP); + } } 
orc_neon_save_accumulators (compiler); @@ -729,11 +1000,13 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_arm_emit_align (compiler, 4); - orc_arm_emit_label (compiler, 20); - orc_arm_emit_data (compiler, 0x07060706); - orc_arm_emit_data (compiler, 0x07060706); - orc_arm_emit_data (compiler, 0x0f0e0f0e); - orc_arm_emit_data (compiler, 0x0f0e0f0e); + if (!compiler->is_64bit) { + orc_arm_emit_label (compiler, 20); + orc_arm_emit_data (compiler, 0x07060706); + orc_arm_emit_data (compiler, 0x07060706); + orc_arm_emit_data (compiler, 0x0f0e0f0e); + orc_arm_emit_data (compiler, 0x0f0e0f0e); + } orc_arm_do_fixups (compiler); } @@ -830,10 +1103,16 @@ orc_neon_emit_loop (OrcCompiler *compiler, int unroll_index) if (compiler->vars[k].vartype == ORC_VAR_TYPE_SRC || compiler->vars[k].vartype == ORC_VAR_TYPE_DEST) { if (compiler->vars[k].ptr_register) { - orc_arm_emit_add_imm (compiler, - compiler->vars[k].ptr_register, - compiler->vars[k].ptr_register, - compiler->vars[k].size << compiler->loop_shift); + if (compiler->is_64bit) + orc_arm64_emit_add_imm (compiler, 32, + compiler->vars[k].ptr_register, + compiler->vars[k].ptr_register, + compiler->vars[k].size << compiler->loop_shift); + else + orc_arm_emit_add_imm (compiler, + compiler->vars[k].ptr_register, + compiler->vars[k].ptr_register, + compiler->vars[k].size << compiler->loop_shift); } else { /* arm_emit_add_imm_memoffset (compiler, arm_ptr_size, */ /* compiler->vars[k].size << compiler->loop_shift, */ |