Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/GStreamer/orc.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongju Chae <dongju.chae@samsung.com>2019-10-21 14:21:47 +0300
committerMarek Vasut <marex@denx.de>2020-09-16 15:27:27 +0300
commit6e6cd76110788c9271d7384350c8a848905c396a (patch)
tree9b3860f871f03689ff2b3fe1873cf83529cf4e3c
parent1f36d3a371ecf2dc1166e0434a9b83b074806a2a (diff)
aarch64: orcprogram-neon porting to aarch64
This PR ports orcprogram-neon.c for AArch64 support. It makes the orc compiler generate AArch64 assembly code. Currently, example1 successfully runs on AArch64 (tested on an Android device). Signed-off-by: Dongju Chae <dongju.chae@samsung.com>
-rw-r--r--orc/orcprogram-neon.c665
1 files changed, 472 insertions, 193 deletions
diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c
index cf87249..6793622 100644
--- a/orc/orcprogram-neon.c
+++ b/orc/orcprogram-neon.c
@@ -34,12 +34,15 @@ orc_neon_emit_prologue (OrcCompiler *compiler)
{
unsigned int regs = 0;
orc_uint32 vregs = 0;
+ int num_gregs;
int i;
orc_compiler_append_code(compiler,".global %s\n", compiler->program->name);
orc_compiler_append_code(compiler,"%s:\n", compiler->program->name);
- for(i=0;i<16;i++){
+ num_gregs = compiler->is_64bit ? 32 : 16;
+
+ for(i=0;i<num_gregs;i++){
if (compiler->used_regs[ORC_GP_REG_BASE + i] &&
compiler->save_regs[ORC_GP_REG_BASE + i]) {
regs |= (1<<i);
@@ -82,10 +85,13 @@ static void
orc_neon_emit_epilogue (OrcCompiler *compiler)
{
int i;
+ int num_gregs;
unsigned int regs = 0;
orc_uint32 vregs = 0;
- for(i=0;i<16;i++){
+ num_gregs = compiler->is_64bit ? 32 : 16;
+
+ for(i=0;i<num_gregs;i++){
if (compiler->used_regs[ORC_GP_REG_BASE + i] &&
compiler->save_regs[ORC_GP_REG_BASE + i]) {
regs |= (1<<i);
@@ -157,36 +163,78 @@ orc_compiler_neon_init (OrcCompiler *compiler)
int i;
int loop_shift;
- for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){
- compiler->valid_regs[i] = 1;
+ if (compiler->target_flags & ORC_TARGET_NEON_64BIT) {
+ compiler->is_64bit = TRUE;
}
- for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i+=2){
- compiler->valid_regs[i] = 1;
- }
- /* compiler->valid_regs[ORC_ARM_SB] = 0; */
- compiler->valid_regs[ORC_ARM_IP] = 0;
- compiler->valid_regs[ORC_ARM_SP] = 0;
- compiler->valid_regs[ORC_ARM_LR] = 0;
- compiler->valid_regs[ORC_ARM_PC] = 0;
- for(i=4;i<12;i++) {
- compiler->save_regs[ORC_GP_REG_BASE+i] = 1;
+
+ if (compiler->is_64bit) {
+ /** AArch64
+ * 31 64-bit general-purpose registers (R0-R30) and SP
+ * 32 128-bit vector registers (do not overlap multiple registers in a narrower view)
+ * Note that PC is not a general-purpose register in AArch64
+ */
+ for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+32;i++){
+ compiler->valid_regs[i] = 1;
+ }
+ for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i++){
+ compiler->valid_regs[i] = 1;
+ }
+
+ compiler->valid_regs[ORC_ARM64_IP0] = 0;
+ compiler->valid_regs[ORC_ARM64_IP1] = 0;
+
+ compiler->valid_regs[ORC_ARM64_FP] = 0;
+ compiler->valid_regs[ORC_ARM64_LR] = 0;
+ compiler->valid_regs[ORC_ARM64_SP] = 0;
+
+ /** r19 to r28 are callee-saved */
+ for(i=19;i<29;i++) {
+ compiler->save_regs[ORC_GP_REG_BASE+i] = 1;
+ }
+ } else {
+ /** AArch32
+ * 16 32-bit general-purpose registers (R0-R15)
+ * 32 64-bit vector registers (smaller registers are packed into larger ones)
+ */
+ for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){
+ compiler->valid_regs[i] = 1;
+ }
+ for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i+=2){
+ compiler->valid_regs[i] = 1;
+ }
+ /* compiler->valid_regs[ORC_ARM_SB] = 0; */
+ compiler->valid_regs[ORC_ARM_IP] = 0;
+ compiler->valid_regs[ORC_ARM_SP] = 0;
+ compiler->valid_regs[ORC_ARM_LR] = 0;
+ compiler->valid_regs[ORC_ARM_PC] = 0;
+
+ for(i=4;i<12;i++) {
+ compiler->save_regs[ORC_GP_REG_BASE+i] = 1;
+ }
}
+
+ /** Both architectures have 8 callee-saved SIMD registers (v8-v15) */
for(i=8;i<16;i++) {
compiler->save_regs[ORC_VEC_REG_BASE+i] = 1;
}
-
+
for(i=0;i<ORC_N_REGS;i++){
compiler->alloc_regs[i] = 0;
compiler->used_regs[i] = 0;
}
compiler->exec_reg = ORC_ARM_A1;
- compiler->valid_regs[compiler->exec_reg] = 0;
compiler->gp_tmpreg = ORC_ARM_A2;
+ if (compiler->is_64bit) {
+ compiler->tmpreg = ORC_VEC_REG_BASE + 0;
+ compiler->tmpreg2 = ORC_VEC_REG_BASE + 1;
+ } else {
+ compiler->tmpreg = ORC_VEC_REG_BASE + 0;
+ compiler->tmpreg2 = ORC_VEC_REG_BASE + 2;
+ }
+ compiler->valid_regs[compiler->exec_reg] = 0;
compiler->valid_regs[compiler->gp_tmpreg] = 0;
- compiler->tmpreg = ORC_VEC_REG_BASE + 0;
compiler->valid_regs[compiler->tmpreg] = 0;
- compiler->tmpreg2 = ORC_VEC_REG_BASE + 2;
compiler->valid_regs[compiler->tmpreg2] = 0;
loop_shift = 0;
@@ -311,9 +359,15 @@ orc_neon_load_constants_inner (OrcCompiler *compiler)
break;
case ORC_VAR_TYPE_SRC:
case ORC_VAR_TYPE_DEST:
- orc_arm_emit_load_reg (compiler,
- compiler->vars[i].ptr_register,
- compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]));
+ if (compiler->is_64bit) {
+ orc_arm64_emit_load_reg (compiler, 32,
+ compiler->vars[i].ptr_register,
+ compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]));
+ } else {
+ orc_arm_emit_load_reg (compiler,
+ compiler->vars[i].ptr_register,
+ compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]));
+ }
break;
case ORC_VAR_TYPE_ACCUMULATOR:
break;
@@ -476,194 +530,160 @@ enum {
LABEL_L1L2_AFTER,
};
+#define ORC_NEON_ALIGNED_DEST_CUTOFF 64
+
static void
-orc_compiler_neon_assemble (OrcCompiler *compiler)
+orc_neon64_loop_shift (OrcCompiler *compiler)
{
- int align_var;
- int align_shift;
- int var_size_shift;
- int i;
- int set_fpscr = FALSE;
-
- align_var = get_align_var (compiler);
- if (compiler->error) return;
-
- var_size_shift = get_shift (compiler->vars[align_var].size);
- align_shift = 4;
-
- compiler->vars[align_var].is_aligned = FALSE;
-
- orc_neon_emit_prologue (compiler);
-
- if (orc_program_has_float (compiler)) {
- set_fpscr = TRUE;
- ORC_ASM_CODE (compiler," vmrs %s, fpscr\n", orc_arm_reg_name (compiler->gp_tmpreg));
- orc_arm_emit (compiler, 0xeef10a10 | ((compiler->gp_tmpreg&0xf)<<12));
- ORC_ASM_CODE (compiler," push %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
- orc_arm_emit (compiler, 0xe52d0004 | ((compiler->gp_tmpreg&0xf)<<12));
-
- orc_arm_emit_load_imm (compiler, compiler->gp_tmpreg, 1<<24);
- ORC_ASM_CODE (compiler," vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
- orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12));
- }
-
- orc_neon_load_constants_outer (compiler);
-
- if (compiler->program->is_2d) {
- if (compiler->program->constant_m > 0) {
- orc_arm_emit_load_imm (compiler, ORC_ARM_A3, compiler->program->constant_m);
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
- } else {
- orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]));
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
- }
-
- orc_arm_emit_label (compiler, LABEL_OUTER_LOOP);
- }
+ int align_var = get_align_var (compiler);
+ int var_size_shift = get_shift (compiler->vars[align_var].size);
+ int align_shift = 4;
-#define ORC_NEON_ALIGNED_DEST_CUTOFF 64
-
- if (compiler->loop_shift > 0 && compiler->n_insns < 5) {
- orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ if (compiler->n_insns < 5) {
+ /** Get the number of loops (N) from OrcExecutor */
+ orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
(int)ORC_STRUCT_OFFSET(OrcExecutor,n));
- orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF);
+
+ /** if N > ORC_NEON_ALIGNED_DEST_CUTOFF, go to LABEL_REGION0_SKIP */
+ orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_R2, ORC_NEON_ALIGNED_DEST_CUTOFF);
orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP);
- orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3,
+ /** counter2 = N >> loop shift */
+ orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_R1, ORC_ARM64_R2,
compiler->loop_shift);
- orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R1, compiler->exec_reg,
(int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
- orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3,
+ /** counter3 = N & ((1 << loop_shift) - 1) */
+ orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R2,
(1<<compiler->loop_shift)-1);
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
(int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+ /** load function arguments */
orc_neon_load_constants_inner (compiler);
- orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+
+ /** if counter2 == zero, go to LABEL_REGION2_SKIP */
+ orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
(int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
- orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+ orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0);
orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
+ /** vector calculation loop */
compiler->size_region = 0;
orc_arm_emit_label (compiler, LABEL_REGION0_LOOP);
- orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
+
+ /** vector instructions: @todo port to aarch64 */
orc_neon_emit_loop (compiler, -1);
- orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
-
+ /** if counter2 != zero, repeat loop */
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
+ /** else go to LABEL_REGION2_SKIP */
orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
-
orc_arm_emit_label (compiler, LABEL_REGION0_SKIP);
}
- if (compiler->loop_shift > 0) {
- orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<<align_shift);
-
- orc_arm_emit_load_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var]));
- orc_arm_emit_sub (compiler, ORC_ARM_IP, ORC_ARM_IP, ORC_ARM_A2);
- orc_arm_emit_and_imm (compiler, ORC_ARM_IP, ORC_ARM_IP,
- (1<<align_shift)-1);
- if (var_size_shift > 0) {
- orc_arm_emit_asr_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, var_size_shift);
- }
-
- orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
- orc_arm_emit_cmp (compiler, ORC_ARM_A3, ORC_ARM_IP);
- orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION);
-
- orc_arm_emit_store_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
- orc_arm_emit_sub (compiler, ORC_ARM_A2, ORC_ARM_A3, ORC_ARM_IP);
-
- orc_arm_emit_asr_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
- compiler->loop_shift + compiler->unroll_shift);
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
-
- orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
- (1<<(compiler->loop_shift + compiler->unroll_shift))-1);
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
-
- orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER);
- orc_arm_emit_label (compiler, LABEL_ONE_REGION);
-
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
-
- orc_arm_emit_load_imm (compiler, ORC_ARM_A3, 0);
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
-
- orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER);
+ /** IP0 = 1 << align_shift */
+ orc_arm64_emit_mov_imm (compiler, 32, ORC_ARM64_IP0, 1<<align_shift);
+
+ /** r1 = arrays[align_var] */
+ orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R1, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var]));
+ /** IP0 = IP0 - r1 */
+ orc_arm64_emit_sub (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, ORC_ARM64_R1);
+ /** IP0 = IP0 & ((1 << align_shift) - 1) */
+ orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0,
+ (1<<align_shift)-1);
+ if (var_size_shift > 0) {
+ /** IP0 = IP0 >> var_size_shift */
+ orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, var_size_shift);
}
- orc_neon_load_constants_inner (compiler);
-
- if (compiler->loop_shift > 0) {
- int save_loop_shift = compiler->loop_shift;
- compiler->loop_shift = 0;
-
- orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
-
- orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
- orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP);
-
- orc_arm_emit_label (compiler, LABEL_REGION1_LOOP);
- orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
- orc_neon_emit_loop (compiler, -1);
- orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP);
- orc_arm_emit_label (compiler, LABEL_REGION1_SKIP);
-
- compiler->loop_shift = save_loop_shift;
- compiler->vars[align_var].is_aligned = TRUE;
- }
+ /** r2 = N */
+ orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+ /** N <= IP0, go to LABEL_ONE_REGION */
+ orc_arm64_emit_cmp (compiler, 32, ORC_ARM64_R2, ORC_ARM64_IP0);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION);
+
+ /** counter1 = IP0 */
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+ /** r1 = r2 - IP0 */
+ orc_arm64_emit_sub (compiler, 32, ORC_ARM64_R1, ORC_ARM64_R2, ORC_ARM64_IP0);
+
+ /** r2 = r1 >> (loop_shift + unroll_shift) */
+ orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R1,
+ compiler->loop_shift + compiler->unroll_shift);
+ /** counter2 = r2 */
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+ /** r2 = r1 & ((1<<(loop_shift + unroll_shift))-1) */
+ orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R1,
+ (1<<(compiler->loop_shift + compiler->unroll_shift))-1);
+ /** counter3 = r2 */
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+ /** go to LABEL_ONE_REGION_AFTER */
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER);
+ orc_arm_emit_label (compiler, LABEL_ONE_REGION);
+
+ /** counter1 = r2 */
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+ /** counter2 = counter3 = 0 */
+ orc_arm64_emit_mov_uimm (compiler, 32, ORC_ARM64_R2, 0);
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+ orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+ orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER);
+}
- if (compiler->loop_shift > 0) {
- orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
- } else {
- orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
- }
+static void
+orc_neon64_loop_caches (OrcCompiler *compiler)
+{
+ int align_var = get_align_var (compiler);
+ int var_size_shift = get_shift (compiler->vars[align_var].size);
+ int i;
- orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+ /** if IP0 == 0, go to LABEL_REGION2_SKIP */
+ orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0);
orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
- orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+ /** r1 = IP0 >> (17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift) */
+ orc_arm64_emit_asr_imm (compiler, 32, compiler->gp_tmpreg, ORC_ARM64_IP0,
17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
- orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+
+ /** if r1 == 0, go to LABEL_REGION2_MEDIUM */
+ orc_arm64_emit_cmp_imm (compiler, 32, compiler->gp_tmpreg, 0);
orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_MEDIUM);
- /* N is larger than L2 cache size */
+ /** N is larger than L2 cache size */
compiler->size_region = 3;
orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE);
- orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
for(i=0;i<(1<<compiler->unroll_shift);i++){
orc_neon_emit_loop (compiler, i);
}
orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE);
+ /** DONE, let's finish */
orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
orc_arm_emit_label (compiler, LABEL_REGION2_MEDIUM);
- orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+ orc_arm64_emit_asr_imm (compiler, 32, compiler->gp_tmpreg, ORC_ARM64_IP0,
13 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
- orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+ orc_arm64_emit_cmp_imm (compiler, 32, compiler->gp_tmpreg, 0);
orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SMALL);
/* N is smaller than L2 cache size */
compiler->size_region = 2;
orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM);
- orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
for(i=0;i<(1<<compiler->unroll_shift);i++){
orc_neon_emit_loop (compiler, i);
}
@@ -671,48 +691,299 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
orc_arm_emit_label (compiler, LABEL_REGION2_SMALL);
- /* N is smaller than L2 cache size */
+ /* N is smaller than L1 cache size */
compiler->size_region = 1;
orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL);
- orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
for(i=0;i<(1<<compiler->unroll_shift);i++){
orc_neon_emit_loop (compiler, i);
}
orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL);
orc_arm_emit_label (compiler, LABEL_REGION2_SKIP);
+}
+
+#define orc_neon64_loop_shift_remainder(compiler,counter,label_loop,label_skip) \
+{ \
+ int save_loop_shift = compiler->loop_shift; \
+ compiler->loop_shift = 0; \
+ orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg, \
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter)); \
+ orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0); \
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, label_skip); \
+ orc_arm_emit_label (compiler, label_loop); \
+ orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1); \
+ orc_neon_emit_loop (compiler, -1); \
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, label_loop); \
+ orc_arm_emit_label (compiler, label_skip); \
+ compiler->loop_shift = save_loop_shift; \
+}
- if (compiler->loop_shift > 0) {
- int save_loop_shift = compiler->loop_shift;
+static void
+orc_compiler_neon_assemble (OrcCompiler *compiler)
+{
+ int align_var;
+ int align_shift;
+ int var_size_shift;
+ int i;
+ int set_fpscr = FALSE;
- compiler->loop_shift = 0;
+ align_var = get_align_var (compiler);
+ if (compiler->error) return;
- compiler->vars[align_var].is_aligned = FALSE;
+ var_size_shift = get_shift (compiler->vars[align_var].size);
+ align_shift = 4;
- orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+ compiler->vars[align_var].is_aligned = FALSE;
+
+ orc_neon_emit_prologue (compiler);
+
+ if (!compiler->is_64bit && orc_program_has_float (compiler)) {
+ set_fpscr = TRUE;
+ ORC_ASM_CODE (compiler," vmrs %s, fpscr\n", orc_arm_reg_name (compiler->gp_tmpreg));
+ orc_arm_emit (compiler, 0xeef10a10 | ((compiler->gp_tmpreg&0xf)<<12));
+ ORC_ASM_CODE (compiler," push %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
+ orc_arm_emit (compiler, 0xe52d0004 | ((compiler->gp_tmpreg&0xf)<<12));
+
+ orc_arm_emit_load_imm (compiler, compiler->gp_tmpreg, 1<<24);
+ ORC_ASM_CODE (compiler," vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
+ orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12));
+ }
+
+ orc_neon_load_constants_outer (compiler);
+
+ if (compiler->is_64bit) {
+ /** @todo not supported yet */
+ if (compiler->program->is_2d) return;
+
+ if (compiler->loop_shift > 0) {
+ orc_neon64_loop_shift (compiler);
+
+ orc_neon_load_constants_inner (compiler);
+
+ orc_neon64_loop_shift_remainder (compiler, counter1,
+ LABEL_REGION1_LOOP, LABEL_REGION1_SKIP);
+ compiler->vars[align_var].is_aligned = TRUE;
+
+ orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+ orc_neon64_loop_caches (compiler);
+
+ compiler->vars[align_var].is_aligned = FALSE;
+ orc_neon64_loop_shift_remainder (compiler, counter3,
+ LABEL_REGION3_LOOP, LABEL_REGION3_SKIP);
+ } else {
+ orc_neon_load_constants_inner (compiler);
+
+ orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+
+ orc_neon64_loop_caches (compiler);
+ }
+ } else {
+ if (compiler->program->is_2d) {
+ if (compiler->program->constant_m > 0) {
+ orc_arm_emit_load_imm (compiler, ORC_ARM_A3, compiler->program->constant_m);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
+ } else {
+ orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]));
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
+ }
+
+ orc_arm_emit_label (compiler, LABEL_OUTER_LOOP);
+ }
+
+ if (compiler->loop_shift > 0 && compiler->n_insns < 5) {
+ orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+
+ orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP);
+
+ orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3,
+ compiler->loop_shift);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+ orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3,
+ (1<<compiler->loop_shift)-1);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+ orc_neon_load_constants_inner (compiler);
+ orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+ orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
+
+ compiler->size_region = 0;
+ orc_arm_emit_label (compiler, LABEL_REGION0_LOOP);
+ orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+
+ orc_neon_emit_loop (compiler, -1);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
+ orc_arm_emit_label (compiler, LABEL_REGION0_SKIP);
+ }
+
+ if (compiler->loop_shift > 0) {
+ orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<<align_shift);
+
+ orc_arm_emit_load_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var]));
+ orc_arm_emit_sub (compiler, ORC_ARM_IP, ORC_ARM_IP, ORC_ARM_A2);
+ orc_arm_emit_and_imm (compiler, ORC_ARM_IP, ORC_ARM_IP,
+ (1<<align_shift)-1);
+ if (var_size_shift > 0) {
+ orc_arm_emit_asr_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, var_size_shift);
+ }
+
+ orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+ orc_arm_emit_cmp (compiler, ORC_ARM_A3, ORC_ARM_IP);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION);
+
+ orc_arm_emit_store_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+ orc_arm_emit_sub (compiler, ORC_ARM_A2, ORC_ARM_A3, ORC_ARM_IP);
+
+ orc_arm_emit_asr_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
+ compiler->loop_shift + compiler->unroll_shift);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+ orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
+ (1<<(compiler->loop_shift + compiler->unroll_shift))-1);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER);
+ orc_arm_emit_label (compiler, LABEL_ONE_REGION);
+
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+
+ orc_arm_emit_load_imm (compiler, ORC_ARM_A3, 0);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+ orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER);
+ }
+
+ orc_neon_load_constants_inner (compiler);
+
+ if (compiler->loop_shift > 0) {
+ int save_loop_shift = compiler->loop_shift;
+ compiler->loop_shift = 0;
+
+ orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+
+ orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP);
+
+ orc_arm_emit_label (compiler, LABEL_REGION1_LOOP);
+ orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_neon_emit_loop (compiler, -1);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP);
+ orc_arm_emit_label (compiler, LABEL_REGION1_SKIP);
+
+ compiler->loop_shift = save_loop_shift;
+ compiler->vars[align_var].is_aligned = TRUE;
+ }
+
+ if (compiler->loop_shift > 0) {
+ orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+ } else {
+ orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+ }
orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
- orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
+
+ orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+ 17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
+ orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_MEDIUM);
- orc_arm_emit_label (compiler, LABEL_REGION3_LOOP);
+ /* N is larger than L2 cache size */
+ compiler->size_region = 3;
+ orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE);
orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
- orc_neon_emit_loop (compiler, -1);
- orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP);
- orc_arm_emit_label (compiler, LABEL_REGION3_SKIP);
+ for(i=0;i<(1<<compiler->unroll_shift);i++){
+ orc_neon_emit_loop (compiler, i);
+ }
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
- compiler->loop_shift = save_loop_shift;
- }
+ orc_arm_emit_label (compiler, LABEL_REGION2_MEDIUM);
+ orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+ 13 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
+ orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SMALL);
+
+ /* N is smaller than L2 cache size */
+ compiler->size_region = 2;
+ orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM);
+ orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ for(i=0;i<(1<<compiler->unroll_shift);i++){
+ orc_neon_emit_loop (compiler, i);
+ }
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_MEDIUM);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
+
+ orc_arm_emit_label (compiler, LABEL_REGION2_SMALL);
+ /* N is smaller than L1 cache size */
+ compiler->size_region = 1;
+ orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL);
+ orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ for(i=0;i<(1<<compiler->unroll_shift);i++){
+ orc_neon_emit_loop (compiler, i);
+ }
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL);
+
+ orc_arm_emit_label (compiler, LABEL_REGION2_SKIP);
- if (compiler->program->is_2d) {
- neon_add_strides (compiler);
+ if (compiler->loop_shift > 0) {
+ int save_loop_shift = compiler->loop_shift;
- orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]));
- orc_arm_emit_sub_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, 1, TRUE);
- orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
- (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
- orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_OUTER_LOOP);
+ compiler->loop_shift = 0;
+
+ compiler->vars[align_var].is_aligned = FALSE;
+
+ orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+ orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP);
+
+ orc_arm_emit_label (compiler, LABEL_REGION3_LOOP);
+ orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_neon_emit_loop (compiler, -1);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP);
+ orc_arm_emit_label (compiler, LABEL_REGION3_SKIP);
+
+ compiler->loop_shift = save_loop_shift;
+ }
+
+ if (compiler->program->is_2d) {
+ neon_add_strides (compiler);
+
+ orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]));
+ orc_arm_emit_sub_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, 1, TRUE);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_OUTER_LOOP);
+ }
}
orc_neon_save_accumulators (compiler);
@@ -729,11 +1000,13 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
orc_arm_emit_align (compiler, 4);
- orc_arm_emit_label (compiler, 20);
- orc_arm_emit_data (compiler, 0x07060706);
- orc_arm_emit_data (compiler, 0x07060706);
- orc_arm_emit_data (compiler, 0x0f0e0f0e);
- orc_arm_emit_data (compiler, 0x0f0e0f0e);
+ if (!compiler->is_64bit) {
+ orc_arm_emit_label (compiler, 20);
+ orc_arm_emit_data (compiler, 0x07060706);
+ orc_arm_emit_data (compiler, 0x07060706);
+ orc_arm_emit_data (compiler, 0x0f0e0f0e);
+ orc_arm_emit_data (compiler, 0x0f0e0f0e);
+ }
orc_arm_do_fixups (compiler);
}
@@ -830,10 +1103,16 @@ orc_neon_emit_loop (OrcCompiler *compiler, int unroll_index)
if (compiler->vars[k].vartype == ORC_VAR_TYPE_SRC ||
compiler->vars[k].vartype == ORC_VAR_TYPE_DEST) {
if (compiler->vars[k].ptr_register) {
- orc_arm_emit_add_imm (compiler,
- compiler->vars[k].ptr_register,
- compiler->vars[k].ptr_register,
- compiler->vars[k].size << compiler->loop_shift);
+ if (compiler->is_64bit)
+ orc_arm64_emit_add_imm (compiler, 32,
+ compiler->vars[k].ptr_register,
+ compiler->vars[k].ptr_register,
+ compiler->vars[k].size << compiler->loop_shift);
+ else
+ orc_arm_emit_add_imm (compiler,
+ compiler->vars[k].ptr_register,
+ compiler->vars[k].ptr_register,
+ compiler->vars[k].size << compiler->loop_shift);
} else {
/* arm_emit_add_imm_memoffset (compiler, arm_ptr_size, */
/* compiler->vars[k].size << compiler->loop_shift, */