From f1d8693e34673b86cef1a4333a2ce58369030e5d Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 31 Aug 2020 02:26:33 +0200 Subject: aarch32: Implement loadupdb instruction Fill in aarch32 opcodes for loadupdb instruction, which is used by various color space conversion programs. There is likely still some space for optimization. --- orc/orcprogram-neon.c | 79 +++++++++----- orc/orcrules-neon.c | 293 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 254 insertions(+), 118 deletions(-) diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index 1e0ab2c..de24bdb 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -311,14 +311,12 @@ orc_compiler_neon_init (OrcCompiler *compiler) compiler->unroll_shift = 0; } - if (compiler->is_64bit) { /* The loadupdb is aarch64 only so far */ - for(i=0;in_insns;i++){ - OrcInstruction *insn = compiler->insns + i; - OrcStaticOpcode *opcode = insn->opcode; + for(i=0;in_insns;i++){ + OrcInstruction *insn = compiler->insns + i; + OrcStaticOpcode *opcode = insn->opcode; - if (strcmp (opcode->name, "loadupdb") == 0) { - compiler->vars[insn->src_args[0]].need_offset_reg = TRUE; - } + if (strcmp (opcode->name, "loadupdb") == 0) { + compiler->vars[insn->src_args[0]].need_offset_reg = TRUE; } } @@ -355,29 +353,46 @@ orc_neon_load_constants_outer (OrcCompiler *compiler) orc_compiler_emit_invariants (compiler); - if (compiler->is_64bit) { /* The loadupdb is aarch64 only so far */ - for(i=0;in_insns;i++){ - OrcInstruction *insn = compiler->insns + i; - OrcStaticOpcode *opcode = insn->opcode; + for(i=0;in_insns;i++){ + OrcInstruction *insn = compiler->insns + i; + OrcStaticOpcode *opcode = insn->opcode; - if (strcmp (opcode->name, "loadupdb") == 0) { - if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) { - orc_arm64_emit_load_reg (compiler, 64, + if (strcmp (opcode->name, "loadupdb") == 0) { + if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) { + if (compiler->is_64bit) { + orc_arm64_emit_load_reg (compiler, 64, + compiler->vars[insn->src_args[0]].ptr_offset, + compiler->exec_reg, + ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]])); + } else { + orc_arm_emit_load_reg (compiler, compiler->vars[insn->src_args[0]].ptr_offset, - compiler->exec_reg, + compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]])); + } + } else { + if (!compiler->vars[insn->src_args[0]].ptr_offset) + continue; + if (compiler->is_64bit) { + if (!compiler->vars[insn->src_args[1]].value.i) + orc_arm64_emit_eor(compiler, 64, + compiler->vars[insn->src_args[0]].ptr_offset, + compiler->vars[insn->src_args[0]].ptr_offset, + compiler->vars[insn->src_args[0]].ptr_offset); + else + orc_arm64_emit_load_imm(compiler, 64, + compiler->vars[insn->src_args[0]].ptr_offset, + compiler->vars[insn->src_args[1]].value.i); } else { - if (!compiler->vars[insn->src_args[0]].ptr_offset) - continue; - if (!compiler->vars[insn->src_args[1]].value.i) - orc_arm64_emit_eor(compiler, 64, - compiler->vars[insn->src_args[0]].ptr_offset, - compiler->vars[insn->src_args[0]].ptr_offset, - compiler->vars[insn->src_args[0]].ptr_offset); - else - orc_arm64_emit_load_imm(compiler, 64, - compiler->vars[insn->src_args[0]].ptr_offset, - compiler->vars[insn->src_args[1]].value.i); + if (!compiler->vars[insn->src_args[1]].value.i) + orc_arm_emit_eor_r(compiler, ORC_ARM_COND_AL, 0, + compiler->vars[insn->src_args[0]].ptr_offset, + compiler->vars[insn->src_args[0]].ptr_offset, + compiler->vars[insn->src_args[0]].ptr_offset); + else + orc_arm_emit_load_imm(compiler, + compiler->vars[insn->src_args[0]].ptr_offset, + compiler->vars[insn->src_args[1]].value.i); } } } @@ -411,6 +426,11 @@ orc_neon_load_constants_inner (OrcCompiler *compiler) orc_arm_emit_load_reg (compiler, compiler->vars[i].ptr_register, compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i])); + if (compiler->vars[i].ptr_offset) + orc_arm_emit_eor_r(compiler, ORC_ARM_COND_AL, 0, + compiler->vars[i].ptr_offset, + compiler->vars[i].ptr_offset, + compiler->vars[i].ptr_offset); } break; case ORC_VAR_TYPE_ACCUMULATOR: @@ -1182,10 +1202,17 @@ orc_neon_emit_loop (OrcCompiler *compiler, int unroll_index) compiler->vars[k].size << compiler->loop_shift); } } else { + if (compiler->vars[k].ptr_offset) { + orc_arm_emit_add_imm (compiler, + compiler->vars[k].ptr_offset, + compiler->vars[k].ptr_offset, + compiler->vars[k].size << compiler->loop_shift); + } else if (compiler->vars[k].ptr_register) { orc_arm_emit_add_imm (compiler, compiler->vars[k].ptr_register, compiler->vars[k].ptr_register, compiler->vars[k].size << compiler->loop_shift); + } } } } diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c index 2764e9e..726f0d4 100644 --- a/orc/orcrules-neon.c +++ b/orc/orcrules-neon.c @@ -1080,117 +1080,226 @@ static void neon_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; + OrcVariable *dest = compiler->vars + insn->dest_args[0]; unsigned int code = 0; int size = src->size << compiler->insn_shift; ORC_ASSERT(src->ptr_register); /* can ptr_register be 0 ? */ int ptr_reg; - if (!compiler->is_64bit) { - ORC_COMPILER_ERROR(compiler, "loadupdb is implemented only on aarch64"); - return; - } - /* FIXME this should be fixed at a higher level */ if (src->vartype != ORC_VAR_TYPE_SRC && src->vartype != ORC_VAR_TYPE_DEST) { ORC_COMPILER_ERROR(compiler, "loadX used with non src/dest"); return; } - if (src->ptr_offset) { - ptr_reg = compiler->gp_tmpreg; - orc_arm64_emit_add_lsr(compiler, 64, ptr_reg, src->ptr_register, src->ptr_offset, 1); - } else { - ptr_reg = src->ptr_register; - } - - int opcode, flag; - - if (size > 16) { - /** load multiple single-element structures to one, two, three, or four registers */ - char vt_str[64]; - - memset(vt_str, '\x00', 64); - - if (size == 64) { - snprintf(vt_str, 64, "%s, %s, %s, %s", - orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1), - orc_neon64_reg_name_vector (compiler->tmpreg + 1, 1, 1), - orc_neon64_reg_name_vector (compiler->tmpreg + 2, 1, 1), - orc_neon64_reg_name_vector (compiler->tmpreg + 3, 1, 1)); - opcode = 0x2; - } else if (size == 32) { - snprintf(vt_str, 64, "%s, %s", - orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1), - orc_neon64_reg_name_vector (compiler->tmpreg + 1, 1, 1)); - opcode = 0xa; - } else if (size == 16) { - snprintf(vt_str, 64, "%s", - orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1)); - opcode = 0x7; + if (compiler->is_64bit) { + if (src->ptr_offset) { + ptr_reg = compiler->gp_tmpreg; + orc_arm64_emit_add_lsr(compiler, 64, ptr_reg, src->ptr_register, src->ptr_offset, 1); } else { - ORC_COMPILER_ERROR(compiler,"bad aligned load size %d", - src->size << compiler->insn_shift); - return; + ptr_reg = src->ptr_register; } - flag = 0; /* Bytes */ - - ORC_ASM_CODE(compiler," ld1 { %s }, [%s]\n", - vt_str, orc_arm64_reg_name (ptr_reg, 64)); - code = 0x0c400000; - code |= 0 << 30; /* Q-bit */ - code |= (flag&0x3) << 10; - code |= (opcode&0xf) << 12; - } else { - /** load one single-element structure to one lane of one register */ - flag = 0; - if (size == 8) { - opcode = 4; - flag = 1; /* size==01 */ - } else if (size == 4) { - opcode = 4; - } else if (size == 2) { - opcode = 2; - } else if (size == 1) { - opcode = 0; + + int opcode, flag; + + if (size > 16) { + /** load multiple single-element structures to one, two, three, or four registers */ + char vt_str[64]; + + memset(vt_str, '\x00', 64); + + if (size == 64) { + snprintf(vt_str, 64, "%s, %s, %s, %s", + orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1), + orc_neon64_reg_name_vector (compiler->tmpreg + 1, 1, 1), + orc_neon64_reg_name_vector (compiler->tmpreg + 2, 1, 1), + orc_neon64_reg_name_vector (compiler->tmpreg + 3, 1, 1)); + opcode = 0x2; + } else if (size == 32) { + snprintf(vt_str, 64, "%s, %s", + orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1), + orc_neon64_reg_name_vector (compiler->tmpreg + 1, 1, 1)); + opcode = 0xa; + } else if (size == 16) { + snprintf(vt_str, 64, "%s", + orc_neon64_reg_name_vector (compiler->tmpreg, 1, 1)); + opcode = 0x7; + } else { + ORC_COMPILER_ERROR(compiler,"bad aligned load size %d", + src->size << compiler->insn_shift); + return; + } + flag = 0; /* Bytes */ + + ORC_ASM_CODE(compiler," ld1 { %s }, [%s]\n", + vt_str, orc_arm64_reg_name (ptr_reg, 64)); + code = 0x0c400000; + code |= 0 << 30; /* Q-bit */ + code |= (flag&0x3) << 10; + code |= (opcode&0xf) << 12; } else { - ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d", - src->size << compiler->insn_shift); - return; + /** load one single-element structure to one lane of one register */ + flag = 0; + if (size == 8) { + opcode = 4; + flag = 1; /* size==01 */ + } else if (size == 4) { + opcode = 4; + } else if (size == 2) { + opcode = 2; + } else if (size == 1) { + opcode = 0; + } else { + ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d", + src->size << compiler->insn_shift); + return; + } + ORC_ASM_CODE(compiler," ld1 { %s }[0], [%s]\n", + orc_neon64_reg_name_vector_single (compiler->tmpreg, size), + orc_arm64_reg_name (ptr_reg, 64)); + code = 0x0d400000; + code |= (opcode&0x7) << 13; + code |= (flag&0x3) << 10; } - ORC_ASM_CODE(compiler," ld1 { %s }[0], [%s]\n", - orc_neon64_reg_name_vector_single (compiler->tmpreg, size), - orc_arm64_reg_name (ptr_reg, 64)); - code = 0x0d400000; - code |= (opcode&0x7) << 13; - code |= (flag&0x3) << 10; - } - code |= (ptr_reg&0x1f) << 5; - code |= (compiler->tmpreg&0x1f); + code |= (ptr_reg&0x1f) << 5; + code |= (compiler->tmpreg&0x1f); - orc_arm_emit (compiler, code); + orc_arm_emit (compiler, code); - OrcVariable tmpreg = { .alloc = compiler->tmpreg, .size = compiler->vars[insn->src_args[0]].size }; + OrcVariable tmpreg = { .alloc = compiler->tmpreg, .size = compiler->vars[insn->src_args[0]].size }; + + switch (src->size) { + case 1: + orc_neon64_emit_binary (compiler, "zip1", 0x0e003800, + compiler->vars[insn->dest_args[0]], + tmpreg, + tmpreg, compiler->insn_shift - 1); + break; + case 2: + orc_neon64_emit_binary (compiler, "zip1", 0x0e403800, + compiler->vars[insn->dest_args[0]], + tmpreg, + tmpreg, compiler->insn_shift - 1); + break; + case 4: + orc_neon64_emit_binary (compiler, "zip1", 0x0e803800, + compiler->vars[insn->dest_args[0]], + tmpreg, + tmpreg, compiler->insn_shift - 1); + break; + } + } else { + if (src->ptr_offset) { + ptr_reg = compiler->gp_tmpreg; + orc_arm_emit_add_rsi(compiler, ORC_ARM_COND_AL, 0, + ptr_reg, src->ptr_register, + src->ptr_offset, ORC_ARM_LSR, 1); + } else { + ptr_reg = src->ptr_register; + } + if (size > 8) { + if (src->is_aligned) { + if (size == 32) { + ORC_ASM_CODE(compiler," vld1.64 { %s, %s, %s, %s }, [%s,:256]\n", + orc_neon_reg_name (dest->alloc), + orc_neon_reg_name (dest->alloc + 1), + orc_neon_reg_name (dest->alloc + 2), + orc_neon_reg_name (dest->alloc + 3), + orc_arm_reg_name (ptr_reg)); + code = 0xf42002dd; + } else if (size == 16) { + ORC_ASM_CODE(compiler," vld1.64 { %s, %s }, [%s,:128]\n", + orc_neon_reg_name (dest->alloc), + orc_neon_reg_name (dest->alloc + 1), + orc_arm_reg_name (ptr_reg)); + code = 0xf4200aed; + } else if (size == 8) { + ORC_ASM_CODE(compiler," vld1.64 %s, [%s]\n", + orc_neon_reg_name (dest->alloc), + orc_arm_reg_name (ptr_reg)); + code = 0xf42007cd; + } else { + ORC_COMPILER_ERROR(compiler,"bad aligned load size %d", + src->size << compiler->insn_shift); + } + } else { + if (size == 32) { + ORC_ASM_CODE(compiler," vld1.8 { %s, %s, %s, %s }, [%s]\n", + orc_neon_reg_name (dest->alloc), + orc_neon_reg_name (dest->alloc + 1), + orc_neon_reg_name (dest->alloc + 2), + orc_neon_reg_name (dest->alloc + 3), + orc_arm_reg_name (ptr_reg)); + code = 0xf420020d; + } else if (size == 16) { + ORC_ASM_CODE(compiler," vld1.8 { %s, %s }, [%s]\n", + orc_neon_reg_name (dest->alloc), + orc_neon_reg_name (dest->alloc + 1), + orc_arm_reg_name (ptr_reg)); + code = 0xf4200a0d; + } else if (size == 8) { + ORC_ASM_CODE(compiler," vld1.8 %s, [%s]\n", + orc_neon_reg_name (dest->alloc), + orc_arm_reg_name (ptr_reg)); + code = 0xf420070d; + } else { + ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d", + src->size << compiler->insn_shift); + } + } + } else { + int shift; + if (size == 4) { + shift = 2; + } else if (size == 2) { + shift = 1; + } else { + shift = 0; + } + ORC_ASM_CODE(compiler," vld1.%d %s[0], [%s]\n", + 8<alloc), + orc_arm_reg_name (ptr_reg)); + code = 0xf4a0000d; + code |= shift<<10; + code |= (0&7)<<5; + } + code |= (ptr_reg&0xf) << 16; + code |= (dest->alloc&0xf) << 12; + code |= ((dest->alloc>>4)&0x1) << 22; + code |= 1 << 1; + orc_arm_emit (compiler, code); - switch (src->size) { - case 1: - orc_neon64_emit_binary (compiler, "zip1", 0x0e003800, - compiler->vars[insn->dest_args[0]], - tmpreg, - tmpreg, compiler->insn_shift - 1); - break; - case 2: - orc_neon64_emit_binary (compiler, "zip1", 0x0e403800, - compiler->vars[insn->dest_args[0]], - tmpreg, - tmpreg, compiler->insn_shift - 1); - break; - case 4: - orc_neon64_emit_binary (compiler, "zip1", 0x0e803800, - compiler->vars[insn->dest_args[0]], - tmpreg, - tmpreg, compiler->insn_shift - 1); - break; + switch (src->size) { + case 1: + orc_neon_emit_binary (compiler, "vorr", 0xf2200110, + compiler->vars[insn->dest_args[0]].alloc + 1, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc); + orc_neon_emit_unary (compiler, "vzip.8", 0xf3b20180, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc + 1); + break; + case 2: + orc_neon_emit_binary (compiler, "vorr", 0xf2200110, + compiler->vars[insn->dest_args[0]].alloc + 1, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc); + orc_neon_emit_unary (compiler, "vzip.16", 0xf3b60180, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc + 1); + break; + case 4: + orc_neon_emit_binary (compiler, "vorr", 0xf2200110, + compiler->vars[insn->dest_args[0]].alloc + 1, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc); + orc_neon_emit_unary_quad (compiler, "vzip.32", 0xf3ba0180, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc + 1); + break; + } } src->update_type = 1; -- cgit v1.2.3