diff options
author | Marek Vasut <marex@denx.de> | 2020-08-30 19:26:15 +0300 |
---|---|---|
committer | Marek Vasut <marex@denx.de> | 2020-09-19 04:08:46 +0300 |
commit | bf33f04178ae9001f179a20acee3881b63950e74 (patch) | |
tree | 739b35cb4ed3d8b95bbcfedfa845d8c7ace44a92 | |
parent | 872da542a1ef77107b8d34de39f275824b085dc8 (diff) |
aarch64: Implement sqrtf instruction
Fill in aarch64 opcodes for sqrtf instruction.
Signed-off-by: Marek Vasut <marex@denx.de>
-rw-r--r-- | orc/orcrules-neon.c | 159 |
1 files changed, 99 insertions, 60 deletions
diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c index 789f42c..7e8e3db 100644 --- a/orc/orcrules-neon.c +++ b/orc/orcrules-neon.c @@ -2775,78 +2775,117 @@ static void orc_neon_rule_sqrtf (OrcCompiler *p, void *user, OrcInstruction *insn) { int vec_shift = 1; - if (p->insn_shift <= vec_shift) { + + if (p->is_64bit) { + OrcVariable tmpreg = { .alloc = p->tmpreg, .size = p->vars[insn->src_args[0]].size }; + OrcVariable tmpreg2 = { .alloc = p->tmpreg2, .size = p->vars[insn->src_args[0]].size }; int i; - orc_neon_emit_unary (p, "vrsqrte.f32", 0xf3bb0580, - p->tmpreg, - p->vars[insn->src_args[0]].alloc); + + orc_neon64_emit_unary (p, "frsqrte", 0x2ea1d800, + tmpreg, p->vars[insn->src_args[0]], + p->insn_shift); + for(i = 0; i < NUM_ITERS_SQRTF; i++) { - orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, - p->tmpreg2, - p->tmpreg, - p->vars[insn->src_args[0]].alloc); - orc_neon_emit_binary (p, "vrsqrts.f32", 0xf2200f10, - p->tmpreg2, - p->tmpreg, - p->tmpreg2); - orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, - p->tmpreg, - p->tmpreg, - p->tmpreg2); + orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, + tmpreg2, tmpreg, p->vars[insn->src_args[0]], + p->insn_shift); + orc_neon64_emit_binary (p, "frsqrts", 0x0ea0fc00, + tmpreg2, tmpreg, tmpreg2, p->insn_shift); + orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, + tmpreg, tmpreg, tmpreg2, + p->insn_shift); } - orc_neon_emit_unary(p, "vrecpe.f32", 0xf3bb0500, - p->vars[insn->dest_args[0]].alloc, - p->tmpreg); + orc_neon64_emit_unary (p, "frecpe", 0x0ea1d800, + p->vars[insn->dest_args[0]], tmpreg, + p->insn_shift); - for(i=0; i < NUM_ITERS_DIVF; i++) { - orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10, - p->tmpreg2, /* correction factor */ - p->vars[insn->dest_args[0]].alloc, /* the last estimate */ - p->tmpreg); /* the original number */ - orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, - p->vars[insn->dest_args[0]].alloc, /* revised estimate */ - p->vars[insn->dest_args[0]].alloc, /* last estimate */ - p->tmpreg2); /* correction factor */ + for(i = 0; i < NUM_ITERS_DIVF; i++) { + orc_neon64_emit_binary (p, "frecps", 0x0e20fc00, + tmpreg2, /* correction factor */ + p->vars[insn->dest_args[0]], /* the last estimate */ + tmpreg, /* the original number */ + p->insn_shift); + orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, + p->vars[insn->dest_args[0]], /* revised estimate */ + p->vars[insn->dest_args[0]], /* last estimate */ + tmpreg2, /* correction factor */ + p->insn_shift); } - - } else if (p->insn_shift == vec_shift + 1) { - int i; - orc_neon_emit_unary_quad (p, "vrsqrte.f32", 0xf3bb0580, - p->tmpreg, - p->vars[insn->src_args[0]].alloc); - for(i = 0; i < NUM_ITERS_SQRTF; i++) { - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->tmpreg2, + } else { + if (p->insn_shift <= vec_shift) { + int i; + orc_neon_emit_unary (p, "vrsqrte.f32", 0xf3bb0580, p->tmpreg, p->vars[insn->src_args[0]].alloc); - orc_neon_emit_binary_quad (p, "vrsqrts.f32", 0xf2200f10, - p->tmpreg2, - p->tmpreg, - p->tmpreg2); - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->tmpreg, + for(i = 0; i < NUM_ITERS_SQRTF; i++) { + orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, + p->tmpreg2, + p->tmpreg, + p->vars[insn->src_args[0]].alloc); + orc_neon_emit_binary (p, "vrsqrts.f32", 0xf2200f10, + p->tmpreg2, + p->tmpreg, + p->tmpreg2); + orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, + p->tmpreg, + p->tmpreg, + p->tmpreg2); + } + + orc_neon_emit_unary(p, "vrecpe.f32", 0xf3bb0500, + p->vars[insn->dest_args[0]].alloc, + p->tmpreg); + + for(i=0; i < NUM_ITERS_DIVF; i++) { + orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10, + p->tmpreg2, /* correction factor */ + p->vars[insn->dest_args[0]].alloc, /* the last estimate */ + p->tmpreg); /* the original number */ + orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, + p->vars[insn->dest_args[0]].alloc, /* revised estimate */ + p->vars[insn->dest_args[0]].alloc, /* last estimate */ + p->tmpreg2); /* correction factor */ + } + + } else if (p->insn_shift == vec_shift + 1) { + int i; + orc_neon_emit_unary_quad (p, "vrsqrte.f32", 0xf3bb0580, p->tmpreg, - p->tmpreg2); - } + p->vars[insn->src_args[0]].alloc); + for(i = 0; i < NUM_ITERS_SQRTF; i++) { + orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, + p->tmpreg2, + p->tmpreg, + p->vars[insn->src_args[0]].alloc); + orc_neon_emit_binary_quad (p, "vrsqrts.f32", 0xf2200f10, + p->tmpreg2, + p->tmpreg, + p->tmpreg2); + orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, + p->tmpreg, + p->tmpreg, + p->tmpreg2); + } - orc_neon_emit_unary_quad(p, "vrecpe.f32", 0xf3bb0500, - p->vars[insn->dest_args[0]].alloc, - p->tmpreg); + orc_neon_emit_unary_quad(p, "vrecpe.f32", 0xf3bb0500, + p->vars[insn->dest_args[0]].alloc, + p->tmpreg); - for(i=0; i < NUM_ITERS_DIVF; i++) { - orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10, - p->tmpreg2, /* correction factor */ - p->vars[insn->dest_args[0]].alloc, /* the last estimate */ - p->tmpreg); /* the original number */ - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->vars[insn->dest_args[0]].alloc, /* revised estimate */ - p->vars[insn->dest_args[0]].alloc, /* last estimate */ - p->tmpreg2); /* correction factor */ - } + for(i=0; i < NUM_ITERS_DIVF; i++) { + orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10, + p->tmpreg2, /* correction factor */ + p->vars[insn->dest_args[0]].alloc, /* the last estimate */ + p->tmpreg); /* the original number */ + orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, + p->vars[insn->dest_args[0]].alloc, /* revised estimate */ + p->vars[insn->dest_args[0]].alloc, /* last estimate */ + p->tmpreg2); /* correction factor */ + } - } else { - ORC_COMPILER_ERROR(p, "shift too large"); + } else { + ORC_COMPILER_ERROR(p, "shift too large"); + } } } #endif |