#include "config.h" #include #include #include #include #include #include #include #undef MMX #define SIZE 65536 /* sse rules */ static void sse_rule_loadpX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int reg; int size = ORC_PTR_TO_INT(user); if (src->vartype == ORC_VAR_TYPE_PARAM) { reg = dest->alloc; if (size == 8 && src->size == 8) { ORC_COMPILER_ERROR(compiler,"64-bit parameters not implemented"); } orc_x86_emit_mov_memoffset_sse (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0]]), compiler->exec_reg, reg, FALSE); if (size == 1) { orc_sse_emit_punpcklbw (compiler, reg, reg); } #ifndef MMX if (size <= 2) { orc_sse_emit_pshuflw (compiler, 0, reg, reg); } orc_sse_emit_pshufd (compiler, 0, reg, reg); #else if (size <= 2) { orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(0,0,0,0), reg, reg); } else { orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg); } #endif } else if (src->vartype == ORC_VAR_TYPE_CONST) { if (size == 8 && src->size == 8) { ORC_COMPILER_ERROR(compiler,"64-bit constants not implemented"); } sse_load_constant (compiler, dest->alloc, size, src->value.i); } else { ORC_ASSERT(0); } } static void sse_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; offset = compiler->offset * src->size; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg, compiler->gp_tmpreg); orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, dest->alloc); break; case 2: orc_sse_emit_pxor (compiler, dest->alloc, dest->alloc); orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); break; case 4: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, src->is_aligned); break; default: ORC_COMPILER_ERROR(compiler,"bad load size %d", src->size << compiler->loop_shift); break; } src->update_type = 2; } static void sse_rule_loadoffX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; if (compiler->vars[insn->src_args[1]].vartype != ORC_VAR_TYPE_CONST) { ORC_COMPILER_ERROR(compiler, "Rule only works with consts"); return; } offset = (compiler->offset + compiler->vars[insn->src_args[1]].value.i) * src->size; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg, compiler->gp_tmpreg); orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, dest->alloc); break; case 2: orc_sse_emit_pxor (compiler, dest->alloc, dest->alloc); orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); break; case 4: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, src->is_aligned); break; default: ORC_COMPILER_ERROR(compiler,"bad load size %d", src->size << compiler->loop_shift); break; } src->update_type = 2; } static void sse_rule_loadupib (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; int tmp = orc_compiler_get_temp_reg (compiler); offset = (compiler->offset * src->size) >> 1; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: case 2: orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); orc_sse_emit_movdqa (compiler, dest->alloc, tmp); orc_sse_emit_psrlw (compiler, 8, tmp); break; case 4: orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); orc_sse_emit_pinsrw_memoffset (compiler, 0, offset + 1, ptr_reg, tmp); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, FALSE); orc_x86_emit_mov_memoffset_sse (compiler, 4, offset + 1, ptr_reg, tmp, FALSE); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, FALSE); orc_x86_emit_mov_memoffset_sse (compiler, 8, offset + 1, ptr_reg, tmp, FALSE); break; case 32: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, FALSE); orc_x86_emit_mov_memoffset_sse (compiler, 16, offset + 1, ptr_reg, tmp, FALSE); break; default: ORC_COMPILER_ERROR(compiler,"bad load size %d", src->size << compiler->loop_shift); break; } orc_sse_emit_pavgb (compiler, dest->alloc, tmp); orc_sse_emit_punpcklbw (compiler, tmp, dest->alloc); src->update_type = 1; } static void sse_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int ptr_reg; int offset = 0; offset = (compiler->offset * src->size) >> 1; if (src->ptr_register == 0) { int i = insn->src_args[0]; orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = src->ptr_register; } switch (src->size << compiler->loop_shift) { case 1: case 2: orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg, compiler->gp_tmpreg); orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, dest->alloc); break; case 4: orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc); break; case 8: orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 16: orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg, dest->alloc, src->is_aligned); break; case 32: orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg, dest->alloc, src->is_aligned); break; default: ORC_COMPILER_ERROR(compiler,"bad load size %d", src->size << compiler->loop_shift); break; } switch (src->size) { case 1: orc_sse_emit_punpcklbw (compiler, dest->alloc, dest->alloc); break; case 2: orc_sse_emit_punpcklwd (compiler, dest->alloc, dest->alloc); break; case 4: orc_sse_emit_punpckldq (compiler, dest->alloc, dest->alloc); break; } src->update_type = 1; } static void sse_rule_storeX (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int offset; int ptr_reg; offset = compiler->offset * dest->size; if (dest->ptr_register == 0) { orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4, dest->ptr_offset, compiler->exec_reg, compiler->gp_tmpreg); ptr_reg = compiler->gp_tmpreg; } else { ptr_reg = dest->ptr_register; } switch (dest->size << compiler->loop_shift) { case 1: /* FIXME we might be using ecx twice here */ if (ptr_reg == compiler->gp_tmpreg) { ORC_COMPILER_ERROR(compiler,"unimplemented"); } orc_x86_emit_mov_sse_reg (compiler, src->alloc, compiler->gp_tmpreg); orc_x86_emit_mov_reg_memoffset (compiler, 1, compiler->gp_tmpreg, offset, ptr_reg); break; case 2: if (compiler->target_flags & ORC_TARGET_SSE_SSE4_1) { orc_sse_emit_pextrw_memoffset (compiler, 0, src->alloc, offset, ptr_reg); } else { /* FIXME we might be using ecx twice here */ if (ptr_reg == compiler->gp_tmpreg) { ORC_COMPILER_ERROR(compiler,"unimplemented"); } orc_x86_emit_mov_sse_reg (compiler, src->alloc, compiler->gp_tmpreg); orc_x86_emit_mov_reg_memoffset (compiler, 2, compiler->gp_tmpreg, offset, ptr_reg); } break; case 4: orc_x86_emit_mov_sse_memoffset (compiler, 4, src->alloc, offset, ptr_reg, dest->is_aligned, dest->is_uncached); break; case 8: orc_x86_emit_mov_sse_memoffset (compiler, 8, src->alloc, offset, ptr_reg, dest->is_aligned, dest->is_uncached); break; case 16: orc_x86_emit_mov_sse_memoffset (compiler, 16, src->alloc, offset, ptr_reg, dest->is_aligned, dest->is_uncached); break; default: ORC_COMPILER_ERROR(compiler,"bad size"); break; } dest->update_type = 2; } #if try1 static void sse_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int tmp = orc_compiler_get_temp_reg (compiler); int tmp2 = orc_compiler_get_temp_reg (compiler); int tmpc; orc_x86_emit_mov_sse_reg (compiler, X86_XMM6, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); ORC_ASM_CODE(compiler," movdqu 0(%%%s,%%%s,4), %%%s\n", orc_x86_get_regname_ptr(compiler, src->ptr_register), orc_x86_get_regname_ptr(compiler, compiler->gp_tmpreg), orc_x86_get_regname_sse(dest->alloc)); *compiler->codeptr++ = 0xf3; orc_x86_emit_rex(compiler, 0, dest->ptr_register, 0, dest->alloc); *compiler->codeptr++ = 0x0f; *compiler->codeptr++ = 0x6f; orc_x86_emit_modrm_memindex (compiler, dest->alloc, 0, src->ptr_register, compiler->gp_tmpreg, 2); #if 0 orc_sse_emit_movdqa (compiler, X86_XMM6, tmp); orc_sse_emit_pslld (compiler, 10, tmp); orc_sse_emit_psrld (compiler, 26, tmp); orc_sse_emit_pslld (compiler, 2, tmp); orc_sse_emit_movdqa (compiler, tmp, tmp2); orc_sse_emit_pslld (compiler, 8, tmp2); orc_sse_emit_por (compiler, tmp2, tmp); orc_sse_emit_movdqa (compiler, tmp, tmp2); orc_sse_emit_pslld (compiler, 16, tmp2); orc_sse_emit_por (compiler, tmp2, tmp); #else orc_sse_emit_movdqa (compiler, X86_XMM6, tmp); tmpc = orc_compiler_get_constant_long (compiler, 0x02020202, 0x06060606, 0x0a0a0a0a, 0x0e0e0e0e); orc_sse_emit_pshufb (compiler, tmpc, tmp); orc_sse_emit_paddb (compiler, tmp, tmp); orc_sse_emit_paddb (compiler, tmp, tmp); #endif orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(0,0,0,0), tmp, tmp2); orc_sse_emit_psubd (compiler, tmp2, tmp); tmpc = orc_compiler_get_constant (compiler, 4, 0x03020100); orc_sse_emit_paddd (compiler, tmpc, tmp); orc_sse_emit_pshufb (compiler, tmp, dest->alloc); orc_sse_emit_movdqa (compiler, X86_XMM7, tmp); orc_sse_emit_pslld (compiler, compiler->loop_shift, tmp); orc_sse_emit_paddd (compiler, tmp, X86_XMM6); src->update_type = 0; } #endif static void sse_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; int increment_var = insn->src_args[2]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int tmp = orc_compiler_get_temp_reg (compiler); int i; for(i=0;i<(1<loop_shift);i++){ if (i == 0) { orc_x86_emit_mov_memoffset_sse (compiler, 4, 0, src->ptr_register, dest->alloc, FALSE); } else { orc_x86_emit_mov_memindex_sse (compiler, 4, 0, src->ptr_register, compiler->gp_tmpreg, 2, tmp, FALSE); #ifdef MMX //orc_mmx_emit_punpckldq (compiler, tmp, dest->alloc); orc_sse_emit_psllq (compiler, 8*4*i, tmp); orc_sse_emit_por (compiler, tmp, dest->alloc); #else orc_sse_emit_pslldq (compiler, 4*i, tmp); orc_sse_emit_por (compiler, tmp, dest->alloc); #endif } if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, 4, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); } orc_x86_emit_add_reg_reg_shift (compiler, 4, compiler->gp_tmpreg, src->ptr_register, 2); orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); src->update_type = 0; } static void sse_rule_ldreslinl (OrcCompiler *compiler, void *user, OrcInstruction *insn) { OrcVariable *src = compiler->vars + insn->src_args[0]; int increment_var = insn->src_args[2]; OrcVariable *dest = compiler->vars + insn->dest_args[0]; int tmp = orc_compiler_get_temp_reg (compiler); int tmp2 = orc_compiler_get_temp_reg (compiler); int i; if (compiler->loop_shift == 0) { orc_x86_emit_mov_memoffset_sse (compiler, 8, 0, src->ptr_register, tmp, FALSE); orc_sse_emit_pxor (compiler, tmp2, tmp2); orc_sse_emit_punpcklbw (compiler, tmp2, tmp); orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(3,2,3,2), tmp, tmp2); orc_sse_emit_psubw (compiler, tmp, tmp2); orc_x86_emit_mov_reg_sse (compiler, src->ptr_offset, tmp); orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(0,0,0,0), tmp, tmp); orc_sse_emit_psrlw (compiler, 8, tmp); orc_sse_emit_pmullw (compiler, tmp2, tmp); orc_sse_emit_psraw (compiler, 8, tmp); orc_sse_emit_pxor (compiler, tmp2, tmp2); orc_sse_emit_packsswb (compiler, tmp2, tmp); orc_x86_emit_mov_memoffset_sse (compiler, 4, 0, src->ptr_register, dest->alloc, FALSE); orc_sse_emit_paddb (compiler, tmp, dest->alloc); if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, 4, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_x86_emit_add_reg_reg_shift (compiler, 4, compiler->gp_tmpreg, src->ptr_register, 2); orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); } else { int tmp3 = orc_compiler_get_temp_reg (compiler); int tmp4 = orc_compiler_get_temp_reg (compiler); for(i=0;i<(1<loop_shift);i+=2){ orc_x86_emit_mov_memoffset_sse (compiler, 8, 0, src->ptr_register, tmp, FALSE); orc_x86_emit_mov_reg_sse (compiler, src->ptr_offset, tmp4); if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, 4, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_x86_emit_mov_memindex_sse (compiler, 8, 0, src->ptr_register, compiler->gp_tmpreg, 2, tmp2, FALSE); orc_sse_emit_punpckldq (compiler, tmp2, tmp); orc_sse_emit_movdqa (compiler, tmp, tmp2); if (i == 0) { orc_sse_emit_movdqa (compiler, tmp, dest->alloc); } else { orc_sse_emit_punpcklqdq (compiler, tmp, dest->alloc); } orc_sse_emit_pxor (compiler, tmp3, tmp3); orc_sse_emit_punpcklbw (compiler, tmp3, tmp); orc_sse_emit_punpckhbw (compiler, tmp3, tmp2); orc_sse_emit_psubw (compiler, tmp, tmp2); ORC_ASM_CODE(compiler," pinsrw $%d, %%%s, %%%s\n", 1, orc_x86_get_regname (src->ptr_offset), orc_x86_get_regname_sse(tmp4)); *compiler->codeptr++ = 0x66; orc_x86_emit_rex (compiler, 0, tmp4, 0, src->ptr_offset); *compiler->codeptr++ = 0x0f; *compiler->codeptr++ = 0xc4; orc_x86_emit_modrm_reg (compiler, src->ptr_offset, tmp4); *compiler->codeptr++ = 1; #if 0 orc_sse_emit_punpcklwd (compiler, tmp4, tmp4); orc_sse_emit_punpckldq (compiler, tmp4, tmp4); #else orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(1,1,0,0), tmp4, tmp4); orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,1,0,0), tmp4, tmp4); #endif orc_sse_emit_psrlw (compiler, 8, tmp4); orc_sse_emit_pmullw (compiler, tmp4, tmp2); orc_sse_emit_psraw (compiler, 8, tmp2); orc_sse_emit_pxor (compiler, tmp, tmp); orc_sse_emit_packsswb (compiler, tmp, tmp2); if (i != 0) { orc_sse_emit_pslldq (compiler, 8, tmp2); } orc_sse_emit_paddb (compiler, tmp2, dest->alloc); if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) { orc_x86_emit_add_memoffset_reg (compiler, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]), compiler->exec_reg, src->ptr_offset); } else { orc_x86_emit_add_imm_reg (compiler, 4, compiler->vars[increment_var].value.i, src->ptr_offset, FALSE); } orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg); orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg); orc_x86_emit_add_reg_reg_shift (compiler, 4, compiler->gp_tmpreg, src->ptr_register, 2); orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset); } } src->update_type = 0; } static void sse_rule_copyx (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->vars[insn->src_args[0]].alloc == p->vars[insn->dest_args[0]].alloc) { return; } orc_sse_emit_movdqa (p, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } #define UNARY(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_660f (p, insn_name, code, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } #define BINARY(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_660f (p, insn_name, code, \ p->vars[insn->src_args[1]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } UNARY(absb,"pabsb",0x381c) BINARY(addb,"paddb",0xfc) BINARY(addssb,"paddsb",0xec) BINARY(addusb,"paddusb",0xdc) BINARY(andb,"pand",0xdb) BINARY(andnb,"pandn",0xdf) BINARY(avgub,"pavgb",0xe0) BINARY(cmpeqb,"pcmpeqb",0x74) BINARY(cmpgtsb,"pcmpgtb",0x64) BINARY(maxsb,"pmaxsb",0x383c) BINARY(maxub,"pmaxub",0xde) BINARY(minsb,"pminsb",0x3838) BINARY(minub,"pminub",0xda) //BINARY(mullb,"pmullb",0xd5) //BINARY(mulhsb,"pmulhb",0xe5) //BINARY(mulhub,"pmulhub",0xe4) BINARY(orb,"por",0xeb) //UNARY(signb,"psignb",0x3808) BINARY(subb,"psubb",0xf8) BINARY(subssb,"psubsb",0xe8) BINARY(subusb,"psubusb",0xd8) BINARY(xorb,"pxor",0xef) UNARY(absw,"pabsw",0x381d) BINARY(addw,"paddw",0xfd) BINARY(addssw,"paddsw",0xed) BINARY(addusw,"paddusw",0xdd) BINARY(andw,"pand",0xdb) BINARY(andnw,"pandn",0xdf) BINARY(avguw,"pavgw",0xe3) BINARY(cmpeqw,"pcmpeqw",0x75) BINARY(cmpgtsw,"pcmpgtw",0x65) BINARY(maxsw,"pmaxsw",0xee) BINARY(maxuw,"pmaxuw",0x383e) BINARY(minsw,"pminsw",0xea) BINARY(minuw,"pminuw",0x383a) BINARY(mullw,"pmullw",0xd5) BINARY(mulhsw,"pmulhw",0xe5) BINARY(mulhuw,"pmulhuw",0xe4) BINARY(orw,"por",0xeb) //UNARY(signw,"psignw",0x3809) BINARY(subw,"psubw",0xf9) BINARY(subssw,"psubsw",0xe9) BINARY(subusw,"psubusw",0xd9) BINARY(xorw,"pxor",0xef) UNARY(absl,"pabsd",0x381e) BINARY(addl,"paddd",0xfe) //BINARY(addssl,"paddsd",0xed) //BINARY(addusl,"paddusd",0xdd) BINARY(andl,"pand",0xdb) BINARY(andnl,"pandn",0xdf) //BINARY(avgul,"pavgd",0xe3) BINARY(cmpeql,"pcmpeqd",0x76) BINARY(cmpgtsl,"pcmpgtd",0x66) BINARY(maxsl,"pmaxsd",0x383d) BINARY(maxul,"pmaxud",0x383f) BINARY(minsl,"pminsd",0x3839) BINARY(minul,"pminud",0x383b) BINARY(mulll,"pmulld",0x3840) //BINARY(mulhsl,"pmulhd",0xe5) //BINARY(mulhul,"pmulhud",0xe4) BINARY(orl,"por",0xeb) //UNARY(signl,"psignd",0x380a) BINARY(subl,"psubd",0xfa) //BINARY(subssl,"psubsd",0xe9) //BINARY(subusl,"psubusd",0xd9) BINARY(xorl,"pxor",0xef) BINARY(andq,"pand",0xdb) BINARY(andnq,"pandn",0xdf) BINARY(orq,"por",0xeb) BINARY(xorq,"pxor",0xef) BINARY(cmpeqq,"pcmpeqq",0x3829) BINARY(cmpgtsq,"pcmpgtq",0x3837) BINARY(addq,"paddq",0xd4) BINARY(subq,"psubq",0xfb) static void sse_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_paddw (p, src, dest); } static void sse_rule_accl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; #ifndef MMX if (p->loop_shift == 0) { orc_sse_emit_pslldq (p, 12, src); } #endif orc_sse_emit_paddd (p, src, dest); } static void sse_rule_accsadubl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src1 = p->vars[insn->src_args[0]].alloc; int src2 = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); #ifndef MMX if (p->loop_shift <= 2) { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_pslldq (p, 16 - (1<loop_shift), tmp); orc_sse_emit_movdqa (p, src2, tmp2); orc_sse_emit_pslldq (p, 16 - (1<loop_shift), tmp2); orc_sse_emit_psadbw (p, tmp2, tmp); } else if (p->loop_shift == 3) { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psadbw (p, src2, tmp); orc_sse_emit_pslldq (p, 8, tmp); } else { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psadbw (p, src2, tmp); } #else if (p->loop_shift <= 2) { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psllq (p, 8*(8 - (1<loop_shift)), tmp); orc_sse_emit_movdqa (p, src2, tmp2); orc_sse_emit_psllq (p, 8*(8 - (1<loop_shift)), tmp2); orc_sse_emit_psadbw (p, tmp2, tmp); } else { orc_sse_emit_movdqa (p, src1, tmp); orc_sse_emit_psadbw (p, src2, tmp); } #endif orc_sse_emit_paddd (p, tmp, dest); } static void sse_rule_signX_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; const char * names[] = { "psignb", "psignw", "psignd" }; int codes[] = { 0x3808, 0x3809, 0x380a }; int type = ORC_PTR_TO_INT(user); int tmpc; tmpc = orc_compiler_get_temp_constant (p, 1<vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 2, 0x0001); orc_sse_emit_pminsw (p, tmp, dest); tmp = orc_compiler_get_constant (p, 2, 0xffff); orc_sse_emit_pmaxsw (p, tmp, dest); } static void sse_rule_absb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_pxor (p, tmp, tmp); orc_sse_emit_pcmpgtb (p, src, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_psubb (p, tmp, dest); } static void sse_rule_absw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (src == dest) { orc_sse_emit_movdqa (p, src, tmp); } else { orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, tmp, dest); } orc_sse_emit_psraw (p, 15, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_psubw (p, tmp, dest); } static void sse_rule_absl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (src == dest) { orc_sse_emit_movdqa (p, src, tmp); } else { orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, tmp, dest); } orc_sse_emit_psrad (p, 31, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_psubd (p, tmp, dest); } static void sse_rule_shift (OrcCompiler *p, void *user, OrcInstruction *insn) { int type = ORC_PTR_TO_INT(user); int imm_code1[] = { 0x71, 0x71, 0x71, 0x72, 0x72, 0x72, 0x73, 0x73 }; int imm_code2[] = { 6, 2, 4, 6, 2, 4, 6, 2 }; int reg_code[] = { 0xf1, 0xd1, 0xe1, 0xf2, 0xd2, 0xe2, 0xf3, 0xd3 }; const char *code[] = { "psllw", "psrlw", "psraw", "pslld", "psrld", "psrad", "psllq", "psrlq" }; if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_sse_emit_shiftimm (p, code[type], imm_code1[type], imm_code2[type], p->vars[insn->src_args[1]].value.i, p->vars[insn->dest_args[0]].alloc); } else if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) { int tmp = orc_compiler_get_temp_reg (p); /* FIXME this is a gross hack to reload the register with a * 64-bit version of the parameter. */ orc_x86_emit_mov_memoffset_sse (p, 4, (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]), p->exec_reg, tmp, FALSE); orc_sse_emit_660f (p, code[type], reg_code[type], tmp, p->vars[insn->dest_args[0]].alloc); } else { ORC_COMPILER_ERROR(p,"rule only works with constants or params"); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shlb (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_sse_emit_psllw (p, p->vars[insn->src_args[1]].value.i, dest); tmp = orc_compiler_get_constant (p, 1, 0xff&(0xff<vars[insn->src_args[1]].value.i)); orc_sse_emit_pand (p, tmp, dest); } else { ORC_COMPILER_ERROR(p,"rule only works with constants"); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shrsb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psllw (p, 8, tmp); orc_sse_emit_psraw (p, p->vars[insn->src_args[1]].value.i, tmp); orc_sse_emit_psrlw (p, 8, tmp); orc_sse_emit_psraw (p, 8 + p->vars[insn->src_args[1]].value.i, dest); orc_sse_emit_psllw (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } else { ORC_COMPILER_ERROR(p,"rule only works with constants"); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shrub (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { orc_sse_emit_psrlw (p, p->vars[insn->src_args[1]].value.i, dest); tmp = orc_compiler_get_constant (p, 1, (0xff>>p->vars[insn->src_args[1]].value.i)); orc_sse_emit_pand (p, tmp, dest); } else { ORC_COMPILER_ERROR(p,"rule only works with constants"); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_shrsq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) { #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,3,1,1), src, tmp); #else orc_mmx_emit_pshufw (p, ORC_MMX_SHUF(3,2,3,2), src, tmp); #endif orc_sse_emit_psrad (p, 31, tmp); orc_sse_emit_psllq (p, 64-p->vars[insn->src_args[1]].value.i, tmp); orc_sse_emit_psrlq (p, p->vars[insn->src_args[1]].value.i, dest); orc_sse_emit_por (p, tmp, dest); } else { ORC_COMPILER_ERROR(p,"rule only works with constants"); p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE; } } static void sse_rule_convsbw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, src, dest); orc_sse_emit_psraw (p, 8, dest); } static void sse_rule_convubw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* FIXME need a zero register */ if (0) { orc_sse_emit_punpcklbw (p, src, dest); orc_sse_emit_psrlw (p, 8, dest); } else { orc_sse_emit_pxor(p, tmp, tmp); orc_sse_emit_punpcklbw (p, tmp, dest); } } static void sse_rule_convssswb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packsswb (p, src, dest); } static void sse_rule_convsuswb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packuswb (p, src, dest); } static void sse_rule_convuuswb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, src, dest); orc_sse_emit_psrlw (p, 15, tmp); orc_sse_emit_psllw (p, 14, tmp); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_psllw (p, 1, tmp); orc_sse_emit_pxor (p, tmp, dest); orc_sse_emit_packuswb (p, dest, dest); } static void sse_rule_convwb (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_psllw (p, 8, dest); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_packuswb (p, dest, dest); } static void sse_rule_convhwb (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_packuswb (p, dest, dest); } static void sse_rule_convswl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklwd (p, src, dest); orc_sse_emit_psrad (p, 16, dest); } static void sse_rule_convuwl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* FIXME need a zero register */ if (0) { orc_sse_emit_punpcklwd (p, src, dest); orc_sse_emit_psrld (p, 16, dest); } else { orc_sse_emit_pxor(p, tmp, tmp); orc_sse_emit_punpcklwd (p, tmp, dest); } } static void sse_rule_convlw (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_pslld (p, 16, dest); orc_sse_emit_psrad (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_convhlw (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_psrad (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_convssslw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packssdw (p, src, dest); } static void sse_rule_convsuslw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_packusdw (p, src, dest); } static void sse_rule_convslq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psrad (p, 31, tmp); orc_sse_emit_punpckldq (p, tmp, dest); } static void sse_rule_convulq (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 4, 0); orc_sse_emit_punpckldq (p, tmp, dest); } static void sse_rule_convql (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest); #else orc_sse_emit_movdqa (p, src, dest); #endif } static void sse_rule_splatw3q (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; #ifndef MMX orc_sse_emit_pshuflw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest); orc_sse_emit_pshufhw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest); #else orc_mmx_emit_pshufw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest); #endif } static void sse_rule_splatbw (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, dest, dest); } static void sse_rule_splatbl (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, dest, dest); orc_sse_emit_punpcklwd (p, dest, dest); } static void sse_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmpc; tmpc = orc_compiler_get_constant (p, 2, 0x0080); orc_sse_emit_paddw (p, tmpc, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psrlw (p, 8, tmp); orc_sse_emit_paddw (p, tmp, dest); orc_sse_emit_psrlw (p, 8, dest); } #if 1 static void sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) { /* About 5.2 cycles per array member on ginger */ int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int a = orc_compiler_get_temp_reg (p); int j = orc_compiler_get_temp_reg (p); int j2 = orc_compiler_get_temp_reg (p); int l = orc_compiler_get_temp_reg (p); int divisor = orc_compiler_get_temp_reg (p); int tmp; int i; orc_sse_emit_movdqa (p, src, divisor); orc_sse_emit_psllw (p, 8, divisor); orc_sse_emit_psrlw (p, 1, divisor); sse_load_constant (p, a, 2, 0x00ff); tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_movdqa (p, tmp, j); orc_sse_emit_psrlw (p, 8, j); orc_sse_emit_pxor (p, tmp, dest); for(i=0;i<7;i++){ orc_sse_emit_movdqa (p, divisor, l); orc_sse_emit_pxor (p, tmp, l); orc_sse_emit_pcmpgtw (p, dest, l); orc_sse_emit_movdqa (p, l, j2); orc_sse_emit_pandn (p, divisor, l); orc_sse_emit_psubw (p, l, dest); orc_sse_emit_psrlw (p, 1, divisor); orc_sse_emit_pand (p, j, j2); orc_sse_emit_pxor (p, j2, a); orc_sse_emit_psrlw (p, 1, j); } orc_sse_emit_movdqa (p, divisor, l); orc_sse_emit_pxor (p, tmp, l); orc_sse_emit_pcmpgtw (p, dest, l); orc_sse_emit_pand (p, j, l); orc_sse_emit_pxor (p, l, a); orc_sse_emit_movdqa (p, a, dest); } #else static void sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) { /* About 8.4 cycles per array member on ginger */ int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int b = orc_compiler_get_temp_reg (p); int a = orc_compiler_get_temp_reg (p); int k = orc_compiler_get_temp_reg (p); int j = orc_compiler_get_temp_reg (p); int tmp; int i; orc_sse_emit_movdqa (p, dest, b); tmp = orc_compiler_get_constant (p, 2, 0x00ff); orc_sse_emit_pand (p, tmp, src); tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor (p, tmp, b); orc_sse_emit_pxor (p, a, a); orc_sse_emit_movdqa (p, tmp, j); orc_sse_emit_psrlw (p, 8, j); for(i=0;i<8;i++){ orc_sse_emit_por (p, j, a); orc_sse_emit_movdqa (p, a, k); orc_sse_emit_pmullw (p, src, k); orc_sse_emit_pxor (p, tmp, k); orc_sse_emit_pcmpgtw (p, b, k); orc_sse_emit_pand (p, j, k); orc_sse_emit_pxor (p, k, a); orc_sse_emit_psrlw (p, 1, j); } orc_sse_emit_movdqa (p, a, dest); } #endif static void sse_rule_mulsbw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_punpcklbw (p, src, tmp); orc_sse_emit_psraw (p, 8, tmp); orc_sse_emit_punpcklbw (p, dest, dest); orc_sse_emit_psraw (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); } static void sse_rule_mulubw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_punpcklbw (p, src, tmp); orc_sse_emit_psrlw (p, 8, tmp); orc_sse_emit_punpcklbw (p, dest, dest); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); } static void sse_rule_mullb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pmullw (p, src, dest); orc_sse_emit_psllw (p, 8, dest); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_psraw (p, 8, tmp2); orc_sse_emit_psraw (p, 8, tmp); orc_sse_emit_pmullw (p, tmp2, tmp); orc_sse_emit_psllw (p, 8, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_mulhsb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, dest, tmp2); orc_sse_emit_psllw (p, 8, tmp); orc_sse_emit_psraw (p, 8, tmp); orc_sse_emit_psllw (p, 8, dest); orc_sse_emit_psraw (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psraw (p, 8, tmp); orc_sse_emit_psraw (p, 8, tmp2); orc_sse_emit_pmullw (p, tmp, tmp2); orc_sse_emit_psrlw (p, 8, tmp2); orc_sse_emit_psllw (p, 8, tmp2); orc_sse_emit_por (p, tmp2, dest); } static void sse_rule_mulhub (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_movdqa (p, dest, tmp2); orc_sse_emit_psllw (p, 8, tmp); orc_sse_emit_psrlw (p, 8, tmp); orc_sse_emit_psllw (p, 8, dest); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_pmullw (p, tmp, dest); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psrlw (p, 8, tmp); orc_sse_emit_psrlw (p, 8, tmp2); orc_sse_emit_pmullw (p, tmp, tmp2); orc_sse_emit_psrlw (p, 8, tmp2); orc_sse_emit_psllw (p, 8, tmp2); orc_sse_emit_por (p, tmp2, dest); } static void sse_rule_mulswl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pmulhw (p, src, tmp); orc_sse_emit_pmullw (p, src, dest); orc_sse_emit_punpcklwd (p, tmp, dest); } static void sse_rule_muluwl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pmulhuw (p, src, tmp); orc_sse_emit_pmullw (p, src, dest); orc_sse_emit_punpcklwd (p, tmp, dest); } static void sse_rule_mulll_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int i; int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, offset, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, offset + 16, p->exec_reg, FALSE, FALSE); for(i=0;i<(1<loop_shift);i++) { orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, p->gp_tmpreg); orc_x86_emit_imul_memoffset_reg (p, 4, offset + 16+4*i, p->exec_reg, p->gp_tmpreg); orc_x86_emit_mov_reg_memoffset (p, 4, p->gp_tmpreg, offset + 4*i, p->exec_reg); } orc_x86_emit_mov_memoffset_sse (p, 16, offset, p->exec_reg, p->vars[insn->dest_args[0]].alloc, FALSE); } #ifndef MMX static void sse_rule_mulhsl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, tmp); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), src, tmp2); orc_sse_emit_pmuldq (p, src, dest); orc_sse_emit_pmuldq (p, tmp, tmp2); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), dest, dest); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), tmp2, tmp2); orc_sse_emit_punpckldq (p, tmp2, dest); } #endif static void sse_rule_mulhsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int i; int regsize = p->is_64bit ? 8 : 4; int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, offset, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, offset + 16, p->exec_reg, FALSE, FALSE); orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EAX, offset + 32, p->exec_reg); orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EDX, offset + 40, p->exec_reg); for(i=0;i<(1<loop_shift);i++) { orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, X86_EAX); ORC_ASM_CODE(p," imull %d(%%%s)\n", offset + 16 + 4*i, orc_x86_get_regname_ptr(p, p->exec_reg)); orc_x86_emit_rex(p, 4, 0, 0, p->exec_reg); *p->codeptr++ = 0xf7; orc_x86_emit_modrm_memoffset (p, 5, offset + 16 + 4*i, p->exec_reg); orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, offset + 4*i, p->exec_reg); } orc_x86_emit_mov_memoffset_sse (p, 16, offset, p->exec_reg, p->vars[insn->dest_args[0]].alloc, FALSE); orc_x86_emit_mov_memoffset_reg (p, 8, offset + 32, p->exec_reg, X86_EAX); orc_x86_emit_mov_memoffset_reg (p, 8, offset + 40, p->exec_reg, X86_EDX); } #ifndef MMX static void sse_rule_mulhul (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, tmp); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), src, tmp2); orc_sse_emit_pmuludq (p, src, dest); orc_sse_emit_pmuludq (p, tmp, tmp2); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), dest, dest); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), tmp2, tmp2); orc_sse_emit_punpckldq (p, tmp2, dest); } #endif static void sse_rule_select0lw (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ /* same as convlw */ orc_sse_emit_pslld (p, 16, dest); orc_sse_emit_psrad (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_select1lw (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ orc_sse_emit_psrad (p, 16, dest); orc_sse_emit_packssdw (p, dest, dest); } static void sse_rule_select0wb (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ /* same as convwb */ orc_sse_emit_psllw (p, 8, dest); orc_sse_emit_psraw (p, 8, dest); orc_sse_emit_packsswb (p, dest, dest); } static void sse_rule_select1wb (OrcCompiler *p, void *user, OrcInstruction *insn) { //int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; /* FIXME slow */ orc_sse_emit_psraw (p, 8, dest); orc_sse_emit_packsswb (p, dest, dest); } static void sse_rule_splitql (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest1 = p->vars[insn->dest_args[0]].alloc; int dest2 = p->vars[insn->dest_args[1]].alloc; #ifndef MMX orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest2); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,1,3,1), src, dest1); #else orc_sse_emit_movdqa (p, src, dest2); orc_sse_emit_pshufw (p, ORC_SSE_SHUF(3,2,3,2), src, dest1); #endif } static void sse_rule_splitlw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest1 = p->vars[insn->dest_args[0]].alloc; int dest2 = p->vars[insn->dest_args[1]].alloc; /* FIXME slow */ orc_sse_emit_psrad (p, 16, dest1); orc_sse_emit_packssdw (p, dest1, dest1); if (dest2 != src) { orc_sse_emit_movdqa (p, src, dest2); } orc_sse_emit_pslld (p, 16, dest2); orc_sse_emit_psrad (p, 16, dest2); orc_sse_emit_packssdw (p, dest2, dest2); } static void sse_rule_splitwb (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest1 = p->vars[insn->dest_args[0]].alloc; int dest2 = p->vars[insn->dest_args[1]].alloc; int tmp = orc_compiler_get_constant (p, 2, 0xff); /* FIXME slow */ orc_sse_emit_psraw (p, 8, dest1); orc_sse_emit_packsswb (p, dest1, dest1); if (dest2 != src) { orc_sse_emit_movdqa (p, src, dest2); } #if 0 orc_sse_emit_psllw (p, 8, dest2); orc_sse_emit_psraw (p, 8, dest2); orc_sse_emit_packsswb (p, dest2, dest2); #else orc_sse_emit_pand (p, tmp, dest2); orc_sse_emit_packuswb (p, dest2, dest2); #endif } static void sse_rule_mergebw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklbw (p, src, dest); } static void sse_rule_mergewl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_punpcklwd (p, src, dest); } static void sse_rule_swapw (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psllw (p, 8, tmp); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swapl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pslld (p, 16, tmp); orc_sse_emit_psrld (p, 16, dest); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psllw (p, 8, tmp); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swapwl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pslld (p, 16, tmp); orc_sse_emit_psrld (p, 16, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swapq (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_psllq (p, 32, tmp); orc_sse_emit_psrlq (p, 32, dest); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pslld (p, 16, tmp); orc_sse_emit_psrld (p, 16, dest); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psllw (p, 8, tmp); orc_sse_emit_psrlw (p, 8, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_swaplq (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, dest); } #ifndef MMX static void sse_rule_swapw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x02030001, 0x06070405, 0x0a0b0809, 0x0e0f0c0d); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapw (p, user, insn); } } static void sse_rule_swapl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapl (p, user, insn); } } static void sse_rule_swapwl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapl (p, user, insn); } } static void sse_rule_swapq_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_swapq (p, user, insn); } } static void sse_rule_select0lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x05040100, 0x0d0c0908, 0x05040100, 0x0d0c0908); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select0lw (p, user, insn); } } static void sse_rule_select1lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x07060302, 0x0f0e0b0a, 0x07060302, 0x0f0e0b0a); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select1lw (p, user, insn); } } static void sse_rule_select0wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x06040200, 0x0e0c0a08, 0x06040200, 0x0e0c0a08); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select0wb (p, user, insn); } } static void sse_rule_select1wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn) { int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_try_get_constant_long (p, 0x07050301, 0x0f0d0b09, 0x07050301, 0x0f0d0b09); if (tmp != ORC_REG_INVALID) { orc_sse_emit_pshufb (p, tmp, dest); } else { sse_rule_select1wb (p, user, insn); } } #endif /* slow rules */ static void sse_rule_maxuw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pmaxsw (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_minuw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pminsw (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_avgsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 1, 0x80); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pavgb (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_avgsw_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp; tmp = orc_compiler_get_constant (p, 2, 0x8000); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); orc_sse_emit_pavgw (p, src, dest); orc_sse_emit_pxor(p, tmp, src); orc_sse_emit_pxor(p, tmp, dest); } static void sse_rule_maxsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pcmpgtb (p, src, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_minsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pcmpgtb (p, dest, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_maxsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pcmpgtd (p, src, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_minsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pcmpgtd (p, dest, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_maxul_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmpc; tmpc = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pcmpgtd (p, src, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); } static void sse_rule_minul_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmpc; tmpc = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pcmpgtd (p, dest, tmp); orc_sse_emit_pand (p, tmp, dest); orc_sse_emit_pandn (p, src, tmp); orc_sse_emit_por (p, tmp, dest); orc_sse_emit_pxor(p, tmpc, src); orc_sse_emit_pxor(p, tmpc, dest); } static void sse_rule_avgsl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* (a+b+1) >> 1 = (a|b) - ((a^b)>>1) */ orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pxor(p, src, tmp); orc_sse_emit_psrad(p, 1, tmp); orc_sse_emit_por(p, src, dest); orc_sse_emit_psubd(p, tmp, dest); } static void sse_rule_avgul (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); /* (a+b+1) >> 1 = (a|b) - ((a^b)>>1) */ orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_pxor(p, src, tmp); orc_sse_emit_psrld(p, 1, tmp); orc_sse_emit_por(p, src, dest); orc_sse_emit_psubd(p, tmp, dest); } static void sse_rule_addssl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); #if 0 int tmp2 = orc_compiler_get_temp_reg (p); int tmp3 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pand (p, dest, tmp); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_pxor (p, dest, tmp2); orc_sse_emit_psrad (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); orc_sse_emit_psrad (p, 30, tmp); orc_sse_emit_pslld (p, 30, tmp); orc_sse_emit_movdqa (p, tmp, tmp2); orc_sse_emit_pslld (p, 1, tmp2); orc_sse_emit_movdqa (p, tmp, tmp3); orc_sse_emit_pxor (p, tmp2, tmp3); orc_sse_emit_psrad (p, 31, tmp3); orc_sse_emit_psrad (p, 31, tmp2); tmp = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor (p, tmp, tmp2); // clamped value orc_sse_emit_pand (p, tmp3, tmp2); orc_sse_emit_paddd (p, src, dest); orc_sse_emit_pandn (p, dest, tmp3); // tmp is mask: ~0 is for clamping orc_sse_emit_movdqa (p, tmp3, dest); orc_sse_emit_por (p, tmp2, dest); #endif int s = orc_compiler_get_temp_reg (p); int t = orc_compiler_get_temp_reg (p); /* From Tim Terriberry: (slightly faster than above) m=0xFFFFFFFF; s=_a; t=_a; s^=_b; _a+=_b; t^=_a; t^=m; m>>=1; s|=t; t=_b; s>>=31; t>>=31; _a&=s; t^=m; s=~s&t; _a|=s; */ orc_sse_emit_movdqa (p, dest, s); orc_sse_emit_movdqa (p, dest, t); orc_sse_emit_pxor (p, src, s); orc_sse_emit_paddd (p, src, dest); orc_sse_emit_pxor (p, dest, t); tmp = orc_compiler_get_constant (p, 4, 0xffffffff); orc_sse_emit_pxor (p, tmp, t); orc_sse_emit_por (p, t, s); orc_sse_emit_movdqa (p, src, t); orc_sse_emit_psrad (p, 31, s); orc_sse_emit_psrad (p, 31, t); orc_sse_emit_pand (p, s, dest); tmp = orc_compiler_get_constant (p, 4, 0x7fffffff); orc_sse_emit_pxor (p, tmp, t); orc_sse_emit_pandn (p, t, s); orc_sse_emit_por (p, s, dest); } static void sse_rule_subssl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); int tmp3 = orc_compiler_get_temp_reg (p); tmp = orc_compiler_get_temp_constant (p, 4, 0xffffffff); orc_sse_emit_pxor (p, src, tmp); orc_sse_emit_movdqa (p, tmp, tmp2); orc_sse_emit_por (p, dest, tmp); orc_sse_emit_pxor (p, dest, tmp2); orc_sse_emit_psrad (p, 1, tmp2); orc_sse_emit_psubd (p, tmp2, tmp); orc_sse_emit_psrad (p, 30, tmp); orc_sse_emit_pslld (p, 30, tmp); orc_sse_emit_movdqa (p, tmp, tmp2); orc_sse_emit_pslld (p, 1, tmp2); orc_sse_emit_movdqa (p, tmp, tmp3); orc_sse_emit_pxor (p, tmp2, tmp3); orc_sse_emit_psrad (p, 31, tmp3); // tmp3 is mask: ~0 is for clamping orc_sse_emit_psrad (p, 31, tmp2); tmp = orc_compiler_get_constant (p, 4, 0x80000000); orc_sse_emit_pxor (p, tmp, tmp2); // clamped value orc_sse_emit_pand (p, tmp3, tmp2); orc_sse_emit_psubd (p, src, dest); orc_sse_emit_pandn (p, dest, tmp3); orc_sse_emit_movdqa (p, tmp3, dest); orc_sse_emit_por (p, tmp2, dest); } static void sse_rule_addusl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); #if 0 /* an alternate version. slower. */ /* Compute the bit that gets carried from bit 0 to bit 1 */ orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pand (p, dest, tmp); orc_sse_emit_pslld (p, 31, tmp); orc_sse_emit_psrld (p, 31, tmp); /* Add in (src>>1) */ orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_psrld (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); /* Add in (dest>>1) */ orc_sse_emit_movdqa (p, dest, tmp2); orc_sse_emit_psrld (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); /* turn overflow bit into mask */ orc_sse_emit_psrad (p, 31, tmp); /* compute the sum, then or over the mask */ orc_sse_emit_paddd (p, src, dest); orc_sse_emit_por (p, tmp, dest); #endif orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_pand (p, dest, tmp); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_pxor (p, dest, tmp2); orc_sse_emit_psrld (p, 1, tmp2); orc_sse_emit_paddd (p, tmp2, tmp); orc_sse_emit_psrad (p, 31, tmp); orc_sse_emit_paddd (p, src, dest); orc_sse_emit_por (p, tmp, dest); } static void sse_rule_subusl_slow (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[1]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmp = orc_compiler_get_temp_reg (p); int tmp2 = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, src, tmp2); orc_sse_emit_psrld (p, 1, tmp2); orc_sse_emit_movdqa (p, dest, tmp); orc_sse_emit_psrld (p, 1, tmp); orc_sse_emit_psubd (p, tmp, tmp2); /* turn overflow bit into mask */ orc_sse_emit_psrad (p, 31, tmp2); /* compute the difference, then and over the mask */ orc_sse_emit_psubd (p, src, dest); orc_sse_emit_pand (p, tmp2, dest); } #ifndef MMX /* float ops */ #define UNARY_F(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_0f (p, insn_name, code, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } #define BINARY_F(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_0f (p, insn_name, code, \ p->vars[insn->src_args[1]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } BINARY_F(addf, "addps", 0x58) BINARY_F(subf, "subps", 0x5c) BINARY_F(mulf, "mulps", 0x59) BINARY_F(divf, "divps", 0x5e) UNARY_F(sqrtf, "sqrtps", 0x51) #define UNARY_D(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_660f (p, insn_name, code, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } #define BINARY_D(opcode,insn_name,code) \ static void \ sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_660f (p, insn_name, code, \ p->vars[insn->src_args[1]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } BINARY_D(addd, "addpd", 0x58) BINARY_D(subd, "subpd", 0x5c) BINARY_D(muld, "mulpd", 0x59) BINARY_D(divd, "divpd", 0x5e) UNARY_D(sqrtd, "sqrtpd", 0x51) static void sse_rule_minf (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_0f (p, "minps", 0x5d, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_0f (p, "minps", 0x5d, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_0f (p, "minps", 0x5d, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_mind (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_660f (p, "minpd", 0x5d, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_660f (p, "minpd", 0x5d, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_660f (p, "minpd", 0x5d, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_maxf (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_0f (p, "maxps", 0x5f, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_0f (p, "maxps", 0x5f, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_0f (p, "maxps", 0x5f, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_maxd (OrcCompiler *p, void *user, OrcInstruction *insn) { if (p->target_flags & ORC_TARGET_FAST_NAN) { orc_sse_emit_660f (p, "maxpd", 0x5f, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); } else { int tmp = orc_compiler_get_temp_reg (p); orc_sse_emit_movdqa (p, p->vars[insn->src_args[1]].alloc, tmp); orc_sse_emit_660f (p, "maxpd", 0x5f, p->vars[insn->dest_args[0]].alloc, tmp); orc_sse_emit_660f (p, "maxpd", 0x5f, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); orc_sse_emit_por (p, tmp, p->vars[insn->dest_args[0]].alloc); } } static void sse_rule_cmpeqf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_0f (p, "cmpeqps", 0xc2, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); *p->codeptr++ = 0x00; } static void sse_rule_cmpeqd (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_660f (p, "cmpeqpd", 0xc2, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); *p->codeptr++ = 0x00; } static void sse_rule_cmpltf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_0f (p, "cmpltps", 0xc2, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); *p->codeptr++ = 0x01; } static void sse_rule_cmpltd (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_660f (p, "cmpltpd", 0xc2, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); *p->codeptr++ = 0x01; } static void sse_rule_cmplef (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_0f (p, "cmpleps", 0xc2, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); *p->codeptr++ = 0x02; } static void sse_rule_cmpled (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_660f (p, "cmplepd", 0xc2, p->vars[insn->src_args[1]].alloc, p->vars[insn->dest_args[0]].alloc); *p->codeptr++ = 0x02; } static void sse_rule_convfl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmpc; int tmp = orc_compiler_get_temp_reg (p); tmpc = orc_compiler_get_temp_constant (p, 4, 0x80000000); orc_sse_emit_movdqa (p, src, tmp); orc_sse_emit_f30f (p, "cvttps2dq", 0x5b, src, dest); orc_sse_emit_psrad (p, 31, tmp); orc_sse_emit_pcmpeqd (p, dest, tmpc); orc_sse_emit_pandn (p, tmpc, tmp); orc_sse_emit_paddd (p, tmp, dest); } static void sse_rule_convdl (OrcCompiler *p, void *user, OrcInstruction *insn) { int src = p->vars[insn->src_args[0]].alloc; int dest = p->vars[insn->dest_args[0]].alloc; int tmpc; int tmp = orc_compiler_get_temp_reg (p); tmpc = orc_compiler_get_temp_constant (p, 4, 0x80000000); orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,1,3,1), src, tmp); orc_sse_emit_660f (p, "cvttpd2dq", 0xe6, src, dest); orc_sse_emit_psrad (p, 31, tmp); orc_sse_emit_pcmpeqd (p, dest, tmpc); orc_sse_emit_pandn (p, tmpc, tmp); orc_sse_emit_paddd (p, tmp, dest); } static void sse_rule_convlf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_0f (p, "cvtdq2ps", 0x5b, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_convld (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_f30f (p, "cvtdq2pd", 0xe6, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_convfd (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_0f (p, "cvtps2pd", 0x5a, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } static void sse_rule_convdf (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_sse_emit_660f (p, "cvtpd2ps", 0x5a, p->vars[insn->src_args[0]].alloc, p->vars[insn->dest_args[0]].alloc); } #endif #define UNARY_SSE41(opcode,insn_name) \ static void \ sse_rule_ ## opcode ## _sse41 (OrcCompiler *p, void *user, OrcInstruction *insn) \ { \ orc_sse_emit_ ## insn_name (p, \ p->vars[insn->src_args[0]].alloc, \ p->vars[insn->dest_args[0]].alloc); \ } UNARY_SSE41(convsbw,pmovsxbw); UNARY_SSE41(convswl,pmovsxwd); UNARY_SSE41(convslq,pmovsxdq); UNARY_SSE41(convubw,pmovzxbw); UNARY_SSE41(convuwl,pmovzxwd); UNARY_SSE41(convulq,pmovzxdq); void orc_compiler_sse_register_rules (OrcTarget *target) { OrcRuleSet *rule_set; #define REG(x) \ orc_rule_register (rule_set, #x , sse_rule_ ## x, NULL) /* SSE 2 */ #ifndef MMX rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSE2); #else rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_MMX_MMX); #endif orc_rule_register (rule_set, "loadb", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadw", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadl", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadq", sse_rule_loadX, NULL); orc_rule_register (rule_set, "loadoffb", sse_rule_loadoffX, NULL); orc_rule_register (rule_set, "loadoffw", sse_rule_loadoffX, NULL); orc_rule_register (rule_set, "loadoffl", sse_rule_loadoffX, NULL); orc_rule_register (rule_set, "loadupdb", sse_rule_loadupdb, NULL); orc_rule_register (rule_set, "loadupib", sse_rule_loadupib, NULL); orc_rule_register (rule_set, "loadpb", sse_rule_loadpX, (void *)1); orc_rule_register (rule_set, "loadpw", sse_rule_loadpX, (void *)2); orc_rule_register (rule_set, "loadpl", sse_rule_loadpX, (void *)4); orc_rule_register (rule_set, "loadpq", sse_rule_loadpX, (void *)8); orc_rule_register (rule_set, "ldresnearl", sse_rule_ldresnearl, NULL); orc_rule_register (rule_set, "ldreslinl", sse_rule_ldreslinl, NULL); orc_rule_register (rule_set, "storeb", sse_rule_storeX, NULL); orc_rule_register (rule_set, "storew", sse_rule_storeX, NULL); orc_rule_register (rule_set, "storel", sse_rule_storeX, NULL); orc_rule_register (rule_set, "storeq", sse_rule_storeX, NULL); REG(addb); REG(addssb); REG(addusb); REG(andb); REG(andnb); REG(avgub); REG(cmpeqb); REG(cmpgtsb); REG(maxub); REG(minub); REG(orb); REG(subb); REG(subssb); REG(subusb); REG(xorb); REG(addw); REG(addssw); REG(addusw); REG(andw); REG(andnw); REG(avguw); REG(cmpeqw); REG(cmpgtsw); REG(maxsw); REG(minsw); REG(mullw); REG(mulhsw); REG(mulhuw); REG(orw); REG(subw); REG(subssw); REG(subusw); REG(xorw); REG(addl); REG(andl); REG(andnl); REG(cmpeql); REG(cmpgtsl); REG(orl); REG(subl); REG(xorl); REG(andq); REG(andnq); REG(orq); REG(xorq); REG(addq); REG(subq); REG(select0lw); REG(select1lw); REG(select0wb); REG(select1wb); REG(mergebw); REG(mergewl); orc_rule_register (rule_set, "copyb", sse_rule_copyx, NULL); orc_rule_register (rule_set, "copyw", sse_rule_copyx, NULL); orc_rule_register (rule_set, "copyl", sse_rule_copyx, NULL); orc_rule_register (rule_set, "copyq", sse_rule_copyx, NULL); orc_rule_register (rule_set, "shlw", sse_rule_shift, (void *)0); orc_rule_register (rule_set, "shruw", sse_rule_shift, (void *)1); orc_rule_register (rule_set, "shrsw", sse_rule_shift, (void *)2); orc_rule_register (rule_set, "shll", sse_rule_shift, (void *)3); orc_rule_register (rule_set, "shrul", sse_rule_shift, (void *)4); orc_rule_register (rule_set, "shrsl", sse_rule_shift, (void *)5); orc_rule_register (rule_set, "shlq", sse_rule_shift, (void *)6); orc_rule_register (rule_set, "shruq", sse_rule_shift, (void *)7); orc_rule_register (rule_set, "shrsq", sse_rule_shrsq, NULL); orc_rule_register (rule_set, "convsbw", sse_rule_convsbw, NULL); orc_rule_register (rule_set, "convubw", sse_rule_convubw, NULL); orc_rule_register (rule_set, "convssswb", sse_rule_convssswb, NULL); orc_rule_register (rule_set, "convsuswb", sse_rule_convsuswb, NULL); orc_rule_register (rule_set, "convuuswb", sse_rule_convuuswb, NULL); orc_rule_register (rule_set, "convwb", sse_rule_convwb, NULL); orc_rule_register (rule_set, "convswl", sse_rule_convswl, NULL); orc_rule_register (rule_set, "convuwl", sse_rule_convuwl, NULL); orc_rule_register (rule_set, "convssslw", sse_rule_convssslw, NULL); orc_rule_register (rule_set, "convql", sse_rule_convql, NULL); orc_rule_register (rule_set, "convslq", sse_rule_convslq, NULL); orc_rule_register (rule_set, "convulq", sse_rule_convulq, NULL); //orc_rule_register (rule_set, "convsssql", sse_rule_convsssql, NULL); orc_rule_register (rule_set, "mulsbw", sse_rule_mulsbw, NULL); orc_rule_register (rule_set, "mulubw", sse_rule_mulubw, NULL); orc_rule_register (rule_set, "mulswl", sse_rule_mulswl, NULL); orc_rule_register (rule_set, "muluwl", sse_rule_muluwl, NULL); orc_rule_register (rule_set, "accw", sse_rule_accw, NULL); orc_rule_register (rule_set, "accl", sse_rule_accl, NULL); orc_rule_register (rule_set, "accsadubl", sse_rule_accsadubl, NULL); #ifndef MMX orc_rule_register (rule_set, "addf", sse_rule_addf, NULL); orc_rule_register (rule_set, "subf", sse_rule_subf, NULL); orc_rule_register (rule_set, "mulf", sse_rule_mulf, NULL); orc_rule_register (rule_set, "divf", sse_rule_divf, NULL); orc_rule_register (rule_set, "minf", sse_rule_minf, NULL); orc_rule_register (rule_set, "maxf", sse_rule_maxf, NULL); orc_rule_register (rule_set, "sqrtf", sse_rule_sqrtf, NULL); orc_rule_register (rule_set, "cmpeqf", sse_rule_cmpeqf, NULL); orc_rule_register (rule_set, "cmpltf", sse_rule_cmpltf, NULL); orc_rule_register (rule_set, "cmplef", sse_rule_cmplef, NULL); orc_rule_register (rule_set, "convfl", sse_rule_convfl, NULL); orc_rule_register (rule_set, "convlf", sse_rule_convlf, NULL); orc_rule_register (rule_set, "addd", sse_rule_addd, NULL); orc_rule_register (rule_set, "subd", sse_rule_subd, NULL); orc_rule_register (rule_set, "muld", sse_rule_muld, NULL); orc_rule_register (rule_set, "divd", sse_rule_divd, NULL); orc_rule_register (rule_set, "mind", sse_rule_mind, NULL); orc_rule_register (rule_set, "maxd", sse_rule_maxd, NULL); orc_rule_register (rule_set, "sqrtd", sse_rule_sqrtd, NULL); orc_rule_register (rule_set, "cmpeqd", sse_rule_cmpeqd, NULL); orc_rule_register (rule_set, "cmpltd", sse_rule_cmpltd, NULL); orc_rule_register (rule_set, "cmpled", sse_rule_cmpled, NULL); orc_rule_register (rule_set, "convdl", sse_rule_convdl, NULL); orc_rule_register (rule_set, "convld", sse_rule_convld, NULL); orc_rule_register (rule_set, "convfd", sse_rule_convfd, NULL); orc_rule_register (rule_set, "convdf", sse_rule_convdf, NULL); #endif /* slow rules */ orc_rule_register (rule_set, "maxuw", sse_rule_maxuw_slow, NULL); orc_rule_register (rule_set, "minuw", sse_rule_minuw_slow, NULL); orc_rule_register (rule_set, "avgsb", sse_rule_avgsb_slow, NULL); orc_rule_register (rule_set, "avgsw", sse_rule_avgsw_slow, NULL); orc_rule_register (rule_set, "maxsb", sse_rule_maxsb_slow, NULL); orc_rule_register (rule_set, "minsb", sse_rule_minsb_slow, NULL); orc_rule_register (rule_set, "maxsl", sse_rule_maxsl_slow, NULL); orc_rule_register (rule_set, "minsl", sse_rule_minsl_slow, NULL); orc_rule_register (rule_set, "maxul", sse_rule_maxul_slow, NULL); orc_rule_register (rule_set, "minul", sse_rule_minul_slow, NULL); orc_rule_register (rule_set, "convlw", sse_rule_convlw, NULL); orc_rule_register (rule_set, "signw", sse_rule_signw_slow, NULL); orc_rule_register (rule_set, "absb", sse_rule_absb_slow, NULL); orc_rule_register (rule_set, "absw", sse_rule_absw_slow, NULL); orc_rule_register (rule_set, "absl", sse_rule_absl_slow, NULL); orc_rule_register (rule_set, "swapw", sse_rule_swapw, NULL); orc_rule_register (rule_set, "swapl", sse_rule_swapl, NULL); orc_rule_register (rule_set, "swapwl", sse_rule_swapwl, NULL); orc_rule_register (rule_set, "swapq", sse_rule_swapq, NULL); orc_rule_register (rule_set, "swaplq", sse_rule_swaplq, NULL); orc_rule_register (rule_set, "splitql", sse_rule_splitql, NULL); orc_rule_register (rule_set, "splitlw", sse_rule_splitlw, NULL); orc_rule_register (rule_set, "splitwb", sse_rule_splitwb, NULL); orc_rule_register (rule_set, "avgsl", sse_rule_avgsl, NULL); orc_rule_register (rule_set, "avgul", sse_rule_avgul, NULL); orc_rule_register (rule_set, "shlb", sse_rule_shlb, NULL); orc_rule_register (rule_set, "shrsb", sse_rule_shrsb, NULL); orc_rule_register (rule_set, "shrub", sse_rule_shrub, NULL); orc_rule_register (rule_set, "mulll", sse_rule_mulll_slow, NULL); orc_rule_register (rule_set, "mulhsl", sse_rule_mulhsl_slow, NULL); #ifndef MMX orc_rule_register (rule_set, "mulhul", sse_rule_mulhul, NULL); #endif orc_rule_register (rule_set, "mullb", sse_rule_mullb, NULL); orc_rule_register (rule_set, "mulhsb", sse_rule_mulhsb, NULL); orc_rule_register (rule_set, "mulhub", sse_rule_mulhub, NULL); orc_rule_register (rule_set, "addssl", sse_rule_addssl_slow, NULL); orc_rule_register (rule_set, "subssl", sse_rule_subssl_slow, NULL); orc_rule_register (rule_set, "addusl", sse_rule_addusl_slow, NULL); orc_rule_register (rule_set, "subusl", sse_rule_subusl_slow, NULL); orc_rule_register (rule_set, "convhwb", sse_rule_convhwb, NULL); orc_rule_register (rule_set, "convhlw", sse_rule_convhlw, NULL); orc_rule_register (rule_set, "splatw3q", sse_rule_splatw3q, NULL); orc_rule_register (rule_set, "splatbw", sse_rule_splatbw, NULL); orc_rule_register (rule_set, "splatbl", sse_rule_splatbl, NULL); orc_rule_register (rule_set, "div255w", sse_rule_div255w, NULL); orc_rule_register (rule_set, "divluw", sse_rule_divluw, NULL); /* SSE 3 -- no rules */ /* SSSE 3 */ rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSSE3); orc_rule_register (rule_set, "signb", sse_rule_signX_ssse3, (void *)0); orc_rule_register (rule_set, "signw", sse_rule_signX_ssse3, (void *)1); orc_rule_register (rule_set, "signl", sse_rule_signX_ssse3, (void *)2); REG(absb); REG(absw); REG(absl); #ifndef MMX orc_rule_register (rule_set, "swapw", sse_rule_swapw_ssse3, NULL); orc_rule_register (rule_set, "swapl", sse_rule_swapl_ssse3, NULL); orc_rule_register (rule_set, "swapwl", sse_rule_swapwl_ssse3, NULL); orc_rule_register (rule_set, "swapq", sse_rule_swapq_ssse3, NULL); orc_rule_register (rule_set, "select0lw", sse_rule_select0lw_ssse3, NULL); orc_rule_register (rule_set, "select1lw", sse_rule_select1lw_ssse3, NULL); orc_rule_register (rule_set, "select0wb", sse_rule_select0wb_ssse3, NULL); orc_rule_register (rule_set, "select1wb", sse_rule_select1wb_ssse3, NULL); #endif /* SSE 4.1 */ rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSE4_1); REG(maxsb); REG(minsb); REG(maxuw); REG(minuw); REG(maxsl); REG(maxul); REG(minsl); REG(minul); REG(mulll); orc_rule_register (rule_set, "convsbw", sse_rule_convsbw_sse41, NULL); orc_rule_register (rule_set, "convswl", sse_rule_convswl_sse41, NULL); orc_rule_register (rule_set, "convslq", sse_rule_convslq_sse41, NULL); orc_rule_register (rule_set, "convubw", sse_rule_convubw_sse41, NULL); orc_rule_register (rule_set, "convuwl", sse_rule_convuwl_sse41, NULL); orc_rule_register (rule_set, "convulq", sse_rule_convulq_sse41, NULL); orc_rule_register (rule_set, "convsuslw", sse_rule_convsuslw, NULL); #ifndef MMX orc_rule_register (rule_set, "mulhsl", sse_rule_mulhsl, NULL); #endif REG(cmpeqq); /* SSE 4.2 -- no rules */ rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, ORC_TARGET_SSE_SSE4_2); REG(cmpgtsq); /* SSE 4a -- no rules */ }